gpu: nvgpu: support GMMU mappings for vidmem page allocator

Switch to using the page allocator for vidmem
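
Concretely, gk20a_init_vidmem() now registers the allocator without
the contiguity flags (a condensed view of the change in the diff below):

	/* before: GPU_ALLOC_FORCE_CONTIG | GPU_ALLOC_NO_SCATTER_GATHER */
	err = gk20a_page_allocator_init(&g->mm.vidmem.allocator, "vidmem",
			SZ_4K, size - SZ_4K, SZ_4K, 0 /* flags */);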

Support GMMU mappings for non-contiguous page allocator allocations
in update_gmmu_ptes_locked(): if the aperture is VIDMEM, traverse
each chunk of the allocation and map it to the GPU VA separately
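
As a rough sketch of the new walk (simplified from the diff below;
map_chunk() is a hypothetical stand-in for update_gmmu_level_locked()):

	list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
		if (space_to_skip && space_to_skip > chunk->length) {
			/* chunk lies entirely inside the offset to skip */
			space_to_skip -= chunk->length;
		} else {
			/* map the rest of this chunk at the running VA */
			u64 iova = chunk->base + space_to_skip;
			u64 length = chunk->length - space_to_skip;

			space_to_skip = 0;
			err = map_chunk(vm, iova, gpu_va, gpu_va + length);
			gpu_va += length;
		}
	}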

Fix CE page clearing to support the page allocator
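
The clear now issues one CE memset per chunk and waits only on the
final fence, roughly as follows (condensed from the diff below; some
gk20a_ce_execute_ops() arguments are elided and the error handling is
simplified):

	list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
		if (gk20a_last_fence)
			gk20a_fence_put(gk20a_last_fence);

		err = gk20a_ce_execute_ops(g->dev, g->mm.vidmem.ce_ctx_id,
				0, chunk->base, chunk->length, 0x00000000,
				NVGPU_CE_DST_LOCATION_LOCAL_FB,
				NVGPU_CE_MEMSET, ..., &gk20a_fence_out);
		if (err)
			break;

		gk20a_last_fence = gk20a_fence_out;
	}

	/* only the last memset needs to be waited on */
	if (gk20a_last_fence) {
		gk20a_fence_wait(gk20a_last_fence,
				gk20a_get_gr_idle_timeout(g));
		gk20a_fence_put(gk20a_last_fence);
	}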

Fix gk20a_pramin_enter() to get the base address from the new
allocator
Define API gk20a_mem_get_vidmem_addr() to return the base address
of an allocation; note that this API should not be used if the
allocation has more than one chunk
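
A hypothetical caller-side guard (the helper returns 0 for multi-chunk
or non-vidmem allocations, so only contiguous backing is usable here):

	u64 base = gk20a_mem_get_vidmem_addr(g, mem);

	if (!base)
		return -EINVAL; /* scattered allocation, no single base */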

Jira DNVGPU-96

Change-Id: I725422f3538aeb477ca4220ba57ef8b3c53db703
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/1199177
(cherry picked from commit 1afae6ee6529ab88cedd5bcbe458fbdc0d4b1fd8)
Reviewed-on: http://git-master/r/1197647
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

@@ -29,6 +29,7 @@
 #include <linux/lcm.h>
 #include <uapi/linux/nvgpu.h>
 #include <trace/events/gk20a.h>
+#include <gk20a/page_allocator_priv.h>
 #include "gk20a.h"
 #include "mm_gk20a.h"
@@ -84,10 +85,31 @@ void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem)
 	mem->cpu_va = NULL;
 }
 
+static u64 gk20a_mem_get_vidmem_addr(struct gk20a *g, struct mem_desc *mem)
+{
+	struct gk20a_page_alloc *alloc;
+	struct page_alloc_chunk *chunk;
+
+	if (mem && mem->aperture == APERTURE_VIDMEM) {
+		alloc = (struct gk20a_page_alloc *)
+			sg_dma_address(mem->sgt->sgl);
+
+		/* This API should not be used with > 1 chunks */
+		if (alloc->nr_chunks != 1)
+			return 0;
+
+		chunk = list_first_entry(&alloc->alloc_chunks,
+				struct page_alloc_chunk, list_entry);
+		return chunk->base;
+	}
+
+	return 0;
+}
+
 /* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */
 static u32 gk20a_pramin_enter(struct gk20a *g, struct mem_desc *mem, u32 w)
 {
-	u64 bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+	u64 bufbase = gk20a_mem_get_vidmem_addr(g, mem);
 	u64 addr = bufbase + w * sizeof(u32);
 	u32 hi = (u32)((addr & ~(u64)0xfffff)
 		>> bus_bar0_window_target_bar0_window_base_shift_v());
@@ -765,9 +787,7 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm)
 		return 0;
 
 	err = gk20a_page_allocator_init(&g->mm.vidmem.allocator, "vidmem",
-			SZ_4K, size - SZ_4K, SZ_4K,
-			GPU_ALLOC_FORCE_CONTIG |
-			GPU_ALLOC_NO_SCATTER_GATHER);
+			SZ_4K, size - SZ_4K, SZ_4K, 0);
 	if (err) {
 		gk20a_err(d, "Failed to register vidmem for size %zu: %d",
 			size, err);
@@ -2721,7 +2741,6 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 #if defined(CONFIG_GK20A_VIDMEM)
 	u64 addr;
 	int err;
-	bool need_pramin_access = true;
 
 	gk20a_dbg_fn("");
@@ -2764,13 +2783,22 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 
 	if (g->mm.vidmem.ce_ctx_id != ~0) {
 		struct gk20a_fence *gk20a_fence_out = NULL;
-		u64 dst_bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+		struct gk20a_fence *gk20a_last_fence = NULL;
+		struct gk20a_page_alloc *alloc = NULL;
+		struct page_alloc_chunk *chunk = NULL;
 
-		err = gk20a_ce_execute_ops(g->dev,
+		alloc = (struct gk20a_page_alloc *)
+			g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+
+		list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
+			if (gk20a_last_fence)
+				gk20a_fence_put(gk20a_last_fence);
+
+			err = gk20a_ce_execute_ops(g->dev,
 				g->mm.vidmem.ce_ctx_id,
 				0,
-				dst_bufbase,
-				(u64)size,
+				chunk->base,
+				chunk->length,
 				0x00000000,
 				NVGPU_CE_DST_LOCATION_LOCAL_FB,
 				NVGPU_CE_MEMSET,
@@ -2778,27 +2806,31 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 				0,
 				&gk20a_fence_out);
 
-		if (!err) {
-			if (gk20a_fence_out) {
-				err = gk20a_fence_wait(gk20a_fence_out, gk20a_get_gr_idle_timeout(g));
-				gk20a_fence_put(gk20a_fence_out);
-				if (err)
-					gk20a_err(g->dev,
-						"Failed to get the fence_out from CE execute ops");
-				else
-					need_pramin_access = false;
+			if (err) {
+				gk20a_err(g->dev,
+					"Failed gk20a_ce_execute_ops[%d]", err);
+				goto fail_free_table;
 			}
-		} else
-			gk20a_err(g->dev, "Failed gk20a_ce_execute_ops[%d]",err);
-	}
 
-	if (need_pramin_access)
-		gk20a_memset(g, mem, 0, 0, size);
+			gk20a_last_fence = gk20a_fence_out;
+		}
+
+		if (gk20a_last_fence) {
+			err = gk20a_fence_wait(gk20a_last_fence,
+					gk20a_get_gr_idle_timeout(g));
+			gk20a_fence_put(gk20a_last_fence);
+			if (err)
+				gk20a_err(g->dev,
+					"Failed to get the fence_out from CE execute ops");
+		}
+	}
 
 	gk20a_dbg_fn("done at 0x%llx size %zu", addr, size);
 
 	return 0;
 
+fail_free_table:
+	sg_free_table(mem->sgt);
 fail_kfree:
 	kfree(mem->sgt);
 fail_physfree:
@@ -3381,13 +3413,9 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 	u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
 	int err;
 	struct scatterlist *sgl = NULL;
-
-	gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
-		pgsz_idx,
-		sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
-		: 0ULL,
-		buffer_offset,
-		sgt ? sgt->nents : 0);
+	struct gk20a_page_alloc *alloc = NULL;
+	struct page_alloc_chunk *chunk = NULL;
+	u64 length;
 
 	/* note: here we need to map kernel to small, since the
 	 * low-level mmu code assumes 0 is small and 1 is big pages */
@@ -3397,30 +3425,6 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 	if (space_to_skip & (page_size - 1))
 		return -EINVAL;
 
-	if (sgt) {
-		iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
-		if (!vm->mm->bypass_smmu && iova) {
-			iova += space_to_skip;
-		} else {
-			sgl = sgt->sgl;
-
-			gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
-					(u64)sg_phys(sgl),
-					sgl->length);
-			while (space_to_skip && sgl &&
-			       space_to_skip + page_size > sgl->length) {
-				space_to_skip -= sgl->length;
-				sgl = sg_next(sgl);
-				gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
-						(u64)sg_phys(sgl),
-						sgl->length);
-			}
-			iova = sg_phys(sgl) + space_to_skip;
-		}
-	}
-
-	gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
-			pgsz_idx, gpu_va, gpu_end-1, iova);
-
 	err = map_gmmu_pages(g, &vm->pdb);
 	if (err) {
 		gk20a_err(dev_from_vm(vm),
@@ -3428,14 +3432,98 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 			   vm_aspace_id(vm));
 		return err;
 	}
 
-	err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
-			&sgl,
-			&space_to_skip,
-			&iova,
-			gpu_va, gpu_end,
-			kind_v, &ctag,
-			cacheable, unmapped_pte, rw_flag, sparse, 0, priv,
-			aperture);
+	if (aperture == APERTURE_VIDMEM) {
+		gk20a_dbg(gpu_dbg_map_v, "vidmem map size_idx=%d, gpu_va=[%llx,%llx], alloc=%llx",
+				pgsz_idx, gpu_va, gpu_end-1, iova);
+
+		if (sgt) {
+			alloc = (struct gk20a_page_alloc *)
+				g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+
+			list_for_each_entry(chunk, &alloc->alloc_chunks,
+							list_entry) {
+				if (space_to_skip &&
+				    space_to_skip > chunk->length) {
+					space_to_skip -= chunk->length;
+				} else {
+					iova = chunk->base + space_to_skip;
+					length = chunk->length - space_to_skip;
+					space_to_skip = 0;
+
+					err = update_gmmu_level_locked(vm,
+						&vm->pdb, pgsz_idx,
+						&sgl,
+						&space_to_skip,
+						&iova,
+						gpu_va, gpu_va + length,
+						kind_v, &ctag,
+						cacheable, unmapped_pte,
+						rw_flag, sparse, 0, priv,
+						aperture);
+
+					/* need to set explicit zero here */
+					space_to_skip = 0;
+					gpu_va += length;
+				}
+			}
+		} else {
+			err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+					&sgl,
+					&space_to_skip,
+					&iova,
+					gpu_va, gpu_end,
+					kind_v, &ctag,
+					cacheable, unmapped_pte, rw_flag,
+					sparse, 0, priv,
+					aperture);
+		}
+	} else {
+		gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
+			pgsz_idx,
+			sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
+			: 0ULL,
+			buffer_offset,
+			sgt ? sgt->nents : 0);
+
+		gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
+				pgsz_idx, gpu_va, gpu_end-1, iova);
+
+		if (sgt) {
+			iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+			if (!vm->mm->bypass_smmu && iova) {
+				iova += space_to_skip;
+			} else {
+				sgl = sgt->sgl;
+
+				gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+						(u64)sg_phys(sgl),
+						sgl->length);
+
+				while (space_to_skip && sgl &&
+				       space_to_skip + page_size > sgl->length) {
+					space_to_skip -= sgl->length;
+					sgl = sg_next(sgl);
+					gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+							(u64)sg_phys(sgl),
+							sgl->length);
+				}
+
+				iova = sg_phys(sgl) + space_to_skip;
+			}
+		}
+
+		err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+				&sgl,
+				&space_to_skip,
+				&iova,
+				gpu_va, gpu_end,
+				kind_v, &ctag,
+				cacheable, unmapped_pte, rw_flag,
+				sparse, 0, priv,
+				aperture);
+	}
 
 	unmap_gmmu_pages(g, &vm->pdb);
 	smp_mb();

@@ -19,6 +19,7 @@
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <gk20a/gk20a_allocator.h>
 #include "gk20a_allocator.h"