gpu: nvgpu: Update pd_cache to handle 64K pages

Update the PD cache code to handle 64K pages. To do this, the
number of partial/full lists is expanded when 64K pages are in
use. Currently only 4K and 64K page sizes are explicitly
supported; other page sizes (16K, for example) fail at compile
time in the preprocessor.
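
For reference, the arithmetic behind the list counts can be
sketched as a small standalone program (illustration only, not
driver code; ilog2_ul() and pd_cache_nr() are stand-ins for the
kernel's ilog2() and the driver's internal list-index helper, and
the constants mirror the diff below):

#include <stdio.h>

#define NVGPU_PD_CACHE_MIN       256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U

/* Userspace stand-in for the kernel's ilog2(). */
static unsigned int ilog2_ul(unsigned long x)
{
	unsigned int r = 0U;

	while (x >>= 1)
		r++;
	return r;
}

/* Index of the partial/full list used for a PD of 'bytes' bytes. */
static unsigned int pd_cache_nr(unsigned long bytes)
{
	return ilog2_ul(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
}

int main(void)
{
	unsigned long page_sizes[] = { 4096UL, 65536UL };
	unsigned int i;

	for (i = 0U; i < 2U; i++) {
		unsigned long ps = page_sizes[i];
		/*
		 * The largest cached PD is half a page; page sized (and
		 * larger) PDs are allocated directly, not from the cache.
		 */
		unsigned int count = pd_cache_nr(ps / 2UL) + 1U;

		printf("PAGE_SIZE=%lu -> %u lists (%uB..%luB)\n",
		       ps, count, NVGPU_PD_CACHE_MIN, ps / 2UL);
	}
	return 0;
}

With 4K pages that works out to 4 lists (256B..2K); with 64K
pages it is 8 lists (256B..32K), matching the NVGPU_PD_CACHE_COUNT
values added below.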

This change also cleans up the definitions of some internal
structs: they have been moved into pd_cache.c since they are not
used anywhere else.

This allows the following functions to be removed from the
global namespace and made static to pd_cache.c:

  nvgpu_pd_cache_alloc_direct()
  nvgpu_pd_cache_free_direct()

Callers outside pd_cache.c now use nvgpu_pd_{alloc,free}()
instead.
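
Roughly, the dispatch inside nvgpu_pd_alloc() follows the shape
of the sketch below, based on the pd_cache DOC comment in this
change. nvgpu_pd_cache_alloc_from_cache() is a hypothetical name
for the internal cached path; only nvgpu_pd_cache_alloc_direct()
actually appears in the diff:

/*
 * Hedged sketch, not the real implementation: page sized (or larger)
 * PDs get their own nvgpu_mem via the direct path, smaller PDs are
 * packed into a shared page owned by the pd_cache.
 */
static int nvgpu_pd_alloc_sketch(struct vm_gk20a *vm,
				 struct nvgpu_gmmu_pd *pd, u32 bytes)
{
	struct gk20a *g = vm->mm->g;

	if (bytes >= PAGE_SIZE) {
		/* e.g. the PDB: one full page (or more) per PD. */
		return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
	}

	/* Sub-page PD: carve a slot out of a shared page. */
	return nvgpu_pd_cache_alloc_from_cache(g, g->mm.pd_cache, pd, bytes);
}

nvgpu_pd_free() mirrors this and routes each free back to the
path that allocated the PD.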

The nvgpu_pd_mem_entry alloc_map also had to be expanded into a
real bitmap: 32 or 64 bits is not sufficient for packing 256 byte
PDs into a 64K page (there are 256 PDs per nvgpu_pd_mem_entry in
that case). To avoid extra scans over the bitmap, an 'allocs'
field was also added that tracks how many allocations have been
made; it is used instead of comparing a mask against the bitmap
to determine whether an nvgpu_pd_mem_entry is full.
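
The resulting bookkeeping can be modelled in userspace roughly as
follows (illustration only; the struct and helpers below are
simplified stand-ins for nvgpu_pd_mem_entry, DECLARE_BITMAP(),
set_bit(), clear_bit() and find_first_zero_bit()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE_64K	65536U
#define PD_MIN_SIZE	256U
#define NR_SLOTS	(PAGE_SIZE_64K / PD_MIN_SIZE)	/* 256 PD slots */

struct pd_mem_entry_model {
	uint64_t alloc_map[NR_SLOTS / 64];	/* DECLARE_BITMAP() stand-in */
	unsigned int allocs;			/* number of live slots */
};

/* Model of find_first_zero_bit(): first clear bit, or nbits if none. */
static unsigned int find_first_zero_m(const uint64_t *map, unsigned int nbits)
{
	unsigned int i;

	for (i = 0U; i < nbits; i++)
		if (!((map[i / 64] >> (i % 64)) & 1U))
			return i;
	return nbits;
}

/* Allocate one PD slot; returns the byte offset into the page, or -1. */
static int pd_slot_alloc(struct pd_mem_entry_model *e)
{
	unsigned int bit = find_first_zero_m(e->alloc_map, NR_SLOTS);

	if (bit >= NR_SLOTS)
		return -1;			/* entry is full */

	e->alloc_map[bit / 64] |= (uint64_t)1 << (bit % 64);	/* set_bit() */
	e->allocs++;
	return (int)(bit * PD_MIN_SIZE);
}

/* Free the PD slot at byte offset 'mem_offs'. */
static void pd_slot_free(struct pd_mem_entry_model *e, unsigned int mem_offs)
{
	unsigned int bit = mem_offs / PD_MIN_SIZE;

	e->alloc_map[bit / 64] &= ~((uint64_t)1 << (bit % 64)); /* clear_bit() */
	e->allocs--;
}

int main(void)
{
	struct pd_mem_entry_model e;
	unsigned int i;
	int offs = 0;

	memset(&e, 0, sizeof(e));

	for (i = 0U; i < NR_SLOTS; i++)
		offs = pd_slot_alloc(&e);

	/* Fullness/emptiness is a counter compare, not a mask compare. */
	printf("last offs=%d allocs=%u full=%d\n",
	       offs, e.allocs, e.allocs == NR_SLOTS);

	pd_slot_free(&e, 0U);
	printf("after free: allocs=%u empty=%d\n", e.allocs, e.allocs == 0U);
	return 0;
}

The 'full' and 'empty' checks above are the whole point of the
allocs counter: they replace the old mask-vs-alloc_map comparison,
which no longer works once the map is larger than one word.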

Note: there is still a limitation in the TLB invalidate code: it
assumes an nvgpu_mem maps one-to-one to a PDB. This means a PDB
allocated at an offset greater than 0 within an
nvgpu_pd_mem_entry cannot be invalidated, which in turn means a
full page must always be used for a context's PDB.

Bug 1977822

Change-Id: I6a7a3a95b7c902bc6487cba05fde58fbc4a25da5
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1718755
Reviewed-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
commit aee5511bc8 (parent eb11d6a7ed)
Alex Waterman, 2018-05-14 15:07:47 -07:00; committed by mobile promotions
6 changed files with 117 additions and 107 deletions

View File

@@ -194,13 +194,16 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
};
/*
* PDB size here must be one page so that its address is page size
* PDB size here must be at least 4096 bytes so that its address is 4K
* aligned. Although lower PDE tables can be aligned at 256B boundaries
* the main PDB must be page aligned.
* the PDB must be 4K aligned.
*
* Currently PAGE_SIZE is used, even when 64K, to work around an issue
* with the PDB TLB invalidate code not being pd_cache aware yet.
*/
pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
err = nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
err = nvgpu_pd_alloc(vm, &vm->pdb, pdb_size);
if (WARN_ON(err != 0)) {
return err;
}
@@ -217,7 +220,7 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
/*
* Return the _physical_ address of a page directory.
*/
static u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
u64 nvgpu_pde_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
{
u64 page_addr;
@@ -434,7 +437,7 @@ static int __set_pd_level(struct vm_gk20a *vm,
* target addr is the real physical address we are aiming for.
*/
target_addr = (next_pd != NULL) ?
nvgpu_pde_phys_addr(g, next_pd) :
nvgpu_pde_gpu_addr(g, next_pd) :
phys_addr;
l->update_entry(vm, l,

View File

@@ -36,11 +36,14 @@
/**
* DOC: PD cache
*
* In the name of saving memory with the many sub-page sized PD levels in Pascal
* and beyond a way of packing PD tables together is necessary. This code here
* does just that. If a PD table only requires 1024 bytes, then it is possible
* to have 4 of these PDs in one page. This is even more pronounced for 256 byte
* PD tables.
* To save memory when using sub-page sized PD levels in Pascal and beyond a way
* of packing PD tables together is necessary. If a PD table only requires 1024
* bytes, then it is possible to have 4 of these PDs in one page. This is even
* more pronounced for 256 byte PD tables.
*
* This also matters for page directories on any chip when using a 64K page
* granule. Having 4K PDs packed into a 64K page saves a bunch of memory. Even
* more so for the 256B PDs on Pascal+.
*
* The pd cache is basically just a slab allocator. Each instance of the nvgpu
* driver makes one of these structs:
@@ -68,23 +71,91 @@
* size is page size or larger and choose the correct allocation scheme - either
* from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
* allocated by nvgpu_pd_alloc().
*
* Since the top level PD (the PDB) is a page aligned pointer but less than a
* page size the direct functions must be used for allocating PDBs. Otherwise
* there would be alignment issues for the PDBs when they get packed.
*/
/*
* Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
* structure is of course depending on this. The MIN_SHIFT define is the right
* number of bits to shift to determine which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN 256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#if PAGE_SIZE == 4096
#define NVGPU_PD_CACHE_COUNT 4U
#elif PAGE_SIZE == 65536
#define NVGPU_PD_CACHE_COUNT 8U
#else
#error "Unsupported page size."
#endif
struct nvgpu_pd_mem_entry {
struct nvgpu_mem mem;
/*
* Size of the page directories (not the mem). alloc_map is a bitmap
* showing which PDs have been allocated. The size of mem will always
* be one page. pd_size will always be a power of 2.
*/
u32 pd_size;
DECLARE_BITMAP(alloc_map, PAGE_SIZE / NVGPU_PD_CACHE_MIN);
u32 allocs;
struct nvgpu_list_node list_entry;
struct nvgpu_rbtree_node tree_entry;
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, list_entry));
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, tree_entry));
};
/*
* A cache for allocating PD memory from. This enables smaller PDs to be packed
* into single pages.
*
* This is fairly complex so see the documentation in pd_cache.c for a full
* description of how this is organized.
*/
struct nvgpu_pd_cache {
/*
* Array of lists of full nvgpu_pd_mem_entries and partially full (or
* empty) nvgpu_pd_mem_entries.
*/
struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
/*
* Tree of all allocated struct nvgpu_mem's for fast look up.
*/
struct nvgpu_rbtree_node *mem_tree;
/*
* All access to the cache must be locked. This protects the lists and
* the rb tree.
*/
struct nvgpu_mutex lock;
};
static u32 nvgpu_pd_cache_nr(u32 bytes)
{
return ilog2((unsigned long)bytes >>
((unsigned long)NVGPU_PD_CACHE_MIN_SHIFT - 1UL));
}
static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
{
u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
return mask_offset - 1U;
return PAGE_SIZE / pentry->pd_size;
}
int nvgpu_pd_cache_init(struct gk20a *g)
@@ -123,6 +194,7 @@ int nvgpu_pd_cache_init(struct gk20a *g)
}
g->mm.pd_cache = cache;
pd_dbg(g, "PD cache initialized!");
return 0;
@@ -151,8 +223,8 @@ void nvgpu_pd_cache_fini(struct gk20a *g)
* Note: this does not need the cache lock since it does not modify any of the
* PD cache data structures.
*/
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
static int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
{
int err;
unsigned long flags = 0;
@@ -225,7 +297,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
* This allocates the very first PD table in the set of tables in this
* nvgpu_pd_mem_entry.
*/
pentry->alloc_map = 1;
set_bit(0, pentry->alloc_map);
pentry->allocs = 1;
/*
* Now update the nvgpu_gmmu_pd to reflect this allocation.
@@ -247,22 +320,24 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
{
unsigned long bit_offs;
u32 mem_offs;
u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
/*
* Find and allocate an open PD.
*/
bit_offs = ffz(pentry->alloc_map);
bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
mem_offs = bit_offs * pentry->pd_size;
pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu nr_bits=%d src=0x%p",
bit_offs, nr_bits, pentry);
/* Bit map full. Somethings wrong. */
if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
if (WARN_ON(bit_offs >= nr_bits)) {
return -ENOMEM;
}
pentry->alloc_map |= BIT64(bit_offs);
pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
set_bit((int)bit_offs, pentry->alloc_map);
pentry->allocs += 1U;
/*
* First update the pd.
@@ -274,7 +349,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
/*
* Now make sure the pentry is in the correct list (full vs partial).
*/
if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
if (pentry->allocs >= nr_bits) {
pd_dbg(g, "Adding pentry to full list!");
nvgpu_list_del(&pentry->list_entry);
nvgpu_list_add(&pentry->list_entry,
@@ -369,7 +444,8 @@ int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
return err;
}
void nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
static void nvgpu_pd_cache_free_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd)
{
pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);
@@ -397,13 +473,13 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
struct nvgpu_pd_mem_entry *pentry,
struct nvgpu_gmmu_pd *pd)
{
u32 index = pd->mem_offs / pentry->pd_size;
u32 bit = 1 << index;
u32 bit = pd->mem_offs / pentry->pd_size;
/* Mark entry as free. */
pentry->alloc_map &= ~bit;
clear_bit((int)bit, pentry->alloc_map);
pentry->allocs -= 1U;
if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
if (pentry->allocs > 0U) {
/*
* Partially full still. If it was already on the partial list
* this just re-adds it.

View File

@@ -109,7 +109,7 @@ static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
struct gk20a *g = vm->mm->g;
int i;
nvgpu_pd_cache_free_direct(g, pdb);
nvgpu_pd_free(vm, pdb);
if (pdb->entries == NULL) {
return;
@@ -521,7 +521,7 @@ clean_up_allocators:
}
clean_up_page_tables:
/* Cleans up nvgpu_gmmu_init_page_table() */
nvgpu_pd_cache_free_direct(g, &vm->pdb);
nvgpu_pd_free(vm, &vm->pdb);
clean_up_vgpu_vm:
#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
if (g->is_virtual)

View File

@@ -364,7 +364,7 @@ int gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
struct vm_gk20a *vm)
{
u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
u64 pdb_addr = nvgpu_pde_gpu_addr(g, &vm->pdb);
u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
u32 pdb_addr_hi = u64_hi32(pdb_addr);

View File

@@ -380,7 +380,7 @@ const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
struct vm_gk20a *vm)
{
u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
u64 pdb_addr = nvgpu_pde_gpu_addr(g, &vm->pdb);
u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
u32 pdb_addr_hi = u64_hi32(pdb_addr);

View File

@@ -49,73 +49,6 @@ enum gk20a_mem_rw_flag {
gk20a_mem_flag_write_only = 2, /* WO */
};
/*
* Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
* structure is of course depending on this. The MIN_SHIFT define is the right
* number of bits to shift to determine which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN 256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#define NVGPU_PD_CACHE_COUNT 4U
struct nvgpu_pd_mem_entry {
struct nvgpu_mem mem;
/*
* Size of the page directories (not the mem). bmap is a bitmap showing
* which PDs have been allocated. The size of mem will always be one
* page. pd_size will always be a power of 2.
*/
u32 pd_size;
unsigned long alloc_map;
struct nvgpu_list_node list_entry;
struct nvgpu_rbtree_node tree_entry;
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, list_entry));
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, tree_entry));
};
/*
* A cache for allocating PD memory from. This enables smaller PDs to be packed
* into single pages.
*
* This is fairly complex so see the documentation in pd_cache.c for a full
* description of how this is organized.
*/
struct nvgpu_pd_cache {
/*
* Array of lists of full nvgpu_pd_mem_entries and partially full (or
* empty) nvgpu_pd_mem_entries.
*/
struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
/*
* Tree of all allocated struct nvgpu_mem's for fast look up.
*/
struct nvgpu_rbtree_node *mem_tree;
/*
* All access to the cache much be locked. This protects the lists and
* the rb tree.
*/
struct nvgpu_mutex lock;
};
/*
* GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
* in the GMMU.
@@ -253,11 +186,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes);
void nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
int nvgpu_pd_cache_init(struct gk20a *g);
void nvgpu_pd_cache_fini(struct gk20a *g);
u64 nvgpu_pde_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
/*
* Some useful routines that are shared across chips.