gpu: nvgpu: pd_cache enablement for >4K allocations in QNX

Mapping large buffers through the GMMU ends up needing many pages
for the PTE tables. Allocating these pages one at a time can become
a performance bottleneck, particularly in the virtualized case.

This change adds the following:

 - As the TLB invalidation code doesn't have access to mem_offs,
   allow top-level (PDB) allocation via nvgpu_pd_cache_alloc_direct().
 - Define NVGPU_PD_CACHE_SIZE, the allocation size of a new slab
   for the PD cache, effectively set to 64KB.
 - Use the PD cache for any allocation smaller than NVGPU_PD_CACHE_SIZE.
 - When freeing cached entries, avoid prefetch errors by
   invalidating the entry (memset to 0).
 - Fall back to direct allocation of a smaller chunk when a
   contiguous slab allocation fails (a combined sketch of this
   routing follows the commit metadata below).
 - Unit test changes.

Bug 200649243

Change-Id: I0a667af0ba01d9147c703e64fc970880e52a8fbc
Signed-off-by: dt <dt@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2404371
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:       Peter Daifuku
Date:         2020-08-26 16:25:36 -07:00
Committed by: Alex Waterman
parent 94bc3a8135
commit a331fd4b3a
16 changed files with 122 additions and 22 deletions
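
For orientation before the individual hunks, here is a minimal sketch of how
a PD allocation of a given byte count is routed after this change. It is not
the actual implementation: sketch_pd_cache_slab_alloc() is a hypothetical
stand-in for the slab path (nvgpu_pd_cache_alloc() / nvgpu_pd_cache_alloc_new()
in the hunks below); the remaining symbols all appear in the diff.

    /* Hypothetical stand-in for the slab path. */
    static int sketch_pd_cache_slab_alloc(struct gk20a *g,
                                          struct nvgpu_gmmu_pd *pd,
                                          u32 bytes, u64 flags);

    /* Sketch only: routing of a PD allocation after this change. */
    static int sketch_pd_alloc(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
                               u32 bytes)
    {
        u64 flags = 0UL;
        int err;

        if (bytes >= NVGPU_PD_CACHE_SIZE) {
            /* Large PD: bypass the cache and allocate directly. */
            return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
        }

        /*
         * Small PD: carve it out of a 64KB slab. Without an IOMMU the
         * slab itself must be physically contiguous.
         */
        if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
            flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
        }

        err = sketch_pd_cache_slab_alloc(g, pd, bytes, flags);
        if (err == -ENOMEM) {
            /*
             * No contiguous 64KB chunk available; a direct allocation
             * of just the requested size may still succeed.
             */
            err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
        }
        return err;
    }

In the real code the fallback lives inside nvgpu_pd_cache_alloc_new() and the
size check lives in nvgpu_pd_alloc(); the sketch just collapses the two for
readability.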

View File

@@ -197,13 +197,18 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
*
* Currently PAGE_SIZE is used, even when 64K, to work around an issue
* with the PDB TLB invalidate code not being pd_cache aware yet.
*
* Similarly, we can't use nvgpu_pd_alloc() here, because the top-level
* PD must have a mem_offs of 0 for the invalidate code to work, so we
* can't use the PD cache.
*/
pdb_size = ALIGN(pd_get_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
err = nvgpu_pd_alloc(vm, &vm->pdb, pdb_size);
err = nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
if (err != 0) {
return err;
}
vm->pdb.pd_size = pdb_size;
/*
* One nvgpu_mb() is done after all mapping operations. Don't need

View File

@@ -62,7 +62,8 @@ static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
{
BUG_ON(pentry->pd_size == 0);
return PAGE_SIZE / pentry->pd_size;
return (nvgpu_safe_cast_u64_to_u32(NVGPU_PD_CACHE_SIZE)) /
pentry->pd_size;
}
/*
@@ -155,7 +156,7 @@ void nvgpu_pd_cache_fini(struct gk20a *g)
* Note: this does not need the cache lock since it does not modify any of the
* PD cache data structures.
*/
static int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
{
int err;
@@ -206,6 +207,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
u32 bytes)
{
struct nvgpu_pd_mem_entry *pentry;
u64 flags = 0UL;
int32_t err;
pd_dbg(g, "PD-Alloc [C] New: offs=0");
@@ -215,8 +218,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
return -ENOMEM;
}
if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem) != 0) {
if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
}
err = nvgpu_dma_alloc_flags(g, flags,
NVGPU_PD_CACHE_SIZE, &pentry->mem);
if (err != 0) {
nvgpu_kfree(g, pentry);
/*
* Not enough contiguous space, but a direct
* allocation may work.
*/
if (err == -ENOMEM) {
return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
}
nvgpu_err(g, "Unable to DMA alloc!");
return -ENOMEM;
}
@@ -330,7 +346,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
return -EINVAL;
}
nvgpu_assert(bytes < PAGE_SIZE);
nvgpu_assert(bytes < NVGPU_PD_CACHE_SIZE);
pentry = nvgpu_pd_cache_get_partial(cache, bytes);
if (pentry == NULL) {
@@ -360,7 +376,7 @@ int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
* Simple case: PD is bigger than a page so just do a regular DMA
* alloc.
*/
if (bytes >= PAGE_SIZE) {
if (bytes >= NVGPU_PD_CACHE_SIZE) {
err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
if (err != 0) {
return err;
@@ -424,7 +440,21 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
/*
* Partially full still. If it was already on the partial list
* this just re-adds it.
*
* Since the memory used for the entries is still mapped, on iGPU
* make sure the entries are invalidated so that the hw doesn't
* accidentally try to prefetch non-existent fb memory.
*
* As IOMMU prefetching of invalid pd entries causes an IOMMU fault,
* fill them with zeros.
*/
if ((nvgpu_iommuable(g)) &&
(NVGPU_PD_CACHE_SIZE > PAGE_SIZE) &&
(pd->mem->cpu_va != NULL)) {
(void)memset(((u8 *)pd->mem->cpu_va + pd->mem_offs), 0,
pd->pd_size);
}
nvgpu_list_del(&pentry->list_entry);
nvgpu_list_add(&pentry->list_entry,
&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -80,23 +80,37 @@
* Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
* structure of course depends on this.
*/
#define NVGPU_PD_CACHE_MIN 256U
#define NVGPU_PD_CACHE_MIN 256UL
/**
* MIN_SHIFT is the right number of bits to shift to determine
* which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#define NVGPU_PD_CACHE_MIN_SHIFT 9UL
/**
* Maximum PD cache count. This value varies depending on PAGE_SIZE.
* Maximum PD cache count. This specifies the number of slabs; since each
* slab represents a PO2 increase in size, a count of 8 leads to:
*
* NVGPU_PD_CACHE_SIZE = 256B * 2^8 = 64KB
*
* For Linux with 4K pages, if the cache size is larger than 4KB then we
* need to allocate from CMA. This puts a lot of pressure on the CMA space.
* For kernels with a PAGE_SIZE of 64K this isn't the case, so allow the
* PD cache size to be 64K if PAGE_SIZE > 4K (i.e. PAGE_SIZE == 64K).
*/
#if PAGE_SIZE == 4096
#define NVGPU_PD_CACHE_COUNT 4U
#elif PAGE_SIZE == 65536
#define NVGPU_PD_CACHE_COUNT 8U
#ifdef __KERNEL__
# if PAGE_SIZE > 4096
# define NVGPU_PD_CACHE_COUNT 8UL
# else
# define NVGPU_PD_CACHE_COUNT 4UL
# endif
#else
#error "Unsupported page size."
#define NVGPU_PD_CACHE_COUNT 8UL
#endif
#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * \
(1UL << NVGPU_PD_CACHE_COUNT))
/**
* This structure describes a slab within the slab allocator.
*/
@@ -115,7 +129,7 @@ struct nvgpu_pd_mem_entry {
* The size of mem will always
* be one page. pd_size will always be a power of 2.
*/
DECLARE_BITMAP(alloc_map, PAGE_SIZE / NVGPU_PD_CACHE_MIN);
DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
/**
* Total number of allocations in this PD.
*/
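
As a quick check on the constants above (a standalone sketch, not part of the
change; the macros are restated locally so the arithmetic stands on its own):

    #include <assert.h>

    /* Local restatement of the values defined above. */
    #define PD_CACHE_MIN   256UL
    #define PD_CACHE_COUNT 8UL
    #define PD_CACHE_SIZE  (PD_CACHE_MIN * (1UL << PD_CACHE_COUNT))

    /* 256B * 2^8 = 64KB per slab... */
    static_assert(PD_CACHE_SIZE == 64UL * 1024UL, "slab size is 64KB");
    /* ...so alloc_map needs 64KB / 256B = 256 bits per slab. */
    static_assert(PD_CACHE_SIZE / PD_CACHE_MIN == 256UL,
                  "256 bitmap entries per slab");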

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -189,4 +189,6 @@ void nvgpu_pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
*/
u64 nvgpu_pd_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes);
#endif

View File

@@ -111,7 +111,7 @@ static void nvgpu_dma_print_err(struct gk20a *g, size_t size,
nvgpu_dma_flags_to_str(g, flags, flags_str);
nvgpu_err(g,
nvgpu_info(g,
"DMA %s FAILED: [%s] size=%-7zu "
"aligned=%-7zu flags:%s",
what, type,

View File

@@ -308,6 +308,11 @@ static int init_test_env(struct unit_module *m, struct gk20a *g)
unit_return_fail(m, "ecc init failed\n");
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_return_fail(m, "failed to init pd cache");
}
err = g->ops.mm.init_mm_support(g);
if (err != 0) {
unit_return_fail(m, "failed to init gk20a mm");

View File

@@ -174,6 +174,10 @@ int test_fifo_init_support(struct unit_module *m, struct gk20a *g, void *args)
g->ops.userd.setup_sw = stub_userd_setup_sw;
#endif
g->ops.ecc.ecc_init_support(g);
/* PD cache must be initialized prior to mm init */
err = nvgpu_pd_cache_init(g);
g->ops.mm.init_mm_support(g);
nvgpu_device_init(g);

View File

@@ -84,6 +84,12 @@ int test_gr_init_setup(struct unit_module *m, struct gk20a *g, void *args)
return -ENOMEM;
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_err(m, "PD cache initialization failed\n");
return -ENOMEM;
}
return UNIT_SUCCESS;
fail:

View File

@@ -158,6 +158,11 @@ int test_init_mm(struct unit_module *m, struct gk20a *g, void *args)
g->ops.fb.intr.enable = gv11b_fb_intr_enable;
g->ops.fb.ecc.init = NULL;
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_return_fail(m, "pd cache initialization failed\n");
}
err = nvgpu_init_mm_support(g);
if (err != 0) {
unit_return_fail(m, "nvgpu_init_mm_support failed err=%d\n",

View File

@@ -204,6 +204,10 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
unit_return_fail(m, "nvgpu_vm_init failed\n");
}
if (nvgpu_pd_cache_init(g) != 0) {
unit_return_fail(m, "pd cache initialization failed\n");
}
return UNIT_SUCCESS;
}

View File

@@ -383,6 +383,10 @@ int test_nvgpu_gmmu_init(struct unit_module *m, struct gk20a *g, void *args)
init_platform(m, g, true);
if (nvgpu_pd_cache_init(g) != 0) {
unit_return_fail(m, "PD cache initialization failed\n");
}
if (init_mm(m, g) != 0) {
unit_return_fail(m, "nvgpu_init_mm_support failed\n");
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -321,7 +321,7 @@ int test_pd_free_empty_pd(struct unit_module *m, struct gk20a *g,
/* And now direct frees. */
memset(&pd, 0U, sizeof(pd));
err = nvgpu_pd_alloc(&vm, &pd, PAGE_SIZE);
err = nvgpu_pd_alloc(&vm, &pd, NVGPU_PD_CACHE_SIZE);
if (err != 0) {
unit_return_fail(m, "PD alloc failed");
}
@@ -610,7 +610,7 @@ static int do_test_pd_cache_packing_size(struct unit_module *m, struct gk20a *g,
{
int err;
u32 i;
u32 n = PAGE_SIZE / pd_size;
u32 n = NVGPU_PD_CACHE_SIZE / pd_size;
struct nvgpu_gmmu_pd pds[n], pd;
struct nvgpu_posix_fault_inj *dma_fi =
nvgpu_dma_alloc_get_fault_injection();
@@ -667,7 +667,7 @@ static int do_test_pd_reusability(struct unit_module *m, struct gk20a *g,
{
int err = UNIT_SUCCESS;
u32 i;
u32 n = PAGE_SIZE / pd_size;
u32 n = NVGPU_PD_CACHE_SIZE / pd_size;
struct nvgpu_gmmu_pd pds[n];
struct nvgpu_posix_fault_inj *dma_fi =
nvgpu_dma_alloc_get_fault_injection();

View File

@@ -126,6 +126,7 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
u64 low_hole, aperture_size;
struct nvgpu_os_posix *p = nvgpu_os_posix_from_gk20a(g);
struct mm_gk20a *mm = &g->mm;
int err;
p->mm_is_iommuable = true;
@@ -193,6 +194,11 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
unit_return_fail(m, "'bar2' nvgpu_vm_init failed\n");
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_return_fail(m, "PD cache init failed\n");
}
/*
* This initialization will make sure that correct aperture mask
* is returned */

View File

@@ -229,6 +229,12 @@ static int init_pmu_falcon_test_env(struct unit_module *m, struct gk20a *g)
return -ENOMEM;
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_err(m, " PD cache allocation failed!\n");
return -ENOMEM;
}
return 0;
}

View File

@@ -89,6 +89,11 @@ int test_rc_init(struct unit_module *m, struct gk20a *g, void *args)
unit_return_fail(m, "fifo reg_space failure");
}
ret = nvgpu_pd_cache_init(g);
if (ret != 0) {
unit_return_fail(m, "PD cache initialization failure");
}
nvgpu_device_init(g);
g->ops.gr.init.get_no_of_sm = stub_gv11b_gr_init_get_no_of_sm;

View File

@@ -98,6 +98,10 @@ static int init_channel_vm(struct unit_module *m, struct nvgpu_channel *ch)
ch->vm = mm->pmu.vm;
if (nvgpu_pd_cache_init(g) != 0) {
unit_return_fail(m, "pd cache initialization failed\n");
}
return UNIT_SUCCESS;
}