gpu: nvgpu: pd_cache enablement for >4K allocations in QNX

Mapping large buffers through the GMMU ends up needing many pages
for the PTE tables. Allocating these pages one at a time can become
a performance bottleneck, particularly in the virtualized case.

This change adds the following:

 - As the TLB invalidation code doesn't have access to mem_offs,
   allow top-level (PDB) allocation via nvgpu_pd_cache_alloc_direct().
 - Define NVGPU_PD_CACHE_SIZE, the allocation size of a new slab
   for the PD cache, effectively set to 64KB.
 - Use the PD cache for any allocation smaller than NVGPU_PD_CACHE_SIZE.
 - When freeing cached entries, avoid prefetch errors by
   invalidating the entry (memset to 0).
 - Fall back to direct allocation of a smaller chunk when a
   contiguous slab allocation fails (a combined sketch of this
   routing follows the commit metadata below).
 - Unit test changes.

Bug 200649243

Change-Id: I0a667af0ba01d9147c703e64fc970880e52a8fbc
Signed-off-by: dt <dt@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2404371
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:       Peter Daifuku
Date:         2020-08-26 16:25:36 -07:00
Committed by: Alex Waterman
parent 94bc3a8135
commit a331fd4b3a
16 changed files with 122 additions and 22 deletions
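
For orientation before the individual hunks, here is a minimal sketch of how
a PD allocation of a given byte count is routed after this change. It is not
the actual implementation: sketch_pd_cache_slab_alloc() is a hypothetical
stand-in for the slab path (nvgpu_pd_cache_alloc() / nvgpu_pd_cache_alloc_new()
in the hunks below); the remaining symbols all appear in the diff.

    /* Hypothetical stand-in for the slab path. */
    static int sketch_pd_cache_slab_alloc(struct gk20a *g,
                                          struct nvgpu_gmmu_pd *pd,
                                          u32 bytes, u64 flags);

    /* Sketch only: routing of a PD allocation after this change. */
    static int sketch_pd_alloc(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
                               u32 bytes)
    {
        u64 flags = 0UL;
        int err;

        if (bytes >= NVGPU_PD_CACHE_SIZE) {
            /* Large PD: bypass the cache and allocate directly. */
            return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
        }

        /*
         * Small PD: carve it out of a 64KB slab. Without an IOMMU the
         * slab itself must be physically contiguous.
         */
        if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
            flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
        }

        err = sketch_pd_cache_slab_alloc(g, pd, bytes, flags);
        if (err == -ENOMEM) {
            /*
             * No contiguous 64KB chunk available; a direct allocation
             * of just the requested size may still succeed.
             */
            err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
        }
        return err;
    }

In the real code the fallback lives inside nvgpu_pd_cache_alloc_new() and the
size check lives in nvgpu_pd_alloc(); the sketch just collapses the two for
readability.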

View File

@@ -197,13 +197,18 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
*
* Currently PAGE_SIZE is used, even when 64K, to work around an issue
* with the PDB TLB invalidate code not being pd_cache aware yet.
*
* Similarly, we can't use nvgpu_pd_alloc() here, because the top-level
* PD must have a mem_offs of 0 for the invalidate code to work, so we
* can't use the PD cache.
*/
pdb_size = ALIGN(pd_get_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
err = nvgpu_pd_alloc(vm, &vm->pdb, pdb_size);
err = nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
if (err != 0) {
return err;
}
vm->pdb.pd_size = pdb_size;
/*
* One nvgpu_mb() is done after all mapping operations. Don't need

View File

@@ -62,7 +62,8 @@ static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
{
BUG_ON(pentry->pd_size == 0);
return PAGE_SIZE / pentry->pd_size;
return (nvgpu_safe_cast_u64_to_u32(NVGPU_PD_CACHE_SIZE)) /
pentry->pd_size;
}
/*
@@ -155,7 +156,7 @@ void nvgpu_pd_cache_fini(struct gk20a *g)
* Note: this does not need the cache lock since it does not modify any of the
* PD cache data structures.
*/
static int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
{
int err;
@@ -206,6 +207,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
u32 bytes)
{
struct nvgpu_pd_mem_entry *pentry;
u64 flags = 0UL;
int32_t err;
pd_dbg(g, "PD-Alloc [C] New: offs=0");
@@ -215,8 +218,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
return -ENOMEM;
}
if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem) != 0) {
if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
}
err = nvgpu_dma_alloc_flags(g, flags,
NVGPU_PD_CACHE_SIZE, &pentry->mem);
if (err != 0) {
nvgpu_kfree(g, pentry);
/*
* Not enough contiguous space, but a direct
* allocation may work.
*/
if (err == -ENOMEM) {
return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
}
nvgpu_err(g, "Unable to DMA alloc!");
return -ENOMEM;
}
@@ -330,7 +346,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
return -EINVAL;
}
nvgpu_assert(bytes < PAGE_SIZE);
nvgpu_assert(bytes < NVGPU_PD_CACHE_SIZE);
pentry = nvgpu_pd_cache_get_partial(cache, bytes);
if (pentry == NULL) {
@@ -360,7 +376,7 @@ int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
* Simple case: PD is bigger than a page so just do a regular DMA
* alloc.
*/
if (bytes >= PAGE_SIZE) {
if (bytes >= NVGPU_PD_CACHE_SIZE) {
err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
if (err != 0) {
return err;
@@ -424,7 +440,21 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
/*
* Partially full still. If it was already on the partial list
* this just re-adds it.
*
* Since the memory used for the entries is still mapped, on iGPU
* make sure the entries are invalidated so that the hw doesn't
* accidentally try to prefetch non-existent fb memory.
*
* As IOMMU prefetching of invalid pd entries causes an IOMMU fault,
* fill them with zeros.
*/
if ((nvgpu_iommuable(g)) &&
(NVGPU_PD_CACHE_SIZE > PAGE_SIZE) &&
(pd->mem->cpu_va != NULL)) {
(void)memset(((u8 *)pd->mem->cpu_va + pd->mem_offs), 0,
pd->pd_size);
}
nvgpu_list_del(&pentry->list_entry);
nvgpu_list_add(&pentry->list_entry,
&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -80,23 +80,37 @@
* Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
* structure of course depends on this.
*/
#define NVGPU_PD_CACHE_MIN 256U
#define NVGPU_PD_CACHE_MIN 256UL
/**
* MIN_SHIFT is the right number of bits to shift to determine
* which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#define NVGPU_PD_CACHE_MIN_SHIFT 9UL
/**
* Maximum PD cache count. This value varies depending on PAGE_SIZE.
* Maximum PD cache count. This specifies the number of slabs; since each
* slab represents a PO2 increase in size, a count of 8 leads to:
*
* NVGPU_PD_CACHE_SIZE = 256B * 2^8 = 64KB
*
* For Linux with 4K pages, if the cache size is larger than 4KB then we
* need to allocate from CMA. This puts a lot of pressure on the CMA space.
* For kernels with a PAGE_SIZE of 64K this isn't the case, so allow the
* PD cache size to be 64K if PAGE_SIZE > 4K (i.e. PAGE_SIZE == 64K).
*/
#if PAGE_SIZE == 4096
#define NVGPU_PD_CACHE_COUNT 4U
#elif PAGE_SIZE == 65536
#define NVGPU_PD_CACHE_COUNT 8U
#ifdef __KERNEL__
# if PAGE_SIZE > 4096
# define NVGPU_PD_CACHE_COUNT 8UL
# else
# define NVGPU_PD_CACHE_COUNT 4UL
# endif
#else
#error "Unsupported page size."
#define NVGPU_PD_CACHE_COUNT 8UL
#endif
#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * \
(1UL << NVGPU_PD_CACHE_COUNT))
/**
* This structure describes a slab within the slab allocator.
*/
@@ -115,7 +129,7 @@ struct nvgpu_pd_mem_entry {
* The size of mem will always
* be one page. pd_size will always be a power of 2.
*/
DECLARE_BITMAP(alloc_map, PAGE_SIZE / NVGPU_PD_CACHE_MIN);
DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
/**
* Total number of allocations in this PD.
*/
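
As a quick check on the constants above (a standalone sketch, not part of the
change; the macros are restated locally so the arithmetic stands on its own):

    #include <assert.h>

    /* Local restatement of the values defined above. */
    #define PD_CACHE_MIN   256UL
    #define PD_CACHE_COUNT 8UL
    #define PD_CACHE_SIZE  (PD_CACHE_MIN * (1UL << PD_CACHE_COUNT))

    /* 256B * 2^8 = 64KB per slab... */
    static_assert(PD_CACHE_SIZE == 64UL * 1024UL, "slab size is 64KB");
    /* ...so alloc_map needs 64KB / 256B = 256 bits per slab. */
    static_assert(PD_CACHE_SIZE / PD_CACHE_MIN == 256UL,
                  "256 bitmap entries per slab");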

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -189,4 +189,6 @@ void nvgpu_pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
*/
u64 nvgpu_pd_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes);
#endif

View File

@@ -111,7 +111,7 @@ static void nvgpu_dma_print_err(struct gk20a *g, size_t size,
nvgpu_dma_flags_to_str(g, flags, flags_str);
nvgpu_err(g,
nvgpu_info(g,
"DMA %s FAILED: [%s] size=%-7zu "
"aligned=%-7zu flags:%s",
what, type,

View File

@@ -308,6 +308,11 @@ static int init_test_env(struct unit_module *m, struct gk20a *g)
unit_return_fail(m, "ecc init failed\n");
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_return_fail(m, "failed to init pd cache");
}
err = g->ops.mm.init_mm_support(g);
if (err != 0) {
unit_return_fail(m, "failed to init gk20a mm");

View File

@@ -174,6 +174,10 @@ int test_fifo_init_support(struct unit_module *m, struct gk20a *g, void *args)
g->ops.userd.setup_sw = stub_userd_setup_sw;
#endif
g->ops.ecc.ecc_init_support(g);
/* PD cache must be initialized prior to mm init */
err = nvgpu_pd_cache_init(g);
g->ops.mm.init_mm_support(g);
nvgpu_device_init(g);

View File

@@ -84,6 +84,12 @@ int test_gr_init_setup(struct unit_module *m, struct gk20a *g, void *args)
return -ENOMEM;
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_err(m, "PD cache initialization failed\n");
return -ENOMEM;
}
return UNIT_SUCCESS;
fail:

View File

@@ -158,6 +158,11 @@ int test_init_mm(struct unit_module *m, struct gk20a *g, void *args)
g->ops.fb.intr.enable = gv11b_fb_intr_enable;
g->ops.fb.ecc.init = NULL;
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_return_fail(m, "pd cache initialization failed\n");
}
err = nvgpu_init_mm_support(g);
if (err != 0) {
unit_return_fail(m, "nvgpu_init_mm_support failed err=%d\n",

View File

@@ -204,6 +204,10 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
unit_return_fail(m, "nvgpu_vm_init failed\n");
}
if (nvgpu_pd_cache_init(g) != 0) {
unit_return_fail(m, "pd cache initialization failed\n");
}
return UNIT_SUCCESS;
}

View File

@@ -383,6 +383,10 @@ int test_nvgpu_gmmu_init(struct unit_module *m, struct gk20a *g, void *args)
init_platform(m, g, true);
if (nvgpu_pd_cache_init(g) != 0) {
unit_return_fail(m, "PD cache initialization failed\n");
}
if (init_mm(m, g) != 0) {
unit_return_fail(m, "nvgpu_init_mm_support failed\n");
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -321,7 +321,7 @@ int test_pd_free_empty_pd(struct unit_module *m, struct gk20a *g,
/* And now direct frees. */
memset(&pd, 0U, sizeof(pd));
err = nvgpu_pd_alloc(&vm, &pd, PAGE_SIZE);
err = nvgpu_pd_alloc(&vm, &pd, NVGPU_PD_CACHE_SIZE);
if (err != 0) {
unit_return_fail(m, "PD alloc failed");
}
@@ -610,7 +610,7 @@ static int do_test_pd_cache_packing_size(struct unit_module *m, struct gk20a *g,
{
int err;
u32 i;
u32 n = PAGE_SIZE / pd_size;
u32 n = NVGPU_PD_CACHE_SIZE / pd_size;
struct nvgpu_gmmu_pd pds[n], pd;
struct nvgpu_posix_fault_inj *dma_fi =
nvgpu_dma_alloc_get_fault_injection();
@@ -667,7 +667,7 @@ static int do_test_pd_reusability(struct unit_module *m, struct gk20a *g,
{
int err = UNIT_SUCCESS;
u32 i;
u32 n = PAGE_SIZE / pd_size;
u32 n = NVGPU_PD_CACHE_SIZE / pd_size;
struct nvgpu_gmmu_pd pds[n];
struct nvgpu_posix_fault_inj *dma_fi =
nvgpu_dma_alloc_get_fault_injection();

View File

@@ -126,6 +126,7 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
u64 low_hole, aperture_size;
struct nvgpu_os_posix *p = nvgpu_os_posix_from_gk20a(g);
struct mm_gk20a *mm = &g->mm;
int err;
p->mm_is_iommuable = true;
@@ -193,6 +194,11 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
unit_return_fail(m, "'bar2' nvgpu_vm_init failed\n");
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_return_fail(m, "PD cache init failed\n");
}
/*
* This initialization will make sure that correct aperture mask
* is returned */

View File

@@ -229,6 +229,12 @@ static int init_pmu_falcon_test_env(struct unit_module *m, struct gk20a *g)
return -ENOMEM;
}
err = nvgpu_pd_cache_init(g);
if (err != 0) {
unit_err(m, " PD cache allocation failed!\n");
return -ENOMEM;
}
return 0;
}

View File

@@ -89,6 +89,11 @@ int test_rc_init(struct unit_module *m, struct gk20a *g, void *args)
unit_return_fail(m, "fifo reg_space failure");
}
ret = nvgpu_pd_cache_init(g);
if (ret != 0) {
unit_return_fail(m, "PD cache initialization failure");
}
nvgpu_device_init(g);
g->ops.gr.init.get_no_of_sm = stub_gv11b_gr_init_get_no_of_sm;

View File

@@ -98,6 +98,10 @@ static int init_channel_vm(struct unit_module *m, struct nvgpu_channel *ch)
ch->vm = mm->pmu.vm;
if (nvgpu_pd_cache_init(g) != 0) {
unit_return_fail(m, "pd cache initialization failed\n");
}
return UNIT_SUCCESS;
}