gpu: nvgpu: Update pd_cache to handle 64K pages

Update the PD cache code to handle 64K pages. To do this, the
number of partial/full lists is expanded when 64K pages are in
use. Currently only 4K and 64K page sizes are explicitly
supported; other page sizes (16K, for example) fail at compile
time in the preprocessor.
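
For reference, the arithmetic behind the list counts can be
sketched as a small standalone program (illustration only, not
driver code; ilog2_ul() and pd_cache_nr() are stand-ins for the
kernel's ilog2() and the driver's internal list-index helper, and
the constants mirror the diff below):

#include <stdio.h>

#define NVGPU_PD_CACHE_MIN       256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U

/* Userspace stand-in for the kernel's ilog2(). */
static unsigned int ilog2_ul(unsigned long x)
{
	unsigned int r = 0U;

	while (x >>= 1)
		r++;
	return r;
}

/* Index of the partial/full list used for a PD of 'bytes' bytes. */
static unsigned int pd_cache_nr(unsigned long bytes)
{
	return ilog2_ul(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
}

int main(void)
{
	unsigned long page_sizes[] = { 4096UL, 65536UL };
	unsigned int i;

	for (i = 0U; i < 2U; i++) {
		unsigned long ps = page_sizes[i];
		/*
		 * The largest cached PD is half a page; page sized (and
		 * larger) PDs are allocated directly, not from the cache.
		 */
		unsigned int count = pd_cache_nr(ps / 2UL) + 1U;

		printf("PAGE_SIZE=%lu -> %u lists (%uB..%luB)\n",
		       ps, count, NVGPU_PD_CACHE_MIN, ps / 2UL);
	}
	return 0;
}

With 4K pages that works out to 4 lists (256B..2K); with 64K
pages it is 8 lists (256B..32K), matching the NVGPU_PD_CACHE_COUNT
values added below.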

This change also cleans up the definitions of some internal
structs: they have been moved into pd_cache.c since they are not
used anywhere else.

This allows the following functions to be removed from the
global namespace and made static to pd_cache.c:

  nvgpu_pd_cache_alloc_direct()
  nvgpu_pd_cache_free_direct()

Callers outside pd_cache.c now use nvgpu_pd_{alloc,free}()
instead.
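
Roughly, the dispatch inside nvgpu_pd_alloc() follows the shape
of the sketch below, based on the pd_cache DOC comment in this
change. nvgpu_pd_cache_alloc_from_cache() is a hypothetical name
for the internal cached path; only nvgpu_pd_cache_alloc_direct()
actually appears in the diff:

/*
 * Hedged sketch, not the real implementation: page sized (or larger)
 * PDs get their own nvgpu_mem via the direct path, smaller PDs are
 * packed into a shared page owned by the pd_cache.
 */
static int nvgpu_pd_alloc_sketch(struct vm_gk20a *vm,
				 struct nvgpu_gmmu_pd *pd, u32 bytes)
{
	struct gk20a *g = vm->mm->g;

	if (bytes >= PAGE_SIZE) {
		/* e.g. the PDB: one full page (or more) per PD. */
		return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
	}

	/* Sub-page PD: carve a slot out of a shared page. */
	return nvgpu_pd_cache_alloc_from_cache(g, g->mm.pd_cache, pd, bytes);
}

nvgpu_pd_free() mirrors this and routes each free back to the
path that allocated the PD.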

The nvgpu_pd_mem_entry alloc_map also had to be expanded into a
real bitmap: 32 or 64 bits is not sufficient for packing 256 byte
PDs into a 64K page (there are 256 PDs per nvgpu_pd_mem_entry in
that case). To avoid extra scans over the bitmap, an 'allocs'
field was also added that tracks how many allocations have been
made; it is used instead of comparing a mask against the bitmap
to determine whether an nvgpu_pd_mem_entry is full.
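
The resulting bookkeeping can be modelled in userspace roughly as
follows (illustration only; the struct and helpers below are
simplified stand-ins for nvgpu_pd_mem_entry, DECLARE_BITMAP(),
set_bit(), clear_bit() and find_first_zero_bit()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE_64K	65536U
#define PD_MIN_SIZE	256U
#define NR_SLOTS	(PAGE_SIZE_64K / PD_MIN_SIZE)	/* 256 PD slots */

struct pd_mem_entry_model {
	uint64_t alloc_map[NR_SLOTS / 64];	/* DECLARE_BITMAP() stand-in */
	unsigned int allocs;			/* number of live slots */
};

/* Model of find_first_zero_bit(): first clear bit, or nbits if none. */
static unsigned int find_first_zero_m(const uint64_t *map, unsigned int nbits)
{
	unsigned int i;

	for (i = 0U; i < nbits; i++)
		if (!((map[i / 64] >> (i % 64)) & 1U))
			return i;
	return nbits;
}

/* Allocate one PD slot; returns the byte offset into the page, or -1. */
static int pd_slot_alloc(struct pd_mem_entry_model *e)
{
	unsigned int bit = find_first_zero_m(e->alloc_map, NR_SLOTS);

	if (bit >= NR_SLOTS)
		return -1;			/* entry is full */

	e->alloc_map[bit / 64] |= (uint64_t)1 << (bit % 64);	/* set_bit() */
	e->allocs++;
	return (int)(bit * PD_MIN_SIZE);
}

/* Free the PD slot at byte offset 'mem_offs'. */
static void pd_slot_free(struct pd_mem_entry_model *e, unsigned int mem_offs)
{
	unsigned int bit = mem_offs / PD_MIN_SIZE;

	e->alloc_map[bit / 64] &= ~((uint64_t)1 << (bit % 64)); /* clear_bit() */
	e->allocs--;
}

int main(void)
{
	struct pd_mem_entry_model e;
	unsigned int i;
	int offs = 0;

	memset(&e, 0, sizeof(e));

	for (i = 0U; i < NR_SLOTS; i++)
		offs = pd_slot_alloc(&e);

	/* Fullness/emptiness is a counter compare, not a mask compare. */
	printf("last offs=%d allocs=%u full=%d\n",
	       offs, e.allocs, e.allocs == NR_SLOTS);

	pd_slot_free(&e, 0U);
	printf("after free: allocs=%u empty=%d\n", e.allocs, e.allocs == 0U);
	return 0;
}

The 'full' and 'empty' checks above are the whole point of the
allocs counter: they replace the old mask-vs-alloc_map comparison,
which no longer works once the map is larger than one word.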

Note: there is still a limitation in the TLB invalidate code: it
assumes an nvgpu_mem maps one-to-one to a PDB. This means a PDB
allocated at an offset greater than 0 within an
nvgpu_pd_mem_entry cannot be invalidated, which in turn means a
full page must always be used for a context's PDB.

Bug 1977822

Change-Id: I6a7a3a95b7c902bc6487cba05fde58fbc4a25da5
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1718755
Reviewed-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
commit aee5511bc8 (parent eb11d6a7ed)
Alex Waterman, 2018-05-14 15:07:47 -07:00; committed by mobile promotions
6 changed files with 117 additions and 107 deletions

View File

@@ -194,13 +194,16 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
};
/*
* PDB size here must be one page so that its address is page size
* PDB size here must be at least 4096 bytes so that its address is 4K
* aligned. Although lower PDE tables can be aligned at 256B boundaries
* the main PDB must be page aligned.
* the PDB must be 4K aligned.
*
* Currently PAGE_SIZE is used, even when 64K, to work around an issue
* with the PDB TLB invalidate code not being pd_cache aware yet.
*/
pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
err = nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
err = nvgpu_pd_alloc(vm, &vm->pdb, pdb_size);
if (WARN_ON(err != 0)) {
return err;
}
@@ -217,7 +220,7 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
/*
* Return the _physical_ address of a page directory.
*/
static u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
u64 nvgpu_pde_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
{
u64 page_addr;
@@ -434,7 +437,7 @@ static int __set_pd_level(struct vm_gk20a *vm,
* target addr is the real physical address we are aiming for.
*/
target_addr = (next_pd != NULL) ?
nvgpu_pde_phys_addr(g, next_pd) :
nvgpu_pde_gpu_addr(g, next_pd) :
phys_addr;
l->update_entry(vm, l,

View File

@@ -36,11 +36,14 @@
/**
* DOC: PD cache
*
* In the name of saving memory with the many sub-page sized PD levels in Pascal
* and beyond a way of packing PD tables together is necessary. This code here
* does just that. If a PD table only requires 1024 bytes, then it is possible
* to have 4 of these PDs in one page. This is even more pronounced for 256 byte
* PD tables.
* To save memory when using sub-page sized PD levels in Pascal and beyond a way
* of packing PD tables together is necessary. If a PD table only requires 1024
* bytes, then it is possible to have 4 of these PDs in one page. This is even
* more pronounced for 256 byte PD tables.
*
* This also matters for page directories on any chip when using a 64K page
* granule. Having 4K PDs packed into a 64K page saves a bunch of memory. Even
* more so for the 256B PDs on Pascal+.
*
* The pd cache is basically just a slab allocator. Each instance of the nvgpu
* driver makes one of these structs:
@@ -68,23 +71,91 @@
* size is page size or larger and choose the correct allocation scheme - either
* from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
* allocated by nvgpu_pd_alloc().
*
* Since the top level PD (the PDB) is a page aligned pointer but less than a
* page size the direct functions must be used for allocating PDBs. Otherwise
* there would be alignment issues for the PDBs when they get packed.
*/
/*
* Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
* structure is of course depending on this. The MIN_SHIFT define is the right
* number of bits to shift to determine which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN 256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#if PAGE_SIZE == 4096
#define NVGPU_PD_CACHE_COUNT 4U
#elif PAGE_SIZE == 65536
#define NVGPU_PD_CACHE_COUNT 8U
#else
#error "Unsupported page size."
#endif
struct nvgpu_pd_mem_entry {
struct nvgpu_mem mem;
/*
* Size of the page directories (not the mem). alloc_map is a bitmap
* showing which PDs have been allocated. The size of mem will always
* be one page. pd_size will always be a power of 2.
*/
u32 pd_size;
DECLARE_BITMAP(alloc_map, PAGE_SIZE / NVGPU_PD_CACHE_MIN);
u32 allocs;
struct nvgpu_list_node list_entry;
struct nvgpu_rbtree_node tree_entry;
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, list_entry));
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, tree_entry));
};
/*
* A cache for allocating PD memory from. This enables smaller PDs to be packed
* into single pages.
*
* This is fairly complex so see the documentation in pd_cache.c for a full
* description of how this is organized.
*/
struct nvgpu_pd_cache {
/*
* Array of lists of full nvgpu_pd_mem_entries and partially full (or
* empty) nvgpu_pd_mem_entries.
*/
struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
/*
* Tree of all allocated struct nvgpu_mem's for fast look up.
*/
struct nvgpu_rbtree_node *mem_tree;
/*
* All access to the cache must be locked. This protects the lists and
* the rb tree.
*/
struct nvgpu_mutex lock;
};
static u32 nvgpu_pd_cache_nr(u32 bytes)
{
return ilog2((unsigned long)bytes >>
((unsigned long)NVGPU_PD_CACHE_MIN_SHIFT - 1UL));
}
static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
{
u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
return mask_offset - 1U;
return PAGE_SIZE / pentry->pd_size;
}
int nvgpu_pd_cache_init(struct gk20a *g)
@@ -123,6 +194,7 @@ int nvgpu_pd_cache_init(struct gk20a *g)
}
g->mm.pd_cache = cache;
pd_dbg(g, "PD cache initialized!");
return 0;
@@ -151,8 +223,8 @@ void nvgpu_pd_cache_fini(struct gk20a *g)
* Note: this does not need the cache lock since it does not modify any of the
* PD cache data structures.
*/
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
static int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
{
int err;
unsigned long flags = 0;
@@ -225,7 +297,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
* This allocates the very first PD table in the set of tables in this
* nvgpu_pd_mem_entry.
*/
pentry->alloc_map = 1;
set_bit(0, pentry->alloc_map);
pentry->allocs = 1;
/*
* Now update the nvgpu_gmmu_pd to reflect this allocation.
@@ -247,22 +320,24 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
{
unsigned long bit_offs;
u32 mem_offs;
u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
/*
* Find and allocate an open PD.
*/
bit_offs = ffz(pentry->alloc_map);
bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
mem_offs = bit_offs * pentry->pd_size;
pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu nr_bits=%d src=0x%p",
bit_offs, nr_bits, pentry);
/* Bit map full. Somethings wrong. */
if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
if (WARN_ON(bit_offs >= nr_bits)) {
return -ENOMEM;
}
pentry->alloc_map |= BIT64(bit_offs);
pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
set_bit((int)bit_offs, pentry->alloc_map);
pentry->allocs += 1U;
/*
* First update the pd.
@@ -274,7 +349,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
/*
* Now make sure the pentry is in the correct list (full vs partial).
*/
if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
if (pentry->allocs >= nr_bits) {
pd_dbg(g, "Adding pentry to full list!");
nvgpu_list_del(&pentry->list_entry);
nvgpu_list_add(&pentry->list_entry,
@@ -369,7 +444,8 @@ int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
return err;
}
void nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
static void nvgpu_pd_cache_free_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd)
{
pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);
@@ -397,13 +473,13 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
struct nvgpu_pd_mem_entry *pentry,
struct nvgpu_gmmu_pd *pd)
{
u32 index = pd->mem_offs / pentry->pd_size;
u32 bit = 1 << index;
u32 bit = pd->mem_offs / pentry->pd_size;
/* Mark entry as free. */
pentry->alloc_map &= ~bit;
clear_bit((int)bit, pentry->alloc_map);
pentry->allocs -= 1U;
if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
if (pentry->allocs > 0U) {
/*
* Partially full still. If it was already on the partial list
* this just re-adds it.

View File

@@ -109,7 +109,7 @@ static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
struct gk20a *g = vm->mm->g;
int i;
nvgpu_pd_cache_free_direct(g, pdb);
nvgpu_pd_free(vm, pdb);
if (pdb->entries == NULL) {
return;
@@ -521,7 +521,7 @@ clean_up_allocators:
}
clean_up_page_tables:
/* Cleans up nvgpu_gmmu_init_page_table() */
nvgpu_pd_cache_free_direct(g, &vm->pdb);
nvgpu_pd_free(vm, &vm->pdb);
clean_up_vgpu_vm:
#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
if (g->is_virtual)

View File

@@ -364,7 +364,7 @@ int gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
struct vm_gk20a *vm)
{
u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
u64 pdb_addr = nvgpu_pde_gpu_addr(g, &vm->pdb);
u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
u32 pdb_addr_hi = u64_hi32(pdb_addr);

View File

@@ -380,7 +380,7 @@ const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
struct vm_gk20a *vm)
{
u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
u64 pdb_addr = nvgpu_pde_gpu_addr(g, &vm->pdb);
u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
u32 pdb_addr_hi = u64_hi32(pdb_addr);

View File

@@ -49,73 +49,6 @@ enum gk20a_mem_rw_flag {
gk20a_mem_flag_write_only = 2, /* WO */
};
/*
* Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
* structure is of course depending on this. The MIN_SHIFT define is the right
* number of bits to shift to determine which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN 256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#define NVGPU_PD_CACHE_COUNT 4U
struct nvgpu_pd_mem_entry {
struct nvgpu_mem mem;
/*
* Size of the page directories (not the mem). bmap is a bitmap showing
* which PDs have been allocated. The size of mem will always be one
* page. pd_size will always be a power of 2.
*/
u32 pd_size;
unsigned long alloc_map;
struct nvgpu_list_node list_entry;
struct nvgpu_rbtree_node tree_entry;
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, list_entry));
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, tree_entry));
};
/*
* A cache for allocating PD memory from. This enables smaller PDs to be packed
* into single pages.
*
* This is fairly complex so see the documentation in pd_cache.c for a full
* description of how this is organized.
*/
struct nvgpu_pd_cache {
/*
* Array of lists of full nvgpu_pd_mem_entries and partially full (or
* empty) nvgpu_pd_mem_entries.
*/
struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
/*
* Tree of all allocated struct nvgpu_mem's for fast look up.
*/
struct nvgpu_rbtree_node *mem_tree;
/*
* All access to the cache much be locked. This protects the lists and
* the rb tree.
*/
struct nvgpu_mutex lock;
};
/*
* GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
* in the GMMU.
@@ -253,11 +186,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes);
void nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
int nvgpu_pd_cache_init(struct gk20a *g);
void nvgpu_pd_cache_fini(struct gk20a *g);
u64 nvgpu_pde_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
/*
* Some useful routines that are shared across chips.