Open source GPL/LGPL release

2025-12-24 10:34:43 +03:00 · 2022-07-21 16:03:29 -07:00
commit f338182221
2260 changed files with 576813 additions and 0 deletions
--- a/drivers/gpu/nvgpu/common/mm/gmmu/page_table.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/page_table.c
--- a/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/bug.h>
+#include <nvgpu/log.h>
+#include <nvgpu/dma.h>
+#include <nvgpu/gmmu.h>
+#include <nvgpu/nvgpu_mem.h>
+#include <nvgpu/list.h>
+#include <nvgpu/log2.h>
+#include <nvgpu/gk20a.h>
+#include <nvgpu/enabled.h>
+#include <nvgpu/static_analysis.h>
+
+#include "pd_cache_priv.h"
+
+/*
+ * Map a list node embedded in a struct nvgpu_pd_mem_entry back to the entry
+ * that contains it.
+ *
+ * @node: address of the list_entry member of an nvgpu_pd_mem_entry.
+ *
+ * Returns the containing entry. The result is computed purely by subtracting
+ * the member offset from @node, so @node must really be the list_entry of an
+ * nvgpu_pd_mem_entry.
+ */
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
+{
+	uintptr_t member_addr = (uintptr_t)node;
+	uintptr_t member_offs =
+		(uintptr_t)offsetof(struct nvgpu_pd_mem_entry, list_entry);
+
+	return (struct nvgpu_pd_mem_entry *)(member_addr - member_offs);
+}
+
+/*
+ * Map an rbtree node embedded in a struct nvgpu_pd_mem_entry back to the
+ * entry that contains it.
+ *
+ * @node: address of the tree_entry member of an nvgpu_pd_mem_entry.
+ *
+ * Returns the containing entry. The result is computed purely by subtracting
+ * the member offset from @node, so @node must really be the tree_entry of an
+ * nvgpu_pd_mem_entry.
+ */
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
+{
+	uintptr_t member_addr = (uintptr_t)node;
+	uintptr_t member_offs =
+		(uintptr_t)offsetof(struct nvgpu_pd_mem_entry, tree_entry);
+
+	return (struct nvgpu_pd_mem_entry *)(member_addr - member_offs);
+}
+
+/*
+ * Translate a PD size in bytes into the index of the cache list that holds
+ * entries of that size.
+ *
+ * The size is shifted right by (NVGPU_PD_CACHE_MIN_SHIFT - 1) bits and the
+ * ilog2() of the shifted value is the index. The value is asserted to fit in
+ * a u32 before it is narrowed.
+ */
+static u32 nvgpu_pd_cache_nr(u32 bytes)
+{
+	unsigned long shift = (unsigned long)NVGPU_PD_CACHE_MIN_SHIFT - 1UL;
+	unsigned long idx = ilog2((unsigned long)bytes >> shift);
+
+	nvgpu_assert(idx <= U32_MAX);
+
+	return (u32)idx;
+}
+
+/*
+ * Number of PDs of pentry->pd_size bytes that fit in one cache allocation of
+ * NVGPU_PD_CACHE_SIZE bytes. A pd_size of zero is a bug.
+ */
+static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
+{
+	u32 slab_bytes;
+
+	BUG_ON(pentry->pd_size == 0);
+
+	slab_bytes = nvgpu_safe_cast_u64_to_u32(NVGPU_PD_CACHE_SIZE);
+
+	return slab_bytes / pentry->pd_size;
+}
+
+/*
+ * Return the _physical_ address of a page directory: the address of the
+ * nvgpu_mem backing the PD plus the PD's offset inside that memory.
+ *
+ * When NVGPU_SUPPORT_NVLINK is enabled the base address is taken from
+ * nvgpu_mem_get_phys_addr(), otherwise from nvgpu_mem_get_addr(). The final
+ * addition is overflow checked.
+ */
+u64 nvgpu_pd_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
+{
+	u64 base = nvgpu_is_enabled(g, NVGPU_SUPPORT_NVLINK) ?
+			nvgpu_mem_get_phys_addr(g, pd->mem) :
+			nvgpu_mem_get_addr(g, pd->mem);
+
+	return nvgpu_safe_add_u64(base, U64(pd->mem_offs));
+}
+
+/*
+ * Convert a PD entry index into an offset counted in 32-bit words: the index
+ * is multiplied by the level's entry_size (overflow checked) and the product
+ * is divided by sizeof(u32).
+ */
+u32 nvgpu_pd_offset_from_index(const struct gk20a_mmu_level *l, u32 pd_idx)
+{
+	u32 scaled = nvgpu_safe_mult_u32(pd_idx, l->entry_size);
+
+	return scaled / U32(sizeof(u32));
+}
+
+/*
+ * Write one 32-bit word into a page directory.
+ *
+ * @g:    GPU driver struct.
+ * @pd:   PD to write to.
+ * @w:    word index relative to the start of the PD.
+ * @data: value to store.
+ *
+ * The PD may sit at a non-zero offset inside its backing nvgpu_mem, so the
+ * PD's own offset (converted to words) is added to @w before the write. Both
+ * the addition and the narrowing to u32 are checked.
+ */
+void nvgpu_pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
+		    size_t w, u32 data)
+{
+	u64 pd_first_word = pd->mem_offs / sizeof(u32);
+	u64 word = nvgpu_safe_add_u64(pd_first_word, w);
+
+	nvgpu_mem_wr32(g, pd->mem, nvgpu_safe_cast_u64_to_u32(word), data);
+}
+
+/*
+ * Create the PD cache for @g if it does not exist yet.
+ *
+ * Returns 0 on success (including when the cache already exists) and -ENOMEM
+ * if the cache struct cannot be allocated.
+ */
+int nvgpu_pd_cache_init(struct gk20a *g)
+{
+	struct nvgpu_pd_cache *new_cache;
+	u32 slab;
+
+	/*
+	 * finalize_poweron() calls this on every power on, so an existing
+	 * cache must be left alone rather than rebuilt.
+	 */
+	if (g->mm.pd_cache != NULL) {
+		return 0;
+	}
+
+	new_cache = nvgpu_kzalloc(g, sizeof(*new_cache));
+	if (new_cache == NULL) {
+		nvgpu_err(g, "Failed to alloc pd_cache!");
+		return -ENOMEM;
+	}
+
+	/* Every size class starts with empty full and partial lists. */
+	for (slab = 0U; slab < NVGPU_PD_CACHE_COUNT; slab++) {
+		nvgpu_init_list_node(&new_cache->full[slab]);
+		nvgpu_init_list_node(&new_cache->partial[slab]);
+	}
+
+	new_cache->mem_tree = NULL;
+	nvgpu_mutex_init(&new_cache->lock);
+
+	/* Publish the cache only once it is fully set up. */
+	g->mm.pd_cache = new_cache;
+	pd_dbg(g, "PD cache initialized!");
+
+	return 0;
+}
+
+/*
+ * Tear down the PD cache of @g. Does nothing if no cache exists.
+ *
+ * Every full and partial list is asserted to be empty, i.e. all cached PDs
+ * are expected to have been freed already. The lock set up by
+ * nvgpu_pd_cache_init() is destroyed before the cache struct itself is
+ * released so the mutex is not leaked.
+ */
+void nvgpu_pd_cache_fini(struct gk20a *g)
+{
+	u32 i;
+	struct nvgpu_pd_cache *cache = g->mm.pd_cache;
+
+	if (cache == NULL) {
+		return;
+	}
+
+	for (i = 0U; i < NVGPU_PD_CACHE_COUNT; i++) {
+		nvgpu_assert(nvgpu_list_empty(&cache->full[i]));
+		nvgpu_assert(nvgpu_list_empty(&cache->partial[i]));
+	}
+
+	/* Pair the nvgpu_mutex_init() done when the cache was created. */
+	nvgpu_mutex_destroy(&cache->lock);
+
+	nvgpu_kfree(g, cache);
+	g->mm.pd_cache = NULL;
+}
+
+/*
+ * Simple pass-through allocator for PDs that do not go through the cache. It
+ * is used for PDs of at least NVGPU_PD_CACHE_SIZE bytes and as the fallback
+ * when a new cache slab cannot be allocated.
+ *
+ * @g:     GPU driver struct.
+ * @pd:    PD to populate. On success pd->mem points at a newly allocated
+ *         nvgpu_mem, pd->mem_offs is 0 and pd->cached is false.
+ * @bytes: size of the PD in bytes.
+ *
+ * Returns 0 on success, or -ENOMEM if either the nvgpu_mem struct or the DMA
+ * memory cannot be allocated. On failure pd->mem is left NULL so that a later
+ * nvgpu_pd_free() on the same PD cannot free the tracking struct a second
+ * time.
+ *
+ * Note: this does not need the cache lock since it does not modify any of the
+ * PD cache data structures.
+ */
+int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+				       struct nvgpu_gmmu_pd *pd, u32 bytes)
+{
+	int err;
+	unsigned long flags = 0;
+
+	pd_dbg(g, "PD-Alloc [D] %u bytes", bytes);
+
+	pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem));
+	if (pd->mem == NULL) {
+		nvgpu_err(g, "OOM allocating nvgpu_mem struct!");
+		return -ENOMEM;
+	}
+
+	/*
+	 * If bytes == NVGPU_CPU_PAGE_SIZE then it's impossible to get a
+	 * discontiguous DMA allocation. Some DMA implementations may, despite
+	 * this fact, still use the contiguous pool for page sized
+	 * allocations. As such only request explicitly contiguous allocs if
+	 * the page directory is larger than the page size. Also, of course,
+	 * this is all only relevant for GPUs not using an IOMMU. If there is
+	 * an IOMMU, DMA allocs are always going to be virtually contiguous and
+	 * we don't have to force the underlying allocations to be physically
+	 * contiguous as well.
+	 */
+	if (!nvgpu_iommuable(g) && (bytes > NVGPU_CPU_PAGE_SIZE)) {
+		flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
+	}
+
+	err = nvgpu_dma_alloc_flags(g, flags, bytes, pd->mem);
+	if (err != 0) {
+		nvgpu_err(g, "OOM allocating page directory!");
+		nvgpu_kfree(g, pd->mem);
+		/* Do not leave a dangling pointer behind in the PD. */
+		pd->mem = NULL;
+		return -ENOMEM;
+	}
+
+	pd->cached = false;
+	pd->mem_offs = 0;
+
+	return 0;
+}
+
+/*
+ * Create a fresh nvgpu_pd_mem_entry (one slab of NVGPU_PD_CACHE_SIZE bytes),
+ * hand out its first PD and fill in @pd to describe that allocation.
+ *
+ * @g:     GPU driver struct.
+ * @cache: PD cache that the new entry is added to.
+ * @pd:    PD to populate.
+ * @bytes: PD size in bytes; it becomes the pd_size of the new entry.
+ *
+ * If the slab DMA allocation fails with -ENOMEM the request is retried as a
+ * direct, uncached allocation of @bytes bytes.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
+				    struct nvgpu_pd_cache *cache,
+				    struct nvgpu_gmmu_pd *pd,
+				    u32 bytes)
+{
+	struct nvgpu_pd_mem_entry *slab;
+	u64 dma_flags = 0UL;
+	int32_t ret;
+
+	pd_dbg(g, "PD-Alloc [C]   New: offs=0");
+
+	slab = nvgpu_kzalloc(g, sizeof(*slab));
+	if (slab == NULL) {
+		nvgpu_err(g, "OOM allocating pentry!");
+		return -ENOMEM;
+	}
+
+	/* Without an IOMMU a multi-page slab must be physically contiguous. */
+	if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > NVGPU_CPU_PAGE_SIZE)) {
+		dma_flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
+	}
+
+	ret = nvgpu_dma_alloc_flags(g, dma_flags,
+				    NVGPU_PD_CACHE_SIZE, &slab->mem);
+	if (ret == -ENOMEM) {
+		/*
+		 * Not enough contiguous space for a whole slab, but a direct
+		 * allocation of just this PD may still work.
+		 */
+		nvgpu_kfree(g, slab);
+		return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+	}
+	if (ret != 0) {
+		nvgpu_kfree(g, slab);
+		nvgpu_err(g, "Unable to DMA alloc!");
+		return -ENOMEM;
+	}
+
+	/*
+	 * The very first PD of the slab (bit 0, offset 0) goes to the caller,
+	 * which leaves the slab partially used.
+	 */
+	slab->pd_size = bytes;
+	nvgpu_list_add(&slab->list_entry,
+		       &cache->partial[nvgpu_pd_cache_nr(bytes)]);
+	nvgpu_set_bit(0U, slab->alloc_map);
+	slab->allocs = 1;
+
+	/* Describe the allocation in the caller's PD. */
+	pd->mem = &slab->mem;
+	pd->mem_offs = 0;
+	pd->cached = true;
+
+	/* Index the slab by the address of its nvgpu_mem for later look up. */
+	slab->tree_entry.key_start = (u64)(uintptr_t)&slab->mem;
+	nvgpu_rbtree_insert(&slab->tree_entry, &cache->mem_tree);
+
+	return 0;
+}
+
+/*
+ * Allocate one PD out of an existing, partially used nvgpu_pd_mem_entry.
+ *
+ * @g:      GPU driver struct.
+ * @cache:  PD cache owning @pentry.
+ * @pentry: entry with at least one free PD slot.
+ * @pd:     PD to populate; it ends up pointing into @pentry's memory at the
+ *          offset of the slot that was picked and is marked as cached.
+ *
+ * The first clear bit in the allocation bitmap selects the slot. If that was
+ * the last free slot the entry is moved from the partial to the full list.
+ *
+ * Always returns 0; a bitmap with no free slot trips an assertion.
+ */
+static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
+					     struct nvgpu_pd_cache *cache,
+					     struct nvgpu_pd_mem_entry *pentry,
+					     struct nvgpu_gmmu_pd *pd)
+{
+	u32 bit_offs;
+	u32 mem_offs;
+	u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
+
+	/*
+	 * Find an open PD.
+	 */
+	bit_offs = nvgpu_safe_cast_u64_to_u32(
+			find_first_zero_bit(pentry->alloc_map, nr_bits));
+
+	pd_dbg(g, "PD-Alloc [C]   Partial: offs=%u nr_bits=%u src=0x%p",
+	       bit_offs, nr_bits, pentry);
+
+	/*
+	 * Bit map full. Something's wrong. Check this before the bit offset is
+	 * used for anything else.
+	 */
+	nvgpu_assert(bit_offs < nr_bits);
+
+	mem_offs = nvgpu_safe_mult_u32(bit_offs, pentry->pd_size);
+
+	nvgpu_set_bit(bit_offs, pentry->alloc_map);
+	pentry->allocs = nvgpu_safe_add_u32(pentry->allocs, 1U);
+
+	/*
+	 * First update the pd.
+	 */
+	pd->mem = &pentry->mem;
+	pd->mem_offs = mem_offs;
+	pd->cached = true;
+
+	/*
+	 * Now make sure the pentry is in the correct list (full vs partial).
+	 */
+	if (pentry->allocs >= nr_bits) {
+		pd_dbg(g, "Adding pentry to full list!");
+		nvgpu_list_del(&pentry->list_entry);
+		nvgpu_list_add(&pentry->list_entry,
+			&cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]);
+	}
+
+	return 0;
+}
+
+/*
+ * Fetch a partially used nvgpu_pd_mem_entry for PDs of @bytes bytes.
+ *
+ * Returns the first entry on the matching partial list, or NULL when that
+ * list is empty.
+ */
+static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial(
+	struct nvgpu_pd_cache *cache, u32 bytes)
+{
+	u32 slab_idx = nvgpu_pd_cache_nr(bytes);
+	struct nvgpu_list_node *head = &cache->partial[slab_idx];
+
+	if (!nvgpu_list_empty(head)) {
+		return nvgpu_list_first_entry(head,
+					      nvgpu_pd_mem_entry,
+					      list_entry);
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate a PD of @bytes bytes from the PD cache.
+ *
+ * @bytes must be a power of two and at least NVGPU_PD_CACHE_MIN, otherwise
+ * -EINVAL is returned. It is also asserted to be smaller than
+ * NVGPU_PD_CACHE_SIZE. The PD comes from a partially used entry when one
+ * exists for this size, and from a newly created entry otherwise.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
+				struct nvgpu_gmmu_pd *pd, u32 bytes)
+{
+	struct nvgpu_pd_mem_entry *partial;
+	bool size_ok = false;
+	int ret;
+
+	pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
+
+	/* The power-of-two test is only reached for sizes that are big enough. */
+	if (bytes >= NVGPU_PD_CACHE_MIN) {
+		size_ok = (bytes & nvgpu_safe_sub_u32(bytes, 1U)) == 0U;
+	}
+	if (!size_ok) {
+		pd_dbg(g, "PD-Alloc [C]   Invalid (bytes=%u)!", bytes);
+		return -EINVAL;
+	}
+
+	nvgpu_assert(bytes < NVGPU_PD_CACHE_SIZE);
+
+	partial = nvgpu_pd_cache_get_partial(cache, bytes);
+	if (partial != NULL) {
+		ret = nvgpu_pd_cache_alloc_from_partial(g, cache, partial, pd);
+	} else {
+		ret = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes);
+	}
+
+	if (ret != 0) {
+		nvgpu_err(g, "PD-Alloc [C] Failed!");
+	}
+
+	return ret;
+}
+
+/*
+ * Allocate the DMA memory for a page directory, taking care of the PD cache
+ * bookkeeping. On Parker and later GPUs some page directories are smaller
+ * than a page, so packing them together saves a lot of memory.
+ *
+ * @vm:    VM the PD belongs to.
+ * @pd:    PD to populate; pd->pd_size is set to @bytes on success.
+ * @bytes: PD size in bytes.
+ *
+ * Returns 0 on success or a negative error code. -ENOMEM is also returned
+ * (after asserting) when a cached allocation is requested but no PD cache
+ * exists.
+ */
+int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	int ret;
+
+	/*
+	 * PDs of at least NVGPU_PD_CACHE_SIZE bytes bypass the cache and get a
+	 * regular DMA allocation.
+	 */
+	if (bytes >= NVGPU_PD_CACHE_SIZE) {
+		ret = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+		if (ret == 0) {
+			pd->pd_size = bytes;
+		}
+
+		return ret;
+	}
+
+	if (g->mm.pd_cache == NULL) {
+		nvgpu_do_assert();
+		return -ENOMEM;
+	}
+
+	/* The cache lists and tree are only touched with the lock held. */
+	nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
+	ret = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes);
+	if (ret == 0) {
+		pd->pd_size = bytes;
+	}
+	nvgpu_mutex_release(&g->mm.pd_cache->lock);
+
+	return ret;
+}
+
+/*
+ * Release a PD that was allocated directly rather than from the cache: the
+ * DMA memory and the nvgpu_mem tracking struct are both freed and pd->mem is
+ * reset to NULL. A PD whose mem is already NULL is left untouched.
+ */
+static void nvgpu_pd_cache_free_direct(struct gk20a *g,
+				       struct nvgpu_gmmu_pd *pd)
+{
+	pd_dbg(g, "PD-Free  [D] 0x%p", pd->mem);
+
+	if (pd->mem != NULL) {
+		nvgpu_dma_free(g, pd->mem);
+		nvgpu_kfree(g, pd->mem);
+		pd->mem = NULL;
+	}
+}
+
+/*
+ * Destroy an nvgpu_pd_mem_entry: take it off its list and out of the look-up
+ * tree, free its DMA memory and finally release the entry itself.
+ */
+static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g,
+					  struct nvgpu_pd_cache *cache,
+					  struct nvgpu_pd_mem_entry *pentry)
+{
+	/* Detach the entry from the cache bookkeeping. */
+	nvgpu_list_del(&pentry->list_entry);
+	nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree);
+
+	/* Then give back the memory it owns. */
+	nvgpu_dma_free(g, &pentry->mem);
+	nvgpu_kfree(g, pentry);
+}
+
+/*
+ * Return one cached PD to the nvgpu_pd_mem_entry it was carved from.
+ *
+ * @g:      GPU driver struct.
+ * @cache:  PD cache owning @pentry.
+ * @pentry: entry that backs @pd.
+ * @pd:     PD being freed; pd->mem is NULL on return.
+ *
+ * The PD's slot is cleared in the allocation bitmap. If the entry still has
+ * live PDs it is (re)queued on the partial list, otherwise the whole entry is
+ * destroyed.
+ */
+static void nvgpu_pd_cache_do_free(struct gk20a *g,
+				   struct nvgpu_pd_cache *cache,
+				   struct nvgpu_pd_mem_entry *pentry,
+				   struct nvgpu_gmmu_pd *pd)
+{
+	u32 slot = pd->mem_offs / pentry->pd_size;
+
+	/* Mark the slot as free. */
+	nvgpu_clear_bit(slot, pentry->alloc_map);
+	pentry->allocs = nvgpu_safe_sub_u32(pentry->allocs, 1U);
+
+	if (pentry->allocs == 0U) {
+		/* Nothing left in this entry, so free it entirely. */
+		nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
+		pd->mem = NULL;
+		return;
+	}
+
+	/*
+	 * Still partially full.
+	 *
+	 * The memory used for the entries is still mapped, so on igpu make
+	 * sure the freed entries are invalidated so that the hw doesn't
+	 * accidentally try to prefetch non-existent fb memory.
+	 *
+	 * IOMMU prefetching of invalid pd entries causes an IOMMU fault, so
+	 * fill them with zero.
+	 */
+	if ((nvgpu_iommuable(g)) &&
+		(NVGPU_PD_CACHE_SIZE > NVGPU_CPU_SMALL_PAGE_SIZE) &&
+		(pd->mem->cpu_va != NULL)) {
+		u8 *pd_cpu = (u8 *)pd->mem->cpu_va + pd->mem_offs;
+
+		(void)memset(pd_cpu, 0, pd->pd_size);
+	}
+
+	/* If it was already on the partial list this just re-adds it. */
+	nvgpu_list_del(&pentry->list_entry);
+	nvgpu_list_add(&pentry->list_entry,
+		&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
+
+	pd->mem = NULL;
+}
+
+/*
+ * Find the nvgpu_pd_mem_entry that backs @pd by searching the cache's tree
+ * with the address held in pd->mem as the key.
+ *
+ * Returns the entry, or NULL if the search finds no node.
+ */
+static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
+	struct nvgpu_pd_cache *cache,
+	struct nvgpu_gmmu_pd *pd)
+{
+	struct nvgpu_rbtree_node *hit = NULL;
+	u64 key = (u64)(uintptr_t)pd->mem;
+
+	nvgpu_rbtree_search(key, &hit, cache->mem_tree);
+
+	return (hit != NULL) ? nvgpu_pd_mem_entry_from_tree_entry(hit) : NULL;
+}
+
+/*
+ * Free a cached PD: look up the nvgpu_pd_mem_entry backing it and give the
+ * PD's slot back. A PD with no matching entry triggers an assertion message
+ * and is otherwise ignored.
+ */
+static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache,
+				struct nvgpu_gmmu_pd *pd)
+{
+	struct nvgpu_pd_mem_entry *owner;
+
+	pd_dbg(g, "PD-Free  [C] 0x%p", pd->mem);
+
+	owner = nvgpu_pd_cache_look_up(cache, pd);
+	if (owner != NULL) {
+		nvgpu_pd_cache_do_free(g, cache, owner, pd);
+	} else {
+		nvgpu_do_assert_print(g, "Attempting to free non-existent pd");
+	}
+}
+
+/*
+ * Free a PD previously set up by nvgpu_pd_alloc().
+ *
+ * @vm: VM the PD belongs to.
+ * @pd: PD to free.
+ *
+ * PDs that are not marked as cached are released with a plain DMA free.
+ * Cached PDs are returned to the PD cache under the cache lock. A cached PD
+ * with no PD cache present is a driver bug: it is asserted on and the PD is
+ * left untouched, mirroring the check in nvgpu_pd_alloc().
+ */
+void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+
+	/*
+	 * Simple case: just DMA free.
+	 */
+	if (!pd->cached) {
+		nvgpu_pd_cache_free_direct(g, pd);
+		return;
+	}
+
+	if (g->mm.pd_cache == NULL) {
+		nvgpu_do_assert();
+		return;
+	}
+
+	nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
+	nvgpu_pd_cache_free(g, g->mm.pd_cache, pd);
+	nvgpu_mutex_release(&g->mm.pd_cache->lock);
+}
--- a/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache_priv.h
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache_priv.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NVGPU_GMMU_PD_CACHE_PRIV_H
+#define NVGPU_GMMU_PD_CACHE_PRIV_H
+
+/**
+ * @file
+ *
+ * Page directory cache private interface
+ * --------------------------------------
+ *
+ * To save memory when using sub-page sized PD levels in Pascal and beyond a way
+ * of packing PD tables together is necessary. If a PD table only requires 1024
+ * bytes, then it is possible to have 4 of these PDs in one page. This is even
+ * more pronounced for 256 byte PD tables.
+ *
+ * This also matters for page directories on any chip when using a 64K page
+ * granule. Having 4K PDs packed into a 64K page saves a bunch of memory. Even
+ * more so for the 256B PDs on Pascal+.
+ *
+ * The pd cache is basically just a slab allocator. Each instance of the nvgpu
+ * driver makes one of these structs:
+ *
+ *   struct nvgpu_pd_cache {
+ *      struct nvgpu_list_node		 full[NVGPU_PD_CACHE_COUNT];
+ *      struct nvgpu_list_node		 partial[NVGPU_PD_CACHE_COUNT];
+ *
+ *      struct nvgpu_rbtree_node	*mem_tree;
+ *   };
+ *
+ * There are two sets of lists, the full and the partial. The full lists contain
+ * pages of memory for which all the memory in that page is in use. The partial
+ * lists contain partially full pages of memory which can be used for more PD
+ * allocations. There are a couple of assumptions here:
+ *
+ *   1. PDs greater than or equal to the page size bypass the pd cache.
+ *   2. PDs are always power of 2 and at least %NVGPU_PD_CACHE_MIN bytes.
+ *
+ * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
+ * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
+ * 256, 512, 1024, and 2048 byte PDs.
+ *
+ * nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
+ * size is page size or larger and choose the correct allocation scheme - either
+ * from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
+ * allocated by nvgpu_pd_alloc().
+ */
+
+#include <nvgpu/bug.h>
+#include <nvgpu/log.h>
+#include <nvgpu/gmmu.h>
+#include <nvgpu/nvgpu_mem.h>
+#include <nvgpu/list.h>
+#include <nvgpu/rbtree.h>
+#include <nvgpu/lock.h>
+
+#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)
+
+/**
+ * Smallest PD size, in bytes, that the PD cache hands out. Requests for
+ * smaller PDs are rejected by the cache allocator. The number of different
+ * caches in the nvgpu_pd_cache structure is of course depending on this.
+ */
+#define NVGPU_PD_CACHE_MIN		256UL
+/**
+ * MIN_SHIFT is used to determine which list to use in the array of lists:
+ * nvgpu_pd_cache_nr() shifts the PD size right by (MIN_SHIFT - 1) bits and
+ * takes ilog2() of the result, so a PD of NVGPU_PD_CACHE_MIN bytes maps to
+ * list index 0.
+ */
+#define NVGPU_PD_CACHE_MIN_SHIFT	9UL
+
+/**
+ * Maximum PD cache count. This specifies the number of slabs; since each
+ * slab represents a PO2 increase in size a count of 8 leads to:
+ *
+ *   NVGPU_PD_CACHE_SIZE = 256B * 2^8 = 64KB
+ *
+ * and a count of 4 leads to 256B * 2^4 = 4KB.
+ *
+ * For Linux with 4K pages, if the cache size is larger than 4KB then we
+ * need to allocate from CMA. This puts a lot of pressure on the CMA space.
+ * For kernel with a PAGE_SIZE of 64K this isn't the case, so allow the
+ * PD cache size to be 64K if PAGE_SIZE > 4K (i.e PAGE_SIZE == 64K).
+ * Builds without __KERNEL__ defined always use a count of 8.
+ *
+ * NVGPU_PD_CACHE_SIZE, defined below, is the size in bytes of the memory
+ * behind one nvgpu_pd_mem_entry: NVGPU_PD_CACHE_MIN << NVGPU_PD_CACHE_COUNT.
+ */
+#ifdef __KERNEL__
+#  if NVGPU_CPU_PAGE_SIZE > 4096
+#    define NVGPU_PD_CACHE_COUNT	8UL
+#  else
+#    define NVGPU_PD_CACHE_COUNT	4UL
+#  endif
+#else
+#define NVGPU_PD_CACHE_COUNT		8UL
+#endif
+
+#define NVGPU_PD_CACHE_SIZE		(NVGPU_PD_CACHE_MIN * \
+						(1UL << NVGPU_PD_CACHE_COUNT))
+
+/**
+ * This structure describes a slab within the slab allocator: one DMA
+ * allocation of NVGPU_PD_CACHE_SIZE bytes that is carved up into PDs which
+ * all have the same size (pd_size).
+ */
+struct nvgpu_pd_mem_entry {
+	/**
+	 * DMA memory backing every PD in this slab. PDs allocated from the
+	 * slab point their nvgpu_gmmu_pd.mem at this member, and its address
+	 * is the key used to find the slab again in the cache's mem_tree.
+	 */
+	struct nvgpu_mem		mem;
+
+	/**
+	 * Size in bytes of each page directory in this slab (not the size of
+	 * mem).
+	 */
+	u32				pd_size;
+	/**
+	 * alloc_map is a bitmap showing which PDs have been allocated: bit N
+	 * covers the PD at byte offset N * pd_size inside mem. The size of mem
+	 * is always NVGPU_PD_CACHE_SIZE and pd_size will always be a power of
+	 * 2, so the bitmap is sized for the worst case of
+	 * NVGPU_PD_CACHE_MIN sized PDs.
+	 */
+	DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
+	/**
+	 * Number of PDs currently allocated from this slab. The slab is freed
+	 * when this drops to zero.
+	 */
+	u32				allocs;
+
+	/**
+	 * This is a list node within the list. The list node will be either from
+	 * a full or partial list in #nvgpu_pd_cache.
+	 */
+	struct nvgpu_list_node		list_entry;
+	/**
+	 * Node linking this slab into the mem_tree of #nvgpu_pd_cache, keyed
+	 * by the address of mem.
+	 */
+	struct nvgpu_rbtree_node	tree_entry;
+};
+
+/**
+ * A cache for allocating PD memory. This enables smaller PDs to be packed
+ * into single pages.
+ */
+struct nvgpu_pd_cache {
+	/**
+	 * Array of lists of full nvgpu_pd_mem_entries, i.e. entries in which
+	 * every PD slot is allocated. There is one list per PD size; the index
+	 * is derived from the PD size by nvgpu_pd_cache_nr().
+	 */
+	struct nvgpu_list_node		 full[NVGPU_PD_CACHE_COUNT];
+	/**
+	 * Array of lists of partially full nvgpu_pd_mem_entries, i.e. entries
+	 * that still have at least one free PD slot. Indexed the same way as
+	 * the full lists.
+	 */
+	struct nvgpu_list_node		 partial[NVGPU_PD_CACHE_COUNT];
+
+	/**
+	 * Tree of all allocated struct nvgpu_mem's for fast look up.
+	 */
+	struct nvgpu_rbtree_node	*mem_tree;
+
+	/**
+	 * All access to the cache must be locked. This protects the lists and
+	 * the rb tree.
+	 */
+	struct nvgpu_mutex		 lock;
+};
+
+#endif /* NVGPU_GMMU_PD_CACHE_PRIV_H */
--- a/drivers/gpu/nvgpu/common/mm/gmmu/pte.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/pte.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/gk20a.h>
+#include <nvgpu/static_analysis.h>
+#include <nvgpu/string.h>
+#include <nvgpu/gmmu.h>
+
+/*
+ * Report the default GMMU big page size, which is 64K.
+ */
+u32 nvgpu_gmmu_default_big_page_size(void)
+{
+	const u32 big_page_bytes = U32(SZ_64K);
+
+	return big_page_bytes;
+}
+
+/*
+ * Pick the aperture mask for a mapping.
+ *
+ * MSS NVLINK HW settings are in force_snoop mode, which forces all GPU
+ * mappings to be coherent. By default the mem aperture is set to
+ * sysmem_non_coherent and will use L2 atomics. When the memory attribute asks
+ * for platform atomics (and NVGPU_SUPPORT_PLATFORM_ATOMIC is enabled) the
+ * target pte aperture is switched to sysmem_coherent so that the rmw atomic
+ * capability is used.
+ *
+ * The mask itself is then chosen by nvgpu_aperture_mask_raw() from the three
+ * masks supplied by the caller.
+ */
+u32 nvgpu_gmmu_aperture_mask(struct gk20a *g,
+				  enum nvgpu_aperture mem_ap,
+				  bool platform_atomic_attr,
+				  u32 sysmem_mask,
+				  u32 sysmem_coh_mask,
+				  u32 vidmem_mask)
+{
+	enum nvgpu_aperture effective_ap = mem_ap;
+
+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_PLATFORM_ATOMIC) &&
+			     platform_atomic_attr) {
+		effective_ap = APERTURE_SYSMEM_COH;
+	}
+
+	return nvgpu_aperture_mask_raw(g, effective_ap, sysmem_mask,
+				       sysmem_coh_mask, vidmem_mask);
+}
+
+/*
+ * Render the boolean mapping attributes as a five character flag string.
+ *
+ * Each position holds a letter when the attribute is set and '-' otherwise,
+ * in the order cacheable (C), sparse (S), priv (P), valid (V),
+ * platform_atomic (A).
+ *
+ * @dest must have room for six bytes (five flags plus the terminator).
+ *
+ * Returns @dest.
+ */
+static char *map_attrs_to_str(char *dest, struct nvgpu_gmmu_attrs *attrs)
+{
+	char *out = dest;
+
+	*out++ = attrs->cacheable ? 'C' : '-';
+	*out++ = attrs->sparse ? 'S' : '-';
+	*out++ = attrs->priv ? 'P' : '-';
+	*out++ = attrs->valid ? 'V' : '-';
+	*out++ = attrs->platform_atomic ? 'A' : '-';
+	*out = '\0';
+
+	return dest;
+}
+
+/*
+ * Emit one debug line describing a PTE that is being programmed.
+ *
+ * @g:                    GPU driver struct.
+ * @attrs:                attributes of the mapping; aperture, rw_flag, kind_v
+ *                        and the boolean flags are printed.
+ * @vm_name:              name of the VM, printed as is.
+ * @pd_idx:               index of the entry, printed as "i=".
+ * @mmu_level_entry_size: entry size of the MMU level, printed as "size=".
+ * @virt_addr:            GPU virtual address of the mapping.
+ * @phys_addr:            physical address of the mapping.
+ * @page_size:            page size in bytes; printed in KB.
+ * @pte_w:                the PTE words; pte_w[0] and pte_w[1] are printed, so
+ *                        at least two words must be readable.
+ *
+ * With CONFIG_NVGPU_COMPRESSION the ctag line number of this page is printed
+ * as well, otherwise that field is an empty string.
+ */
+void nvgpu_pte_dbg_print(struct gk20a *g,
+		struct nvgpu_gmmu_attrs *attrs,
+		const char *vm_name, u32 pd_idx, u32 mmu_level_entry_size,
+		u64 virt_addr, u64 phys_addr, u32 page_size, u32 *pte_w)
+{
+	char attrs_str[6];
+	char ctag_str[32] = "\0";
+	const char *aperture_str = nvgpu_aperture_str(attrs->aperture);
+	const char *perm_str = nvgpu_gmmu_perm_str(attrs->rw_flag);
+#ifdef CONFIG_NVGPU_COMPRESSION
+	u64 ctag_tmp = attrs->ctag;
+	u32 str_len = 0U;
+	u32 ctag_num = 0U;
+
+	/*
+	 * attrs->ctag is incremented to count current page size as well.
+	 * Subtract to get this page's ctag line number.
+	 */
+	if (ctag_tmp != 0ULL) {
+		ctag_tmp = nvgpu_safe_sub_u64(ctag_tmp, page_size);
+	}
+
+	ctag_num = nvgpu_safe_cast_u64_to_u32(ctag_tmp /
+					g->ops.fb.compression_page_size(g));
+	(void)strcpy(ctag_str, "ctag=0x\0");
+	str_len = (u32)strlen(ctag_str);
+	/* Append the hex ctag number, leaving room for the terminator. */
+	(void)nvgpu_strnadd_u32(ctag_str + str_len, ctag_num,
+		nvgpu_safe_sub_u32(31U, str_len), 16U);
+#endif
+	(void)map_attrs_to_str(attrs_str, attrs);
+	/* page_size is unsigned, so its KB value is printed with %u. */
+	pte_dbg(g, attrs,
+		"vm=%s "
+		"PTE: i=%-4u size=%-2u | "
+		"GPU %#-12llx  phys %#-12llx "
+		"pgsz: %3ukb perm=%-2s kind=%#02x APT=%-6s %-5s "
+		"%s "
+		"[0x%08x, 0x%08x]",
+		vm_name,
+		pd_idx, mmu_level_entry_size,
+		virt_addr, phys_addr,
+		page_size >> 10,
+		perm_str,
+		attrs->kind_v,
+		aperture_str,
+		attrs_str,
+		ctag_str,
+		pte_w[1], pte_w[0]);
+}