diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index bbae54da6..33930a686 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -194,13 +194,16 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
 	};
 
 	/*
-	 * PDB size here must be one page so that its address is page size
+	 * PDB size here must be at least 4096 bytes so that its address is 4K
 	 * aligned. Although lower PDE tables can be aligned at 256B boundaries
-	 * the main PDB must be page aligned.
+	 * the PDB must be 4K aligned.
+	 *
+	 * Currently PAGE_SIZE is used, even when 64K, to work around an issue
+	 * with the PDB TLB invalidate code not being pd_cache aware yet.
 	 */
 	pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
 
-	err = nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
+	err = nvgpu_pd_alloc(vm, &vm->pdb, pdb_size);
 	if (WARN_ON(err != 0)) {
 		return err;
 	}
@@ -217,7 +220,7 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
 /*
  * Return the _physical_ address of a page directory.
  */
-static u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
+u64 nvgpu_pde_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
 {
 	u64 page_addr;
 
@@ -434,7 +437,7 @@ static int __set_pd_level(struct vm_gk20a *vm,
 	 * target addr is the real physical address we are aiming for.
 	 */
 	target_addr = (next_pd != NULL) ?
-		nvgpu_pde_phys_addr(g, next_pd) :
+		nvgpu_pde_gpu_addr(g, next_pd) :
 		phys_addr;
 
 	l->update_entry(vm, l,
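A note on the gmmu.c hunk above: ALIGN() rounds the top-level PD size up to PAGE_SIZE, so on a 64K-page kernel the PDB request becomes a full 64K page even when the directory itself needs far less. Below is a minimal userspace sketch of that rounding; ALIGN mirrors the kernel macro, and the 2048-byte PD size is a hypothetical example, not a value from the patch.

    #include <stdio.h>

    #define ALIGN(x, a)   (((x) + ((a) - 1)) & ~((a) - 1))

    int main(void)
    {
            unsigned long pd_bytes = 2048UL; /* hypothetical top-level PD size */

            /* On a 4K kernel the PDB rounds up to one 4K page... */
            printf("4K kernel:  pdb_size = %lu\n", ALIGN(pd_bytes, 4096UL));
            /*
             * ...but on a 64K kernel it rounds up to a full 64K page, which
             * is why packing sub-page PDs matters there too.
             */
            printf("64K kernel: pdb_size = %lu\n", ALIGN(pd_bytes, 65536UL));
            return 0;
    }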
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
index 693330814..833e02438 100644
--- a/drivers/gpu/nvgpu/common/mm/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -36,11 +36,14 @@
 /**
  * DOC: PD cache
  *
- * In the name of saving memory with the many sub-page sized PD levels in Pascal
- * and beyond a way of packing PD tables together is necessary. This code here
- * does just that. If a PD table only requires 1024 bytes, then it is possible
- * to have 4 of these PDs in one page. This is even more pronounced for 256 byte
- * PD tables.
+ * To save memory when using sub-page sized PD levels in Pascal and beyond a way
+ * of packing PD tables together is necessary. If a PD table only requires 1024
+ * bytes, then it is possible to have 4 of these PDs in one page. This is even
+ * more pronounced for 256 byte PD tables.
+ *
+ * This also matters for page directories on any chip when using a 64K page
+ * granule. Having 4K PDs packed into a 64K page saves a bunch of memory. Even
+ * more so for the 256B PDs on Pascal+.
  *
  * The pd cache is basially just a slab allocator. Each instance of the nvgpu
  * driver makes one of these structs:
@@ -68,23 +71,91 @@
  * size is page size or larger and choose the correct allocation scheme - either
  * from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
  * allocated by nvgpu_pd_alloc().
- *
- * Since the top level PD (the PDB) is a page aligned pointer but less than a
- * page size the direct functions must be used for allocating PDBs. Otherwise
- * there would be alignment issues for the PDBs when they get packed.
  */
 
+/*
+ * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
+ * structure of course depends on this. The MIN_SHIFT define is the right
+ * number of bits to shift to determine which list to use in the array of lists.
+ */
+#define NVGPU_PD_CACHE_MIN		256U
+#define NVGPU_PD_CACHE_MIN_SHIFT	9U
+#if PAGE_SIZE == 4096
+#define NVGPU_PD_CACHE_COUNT		4U
+#elif PAGE_SIZE == 65536
+#define NVGPU_PD_CACHE_COUNT		8U
+#else
+#error "Unsupported page size."
+#endif
+
+struct nvgpu_pd_mem_entry {
+	struct nvgpu_mem mem;
+
+	/*
+	 * Size of the page directories (not the mem). alloc_map is a bitmap
+	 * showing which PDs have been allocated. The size of mem will always
+	 * be one page. pd_size will always be a power of 2.
+	 */
+	u32 pd_size;
+	DECLARE_BITMAP(alloc_map, PAGE_SIZE / NVGPU_PD_CACHE_MIN);
+	u32 allocs;
+
+	struct nvgpu_list_node list_entry;
+	struct nvgpu_rbtree_node tree_entry;
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
+{
+	return (struct nvgpu_pd_mem_entry *)
+		((uintptr_t)node -
+		 offsetof(struct nvgpu_pd_mem_entry, list_entry));
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
+{
+	return (struct nvgpu_pd_mem_entry *)
+		((uintptr_t)node -
+		 offsetof(struct nvgpu_pd_mem_entry, tree_entry));
+};
+
+/*
+ * A cache for allocating PD memory from. This enables smaller PDs to be packed
+ * into single pages.
+ *
+ * This is fairly complex so see the documentation above for a full
+ * description of how this is organized.
+ */
+struct nvgpu_pd_cache {
+	/*
+	 * Array of lists of full nvgpu_pd_mem_entries and partially full (or
+	 * empty) nvgpu_pd_mem_entries.
+	 */
+	struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+	struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+
+	/*
+	 * Tree of all allocated struct nvgpu_mem's for fast look up.
+	 */
+	struct nvgpu_rbtree_node *mem_tree;
+
+	/*
+	 * All access to the cache must be locked. This protects the lists and
+	 * the rb tree.
+	 */
+	struct nvgpu_mutex lock;
+};
+
 static u32 nvgpu_pd_cache_nr(u32 bytes)
 {
 	return ilog2((unsigned long)bytes >>
 		     ((unsigned long)NVGPU_PD_CACHE_MIN_SHIFT - 1UL));
 }
 
-static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
+static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
 {
-	u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
-
-	return mask_offset - 1U;
+	return PAGE_SIZE / pentry->pd_size;
 }
 
 int nvgpu_pd_cache_init(struct gk20a *g)
@@ -123,6 +194,7 @@ int nvgpu_pd_cache_init(struct gk20a *g)
 	}
 
 	g->mm.pd_cache = cache;
+	pd_dbg(g, "PD cache initialized!");
 
 	return 0;
@@ -151,8 +223,8 @@ void nvgpu_pd_cache_fini(struct gk20a *g)
  * Note: this does not need the cache lock since it does not modify any of the
  * PD cache data structures.
  */
-int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
-				struct nvgpu_gmmu_pd *pd, u32 bytes)
+static int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+				       struct nvgpu_gmmu_pd *pd, u32 bytes)
 {
 	int err;
 	unsigned long flags = 0;
@@ -225,7 +297,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 	 * This allocates the very first PD table in the set of tables in this
 	 * nvgpu_pd_mem_entry.
 	 */
-	pentry->alloc_map = 1;
+	set_bit(0, pentry->alloc_map);
+	pentry->allocs = 1;
 
 	/*
 	 * Now update the nvgpu_gmmu_pd to reflect this allocation.
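For reference, nvgpu_pd_cache_nr() above maps a PD size to its slab bucket by shifting out the 256-byte minimum and taking ilog2: 256B lands in bucket 0, 512B in bucket 1, and so on, which is why NVGPU_PD_CACHE_COUNT grows from 4 to 8 when PAGE_SIZE is 64K (buckets up through 32K PDs). A standalone sketch of that mapping follows; ilog2_ul is a userspace stand-in for the kernel's ilog2().

    #include <stdio.h>

    #define NVGPU_PD_CACHE_MIN_SHIFT  9U

    /* Userspace stand-in for the kernel's ilog2(). */
    static unsigned int ilog2_ul(unsigned long v)
    {
            unsigned int r = 0U;

            while (v >>= 1)
                    r++;
            return r;
    }

    /* Mirrors nvgpu_pd_cache_nr(): shift out the 256B minimum, then ilog2. */
    static unsigned int pd_cache_nr(unsigned int bytes)
    {
            return ilog2_ul((unsigned long)bytes >>
                            (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
    }

    int main(void)
    {
            unsigned int sz;

            /* 256 -> 0, 512 -> 1, 1024 -> 2, 2048 -> 3 (4K..32K on 64K kernels). */
            for (sz = 256U; sz <= 2048U; sz <<= 1)
                    printf("pd_size %4u -> bucket %u\n", sz, pd_cache_nr(sz));
            return 0;
    }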
@@ -247,22 +320,24 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 {
 	unsigned long bit_offs;
 	u32 mem_offs;
-	u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
+	u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
 
 	/*
 	 * Find and allocate an open PD.
 	 */
-	bit_offs = ffz(pentry->alloc_map);
+	bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
 	mem_offs = bit_offs * pentry->pd_size;
 
+	pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu nr_bits=%u src=0x%p",
+		bit_offs, nr_bits, pentry);
+
 	/* Bit map full. Somethings wrong. */
-	if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
+	if (WARN_ON(bit_offs >= nr_bits)) {
 		return -ENOMEM;
 	}
 
-	pentry->alloc_map |= BIT64(bit_offs);
-
-	pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
+	set_bit((int)bit_offs, pentry->alloc_map);
+	pentry->allocs += 1U;
 
 	/*
 	 * First update the pd.
@@ -274,7 +349,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 	/*
 	 * Now make sure the pentry is in the correct list (full vs partial).
 	 */
-	if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
+	if (pentry->allocs >= nr_bits) {
 		pd_dbg(g, "Adding pentry to full list!");
 		nvgpu_list_del(&pentry->list_entry);
 		nvgpu_list_add(&pentry->list_entry,
@@ -369,7 +444,8 @@ int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
 	return err;
 }
 
-void nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
+static void nvgpu_pd_cache_free_direct(struct gk20a *g,
+				       struct nvgpu_gmmu_pd *pd)
 {
 	pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);
 
@@ -397,13 +473,13 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
 				   struct nvgpu_pd_mem_entry *pentry,
 				   struct nvgpu_gmmu_pd *pd)
 {
-	u32 index = pd->mem_offs / pentry->pd_size;
-	u32 bit = 1 << index;
+	u32 bit = pd->mem_offs / pentry->pd_size;
 
 	/* Mark entry as free. */
-	pentry->alloc_map &= ~bit;
+	clear_bit((int)bit, pentry->alloc_map);
+	pentry->allocs -= 1U;
 
-	if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
+	if (pentry->allocs > 0U) {
 		/*
 		 * Partially full still. If it was already on the partial list
 		 * this just re-adds it.
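The pd_cache.c changes above swap the single unsigned long alloc_map, which tracks at most 64 slots, for DECLARE_BITMAP() plus an allocs counter: a 64K page of 256B PDs has 256 slots to track, and the full/partial list decision becomes a plain counter comparison. A self-contained sketch of the resulting slot lifecycle, with array indexing standing in for set_bit()/clear_bit()/find_first_zero_bit():

    #include <stdio.h>

    #define PAGE_SZ   65536U
    #define PD_SZ     256U
    #define NR_SLOTS  (PAGE_SZ / PD_SZ)    /* 256 slots: too many for one long */

    static unsigned char map[NR_SLOTS];    /* stand-in for DECLARE_BITMAP() */
    static unsigned int allocs;            /* mirrors pentry->allocs */

    /* Find a free slot (find_first_zero_bit) and claim it (set_bit). */
    static int pd_alloc_slot(void)
    {
            unsigned int i;

            for (i = 0U; i < NR_SLOTS; i++) {
                    if (map[i] == 0U) {
                            map[i] = 1U;
                            allocs++;
                            return (int)(i * PD_SZ); /* byte offset in page */
                    }
            }
            return -1; /* bitmap full */
    }

    /* Release a slot by byte offset (clear_bit). */
    static void pd_free_slot(unsigned int mem_offs)
    {
            map[mem_offs / PD_SZ] = 0U;
            allocs--;
    }

    int main(void)
    {
            int a = pd_alloc_slot();
            int b = pd_alloc_slot();

            printf("offsets %d and %d; allocs=%u (full when allocs == %u)\n",
                   a, b, allocs, NR_SLOTS);
            pd_free_slot((unsigned int)a);
            printf("after free: allocs=%u -> back on the partial list\n", allocs);
            return 0;
    }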
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 662556576..64ab4a9a7 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -109,7 +109,7 @@ static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
 	struct gk20a *g = vm->mm->g;
 	int i;
 
-	nvgpu_pd_cache_free_direct(g, pdb);
+	nvgpu_pd_free(vm, pdb);
 
 	if (pdb->entries == NULL) {
 		return;
@@ -521,7 +521,7 @@ clean_up_allocators:
 	}
 clean_up_page_tables:
 	/* Cleans up nvgpu_gmmu_init_page_table() */
-	nvgpu_pd_cache_free_direct(g, &vm->pdb);
+	nvgpu_pd_free(vm, &vm->pdb);
 clean_up_vgpu_vm:
 #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
 	if (g->is_virtual)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 8e74ad5d5..7ff5a3382 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -364,7 +364,7 @@ int gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
 void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
+	u64 pdb_addr = nvgpu_pde_gpu_addr(g, &vm->pdb);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index 8746e0452..e85c92cc9 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -380,7 +380,7 @@ const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
 void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
+	u64 pdb_addr = nvgpu_pde_gpu_addr(g, &vm->pdb);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 
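Both init_pdb hunks above now take the PDB base from nvgpu_pde_gpu_addr(), which accounts for the PD's offset within a cached page now that the PDB can come from nvgpu_pd_alloc(), and split it into the lo/hi words the instance block expects. An illustrative sketch of that split, assuming the 12-bit shift returned by ram_in_base_shift_v() on these chips and a made-up, 4K-aligned address:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Hypothetical, 4K-aligned PDB address. */
            uint64_t pdb_addr = 0x1234567000ULL;

            /* u64_lo32(pdb_addr >> ram_in_base_shift_v()), shift == 12 */
            uint32_t pdb_addr_lo = (uint32_t)(pdb_addr >> 12);
            /* u64_hi32(pdb_addr) */
            uint32_t pdb_addr_hi = (uint32_t)(pdb_addr >> 32);

            printf("pdb_addr_lo=0x%x pdb_addr_hi=0x%x\n",
                   pdb_addr_lo, pdb_addr_hi);
            return 0;
    }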
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index ff17fcbda..3e496f758 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -49,73 +49,6 @@ enum gk20a_mem_rw_flag {
 	gk20a_mem_flag_write_only = 2,	/* WO */
 };
 
-/*
- * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
- * structure is of course depending on this. The MIN_SHIFT define is the right
- * number of bits to shift to determine which list to use in the array of lists.
- */
-#define NVGPU_PD_CACHE_MIN		256U
-#define NVGPU_PD_CACHE_MIN_SHIFT	9U
-#define NVGPU_PD_CACHE_COUNT		4U
-
-struct nvgpu_pd_mem_entry {
-	struct nvgpu_mem mem;
-
-	/*
-	 * Size of the page directories (not the mem). bmap is a bitmap showing
-	 * which PDs have been allocated. The size of mem will always be one
-	 * page. pd_size will always be a power of 2.
-	 */
-	u32 pd_size;
-	unsigned long alloc_map;
-
-	struct nvgpu_list_node list_entry;
-	struct nvgpu_rbtree_node tree_entry;
-};
-
-static inline struct nvgpu_pd_mem_entry *
-nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
-{
-	return (struct nvgpu_pd_mem_entry *)
-		((uintptr_t)node -
-		 offsetof(struct nvgpu_pd_mem_entry, list_entry));
-};
-
-static inline struct nvgpu_pd_mem_entry *
-nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
-{
-	return (struct nvgpu_pd_mem_entry *)
-		((uintptr_t)node -
-		 offsetof(struct nvgpu_pd_mem_entry, tree_entry));
-};
-
-/*
- * A cache for allocating PD memory from. This enables smaller PDs to be packed
- * into single pages.
- *
- * This is fairly complex so see the documentation in pd_cache.c for a full
- * description of how this is organized.
- */
-struct nvgpu_pd_cache {
-	/*
-	 * Array of lists of full nvgpu_pd_mem_entries and partially full (or
-	 * empty) nvgpu_pd_mem_entries.
-	 */
-	struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
-	struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
-
-	/*
-	 * Tree of all allocated struct nvgpu_mem's for fast look up.
-	 */
-	struct nvgpu_rbtree_node *mem_tree;
-
-	/*
-	 * All access to the cache much be locked. This protects the lists and
-	 * the rb tree.
-	 */
-	struct nvgpu_mutex lock;
-};
-
 /*
  * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
  * in the GMMU.
@@ -253,11 +186,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
 int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
 void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
-int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
-	struct nvgpu_gmmu_pd *pd, u32 bytes);
-void nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
 int nvgpu_pd_cache_init(struct gk20a *g);
 void nvgpu_pd_cache_fini(struct gk20a *g);
+u64 nvgpu_pde_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
 
 /*
  * Some useful routines that are shared across chips.
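A closing note on the header cleanup above: beyond hiding the cache internals in pd_cache.c, the removed single-word bitmap could not have supported 64K pages at all. The old mask helper computed 1 << (PAGE_SIZE / pd_size), which is undefined behavior once a page holds 32 or more PDs, and an unsigned long caps the map at 64 slots regardless. The arithmetic, in standard C:

    #include <stdio.h>

    int main(void)
    {
            unsigned int page_size = 65536U;  /* 64K kernel page */
            unsigned int pd_size = 256U;      /* smallest Pascal+ PD */
            unsigned int slots = page_size / pd_size;

            printf("slots per page: %u\n", slots);            /* 256 */
            printf("bits in an unsigned long: %zu\n",
                   8U * sizeof(unsigned long));               /* usually 64 */
            /*
             * The removed helper would have evaluated 1 << slots, here
             * 1 << 256: undefined behavior. Hence DECLARE_BITMAP() plus an
             * explicit allocation counter in the new pd_cache code.
             */
            return 0;
    }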