diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index 062916001..ec1bc095d 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -25,115 +25,26 @@ #include "gk20a/gk20a.h" #include "gk20a/mm_gk20a.h" -#define gmmu_dbg(g, fmt, args...) \ - nvgpu_log(g, gpu_dbg_map, fmt, ##args) -#define gmmu_dbg_v(g, fmt, args...) \ - nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) +#define __gmmu_dbg(g, attrs, fmt, args...) \ + do { \ + if (attrs->debug) \ + nvgpu_info(g, fmt, ##args); \ + else \ + nvgpu_log(g, gpu_dbg_map, fmt, ##args); \ + } while (0) -static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) -{ - return nvgpu_mem_begin(g, &entry->mem); -} +#define __gmmu_dbg_v(g, attrs, fmt, args...) \ + do { \ + if (attrs->debug) \ + nvgpu_info(g, fmt, ##args); \ + else \ + nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \ + } while (0) -static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) -{ - nvgpu_mem_end(g, &entry->mem); -} - -static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, - struct gk20a_mm_entry *entry) -{ - struct gk20a *g = gk20a_from_vm(vm); - u32 num_pages = 1 << order; - u32 len = num_pages * PAGE_SIZE; - int err; - - err = nvgpu_dma_alloc(g, len, &entry->mem); - - if (err) { - nvgpu_err(g, "memory allocation failed"); - return -ENOMEM; - } - - return 0; -} - -void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, - struct gk20a_mm_entry *entry) -{ - struct gk20a *g = gk20a_from_vm(vm); - - if (!entry->mem.size) - return; - - if (entry->woffset) /* fake shadow mem */ - return; - - nvgpu_dma_free(g, &entry->mem); -} - -/* - * Allocate a phys contig region big enough for a full - * sized gmmu page table for the given gmmu_page_size. - * the whole range is zeroed so it's "invalid"/will fault. - * - * If a previous entry is supplied, its memory will be used for - * suballocation for this next entry too, if there is space. - */ -int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, - enum gmmu_pgsz_gk20a pgsz_idx, - const struct gk20a_mmu_level *l, - struct gk20a_mm_entry *entry, - struct gk20a_mm_entry *prev_entry) -{ - int err = -ENOMEM; - int order; - struct gk20a *g = gk20a_from_vm(vm); - u32 bytes; - - /* allocate enough pages for the table */ - order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1; - order += ilog2(l->entry_size); - bytes = 1 << order; - order -= PAGE_SHIFT; - if (order < 0 && prev_entry) { - /* try to suballocate from previous chunk */ - u32 capacity = prev_entry->mem.size / bytes; - u32 prev = prev_entry->woffset * sizeof(u32) / bytes; - u32 free = capacity - prev - 1; - - nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d", - capacity, prev, free, bytes); - - if (free) { - memcpy(&entry->mem, &prev_entry->mem, - sizeof(entry->mem)); - entry->woffset = prev_entry->woffset - + bytes / sizeof(u32); - err = 0; - } - } - - if (err) { - /* no suballoc space */ - order = max(0, order); - err = nvgpu_alloc_gmmu_pages(vm, order, entry); - entry->woffset = 0; - } - - nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x", - entry, - (entry->mem.priv.sgt && - entry->mem.aperture == APERTURE_SYSMEM) ? 
- g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0, - order, entry->woffset); - if (err) - return err; - entry->pgsz = pgsz_idx; - entry->mem.skip_wmb = true; - - return err; -} +static int pd_allocate(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pd, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_attrs *attrs); /* * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU @@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) nvgpu_mutex_release(&vm->update_gmmu_lock); } -static int update_gmmu_level_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - enum gmmu_pgsz_gk20a pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u64 gpu_va, u64 gpu_end, - u8 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, - bool sparse, - int lvl, - bool priv, - enum nvgpu_aperture aperture) +int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) +{ + /* + * Need this just for page size. Everything else can be ignored. Also + * note that we can just use pgsz 0 (i.e small pages) since the number + * of bits present in the top level PDE are the same for small/large + * page VMs. + */ + struct nvgpu_gmmu_attrs attrs = { + .pgsz = 0, + }; + + return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); +} + + +/* + * Ensure that there's a CPU mapping for the page directory memory. This won't + * always be the case for 32 bit systems since we may need to save kernel + * virtual memory. + */ +static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) +{ + return nvgpu_mem_begin(g, &entry->mem); +} + +/* + * Handle any necessary CPU unmap semantics for a page directories DMA memory. + * For 64 bit platforms this is a noop. + */ +static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) +{ + nvgpu_mem_end(g, &entry->mem); +} + +static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, + struct nvgpu_gmmu_pd *pd) { struct gk20a *g = gk20a_from_vm(vm); - const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; - const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; + unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; + int err; + + /* + * On arm32 vmalloc space is a precious commodity so we do not map pages + * by default. + */ + if (!IS_ENABLED(CONFIG_ARM64)) + flags |= NVGPU_DMA_NO_KERNEL_MAPPING; + + err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); + if (err) + return -ENOMEM; + + return 0; +} + +void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pd) +{ + struct gk20a *g = gk20a_from_vm(vm); + + nvgpu_dma_free(g, &pd->mem); +} + +/* + * Return the _physical_ address of a page directory. + */ +u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) +{ + if (g->mm.has_physical_mode) + return sg_phys(pd->mem.priv.sgt->sgl); + else + return nvgpu_mem_get_base_addr(g, &pd->mem, 0); +} + +/* + * Return the aligned length based on the page size in attrs. + */ +static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length, + struct nvgpu_gmmu_attrs *attrs) +{ + u64 page_size = vm->gmmu_page_sizes[attrs->pgsz]; + + return ALIGN(length, page_size); +} + +static u32 pd_entries(const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_attrs *attrs) +{ + /* + * Number of entries in a PD is easy to compute from the number of bits + * used to index the page directory. That is simply 2 raised to the + * number of bits. 
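+	 *
+	 * As a worked example (the bit values here are only illustrative): a
+	 * level with hi_bit = 37 and lo_bit = 29 yields 1 << (37 - 29 + 1),
+	 * i.e. 512 entries.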
+ */ + return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL); +} + +/* + * Computes the size of a PD table. + */ +static u32 pd_size(const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_attrs *attrs) +{ + return pd_entries(l, attrs) * l->entry_size; +} + +/* + * Allocate a physically contiguous region big enough for a gmmu page table + * of the specified level and page size. The whole range is zeroed so that any + * accesses will fault until proper values are programmed. + */ +static int pd_allocate(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pd, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_attrs *attrs) +{ + int err; + + if (pd->mem.size) + return 0; + + err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); + if (err) { + nvgpu_info(vm->mm->g, "error allocating page directory!"); + return err; + } + + /* + * One mb() is done after all mapping operations. Don't need individual + * barriers for each PD write. + */ + pd->mem.skip_wmb = true; + + return 0; +} + +/* + * Compute what page directory index at the passed level the passed virtual + * address corresponds to. @attrs is necessary for determining the page size + * which is used to pick the right bit offsets for the GMMU level. + */ +static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt, + struct nvgpu_gmmu_attrs *attrs) +{ + u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL; + u32 pd_shift = (u64)l->lo_bit[attrs->pgsz]; + + /* + * For convenience we don't bother computing the lower bound of the + * mask; it's easier to just shift it off. + */ + return (virt & pd_mask) >> pd_shift; +} + +static int pd_allocate_children(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + struct nvgpu_gmmu_attrs *attrs) +{ + struct gk20a *g = gk20a_from_vm(vm); + + if (pd->entries) + return 0; + + pd->num_entries = pd_entries(l, attrs); + pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) * + pd->num_entries); + if (!pd->entries) + return -ENOMEM; + + return 0; +} + +/* + * This function programs the GMMU based on two ranges: a physical range and a + * GPU virtual range. The virtual is mapped to the physical. Physical in this + * case can mean either a real physical sysmem address or a IO virtual address + * (for instance when a system has an IOMMU running). + * + * The rest of the parameters are for describing the actual mapping itself. + * + * This function recursively calls itself for handling PDEs. At the final level + * a PTE handler is called. The phys and virt ranges are adjusted for each + * recursion so that each invocation of this function need only worry about the + * range it is passed. + * + * phys_addr will always point to a contiguous range - the discontiguous nature + * of DMA buffers is taken care of at the layer above this. + */ +static int __set_pd_level(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pd, + int lvl, + u64 phys_addr, + u64 virt_addr, u64 length, + struct nvgpu_gmmu_attrs *attrs) +{ int err = 0; - u32 pde_i; - u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; - struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; + u64 pde_range; + struct gk20a *g = gk20a_from_vm(vm); + struct nvgpu_gmmu_pd *next_pd = NULL; + const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; + const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1]; - gk20a_dbg_fn(""); + /* + * 5 levels for Pascal+. For pre-pascal we only have 2. This puts + * offsets into the page table debugging code which makes it easier to + * see what level prints are from. 
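+	 *
+	 * Each successive level uses a slightly longer indent string so that
+	 * nested level prints line up underneath their parent level.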
+ */ + static const char *__lvl_debug[] = { + "", /* L=0 */ + " ", /* L=1 */ + " ", /* L=2 */ + " ", /* L=3 */ + " ", /* L=4 */ + }; - pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) - >> (u64)l->lo_bit[pgsz_idx]; + pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz]; - gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", - pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); + __gmmu_dbg_v(g, attrs, + "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx", + lvl, + __lvl_debug[lvl], + virt_addr, + length, + phys_addr); - while (gpu_va < gpu_end) { - u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); + /* + * Iterate across the mapping in chunks the size of this level's PDE. + * For each of those chunks program our level's PDE and then, if there's + * a next level, program the next level's PDEs/PTEs. + */ + while (length) { + u32 pd_idx = pd_index(l, virt_addr, attrs); + u64 chunk_size; + u64 target_addr; - /* Allocate next level */ + /* + * Truncate the pde_range when the virtual address does not + * start at a PDE boundary. + */ + chunk_size = min(length, + pde_range - (virt_addr & (pde_range - 1))); + + /* + * If the next level has an update_entry function then we know + * that _this_ level points to PDEs (not PTEs). Thus we need to + * have a bunch of children PDs. + */ if (next_l->update_entry) { - if (!pte->entries) { - int num_entries = - 1 << - (l->hi_bit[pgsz_idx] - - l->lo_bit[pgsz_idx] + 1); - pte->entries = - nvgpu_vzalloc(g, - sizeof(struct gk20a_mm_entry) * - num_entries); - if (!pte->entries) - return -ENOMEM; - pte->pgsz = pgsz_idx; - pte->num_entries = num_entries; - } - prev_pte = next_pte; - next_pte = pte->entries + pde_i; + if (pd_allocate_children(vm, l, pd, attrs)) + return -ENOMEM; - if (!next_pte->mem.size) { - err = nvgpu_zalloc_gmmu_page_table(vm, - pgsz_idx, next_l, next_pte, prev_pte); - if (err) - return err; - } + /* + * Get the next PD so that we know what to put in this + * current PD. If the next level is actually PTEs then + * we don't need this - we will just use the real + * physical target. + */ + next_pd = &pd->entries[pd_idx]; + + /* + * Allocate the backing memory for next_pd. + */ + if (pd_allocate(vm, next_pd, next_l, attrs)) + return -ENOMEM; } - err = l->update_entry(vm, pte, pde_i, pgsz_idx, - sgl, offset, iova, - kind_v, ctag, cacheable, unmapped_pte, - rw_flag, sparse, priv, aperture); - if (err) - return err; + /* + * This is the address we want to program into the actual PDE/ + * PTE. When the next level is PDEs we need the target address + * to be the table of PDEs. When the next level is PTEs the + * target addr is the real physical address we are aiming for. + */ + target_addr = next_pd ? 
nvgpu_pde_phys_addr(g, next_pd) : + phys_addr; + + l->update_entry(vm, l, + pd, pd_idx, + virt_addr, + target_addr, + attrs); if (next_l->update_entry) { - /* get cpu access to the ptes */ - err = map_gmmu_pages(g, next_pte); + err = map_gmmu_pages(g, next_pd); if (err) { nvgpu_err(g, - "couldn't map ptes for update as=%d", - vm_aspace_id(vm)); + "couldn't map ptes for update as=%d", + vm_aspace_id(vm)); return err; } - err = update_gmmu_level_locked(vm, next_pte, - pgsz_idx, - sgl, - offset, - iova, - gpu_va, - next, - kind_v, ctag, cacheable, unmapped_pte, - rw_flag, sparse, lvl+1, priv, aperture); - unmap_gmmu_pages(g, next_pte); + + err = __set_pd_level(vm, next_pd, + lvl + 1, + phys_addr, + virt_addr, + chunk_size, + attrs); + unmap_gmmu_pages(g, next_pd); if (err) return err; } - pde_i++; - gpu_va = next; + virt_addr += chunk_size; + + /* + * Only add to phys_addr if it's non-zero. A zero value implies + * we are unmapping as as a result we don't want to place + * non-zero phys addresses in the PTEs. A non-zero phys-addr + * would also confuse the lower level PTE programming code. + */ + if (phys_addr) + phys_addr += chunk_size; + length -= chunk_size; } - gk20a_dbg_fn("done"); + __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!"); + + return 0; +} + +/* + * VIDMEM version of the update_ptes logic. + */ +static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, + struct sg_table *sgt, + u64 space_to_skip, + u64 virt_addr, + u64 length, + struct nvgpu_gmmu_attrs *attrs) +{ + struct nvgpu_page_alloc *alloc = NULL; + struct page_alloc_chunk *chunk = NULL; + u64 phys_addr, chunk_length; + int err = 0; + + if (!sgt) { + /* + * This is considered an unmap. Just pass in 0 as the physical + * address for the entire GPU range. + */ + err = __set_pd_level(vm, &vm->pdb, + 0, + 0, + virt_addr, length, + attrs); + return err; + } + + alloc = get_vidmem_page_alloc(sgt->sgl); + + /* + * Otherwise iterate across all the chunks in this allocation and + * map them. + */ + nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, + page_alloc_chunk, list_entry) { + if (space_to_skip && + space_to_skip >= chunk->length) { + space_to_skip -= chunk->length; + continue; + } + + phys_addr = chunk->base + space_to_skip; + chunk_length = min(length, (chunk->length - space_to_skip)); + + err = __set_pd_level(vm, &vm->pdb, + 0, + phys_addr, + virt_addr, length, + attrs); + if (err) + break; + + /* Space has been skipped so zero this for future chunks. */ + space_to_skip = 0; + + /* + * Update the map pointer and the remaining length. + */ + virt_addr += chunk_length; + length -= chunk_length; + + if (length == 0) + break; + } + + return err; +} + +static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, + struct sg_table *sgt, + u64 space_to_skip, + u64 virt_addr, + u64 length, + struct nvgpu_gmmu_attrs *attrs) +{ + int err; + struct scatterlist *sgl; + struct gk20a *g = gk20a_from_vm(vm); + + if (!sgt) { + /* + * This is considered an unmap. Just pass in 0 as the physical + * address for the entire GPU range. + */ + err = __set_pd_level(vm, &vm->pdb, + 0, + 0, + virt_addr, length, + attrs); + return err; + } + + /* + * At this point we have a Linux scatter-gather list pointing to some + * number of discontiguous chunks of memory. Iterate over that list and + * generate a GMMU map call for each chunk. There are two possibilities: + * either the IOMMU is enabled or not. 
When the IOMMU is enabled the + * mapping is simple since the "physical" address is actually a virtual + * IO address and will be contiguous. The no-IOMMU case is more + * complicated. We will have to iterate over the SGT and do a separate + * map for each chunk of the SGT. + */ + sgl = sgt->sgl; + + if (!g->mm.bypass_smmu) { + u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0); + + io_addr += space_to_skip; + + err = __set_pd_level(vm, &vm->pdb, + 0, + io_addr, + virt_addr, + length, + attrs); + + return err; + } + + /* + * Finally: last possible case: do the no-IOMMU mapping. In this case we + * really are mapping physical pages directly. + */ + while (sgl) { + u64 phys_addr; + u64 chunk_length; + + /* + * Cut out sgl ents for space_to_skip. + */ + if (space_to_skip && space_to_skip >= sgl->length) { + space_to_skip -= sgl->length; + sgl = sg_next(sgl); + continue; + } + + phys_addr = sg_phys(sgl) + space_to_skip; + chunk_length = min(length, sgl->length - space_to_skip); + + err = __set_pd_level(vm, &vm->pdb, + 0, + phys_addr, + virt_addr, + chunk_length, + attrs); + if (err) + return err; + + space_to_skip = 0; + virt_addr += chunk_length; + length -= chunk_length; + sgl = sg_next(sgl); + + if (length == 0) + break; + } return 0; } @@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, * physical* address. * * The update of each level of the page tables is farmed out to chip specific - * implementations. But the logic around that is generic to all chips. Every chip - * has some number of PDE levels and then a PTE level. + * implementations. But the logic around that is generic to all chips. Every + * chip has some number of PDE levels and then a PTE level. * * Each chunk of the incoming SGT is sent to the chip specific implementation * of page table update. @@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, * [*] Note: the "physical" address may actually be an IO virtual address in the * case of SMMU usage. */ -static int update_gmmu_ptes_locked(struct vm_gk20a *vm, - enum gmmu_pgsz_gk20a pgsz_idx, - struct sg_table *sgt, - u64 buffer_offset, - u64 gpu_va, u64 gpu_end, - u8 kind_v, u32 ctag_offset, - bool cacheable, bool unmapped_pte, - int rw_flag, - bool sparse, - bool priv, - enum nvgpu_aperture aperture) +static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, + struct sg_table *sgt, + u64 space_to_skip, + u64 virt_addr, + u64 length, + struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); - int ctag_granularity = g->ops.fb.compression_page_size(g); - u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; - u64 iova = 0; - u64 space_to_skip = buffer_offset; - u64 map_size = gpu_end - gpu_va; - u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; + u32 page_size; int err; - struct scatterlist *sgl = NULL; - struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; - u64 length; /* note: here we need to map kernel to small, since the * low-level mmu code assumes 0 is small and 1 is big pages */ - if (pgsz_idx == gmmu_page_size_kernel) - pgsz_idx = gmmu_page_size_small; + if (attrs->pgsz == gmmu_page_size_kernel) + attrs->pgsz = gmmu_page_size_small; + + page_size = vm->gmmu_page_sizes[attrs->pgsz]; if (space_to_skip & (page_size - 1)) return -EINVAL; + /* + * Update length to be aligned to the passed page size. 
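+	 *
+	 * For example, a 6KB request backed by 4KB pages ends up programming
+	 * 8KB worth of PTEs - the mapped length is always a whole number of
+	 * pages.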
+ */ + length = nvgpu_align_map_length(vm, length, attrs); + err = map_gmmu_pages(g, &vm->pdb); if (err) { - nvgpu_err(g, - "couldn't map ptes for update as=%d", - vm_aspace_id(vm)); + nvgpu_err(g, "couldn't map ptes for update as=%d", + vm_aspace_id(vm)); return err; } - if (aperture == APERTURE_VIDMEM) { - gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", - pgsz_idx, gpu_va, gpu_end-1); + __gmmu_dbg(g, attrs, + "vm=%s " + "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " + "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " + "kind=%#02x APT=%-6s %c%c%c", + vm->name, + sgt ? "MAP" : "UNMAP", + virt_addr, + length, + sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, + space_to_skip, + page_size >> 10, + nvgpu_gmmu_perm_str(attrs->rw_flag), + attrs->kind_v, + nvgpu_aperture_str(attrs->aperture), + attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */ + attrs->sparse ? 'S' : '-', + attrs->priv ? 'P' : '-'); - if (sgt) { - alloc = get_vidmem_page_alloc(sgt->sgl); - - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - if (space_to_skip && - space_to_skip > chunk->length) { - space_to_skip -= chunk->length; - } else { - iova = chunk->base + space_to_skip; - length = chunk->length - space_to_skip; - length = min(length, map_size); - space_to_skip = 0; - - err = update_gmmu_level_locked(vm, - &vm->pdb, pgsz_idx, - &sgl, - &space_to_skip, - &iova, - gpu_va, gpu_va + length, - kind_v, &ctag, - cacheable, unmapped_pte, - rw_flag, sparse, 0, priv, - aperture); - if (err) - break; - - /* need to set explicit zero here */ - space_to_skip = 0; - gpu_va += length; - map_size -= length; - - if (!map_size) - break; - } - } - } else { - err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, - &sgl, - &space_to_skip, - &iova, - gpu_va, gpu_end, - kind_v, &ctag, - cacheable, unmapped_pte, rw_flag, - sparse, 0, priv, - aperture); - } - } else { - gmmu_dbg_v(g, - "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx " - "buffer offset: %-4lld, nents: %d", - page_size, - gpu_va, gpu_end - gpu_va, - sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, - buffer_offset, - sgt ? sgt->nents : 0); - - if (sgt) { - iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0); - if (!vm->mm->bypass_smmu && iova) { - iova += space_to_skip; - } else { - sgl = sgt->sgl; - - gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", - (u64)sg_phys(sgl), - sgl->length); - - while (space_to_skip && sgl && - space_to_skip + page_size > sgl->length) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); - gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", - (u64)sg_phys(sgl), - sgl->length); - } - - iova = sg_phys(sgl) + space_to_skip; - } - } - - err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, - &sgl, - &space_to_skip, - &iova, - gpu_va, gpu_end, - kind_v, &ctag, - cacheable, unmapped_pte, rw_flag, - sparse, 0, priv, - aperture); - } + /* + * Handle VIDMEM progamming. Currently uses a different scatter list + * format. + */ + if (attrs->aperture == APERTURE_VIDMEM) + err = __nvgpu_gmmu_update_page_table_vidmem(vm, + sgt, + space_to_skip, + virt_addr, + length, + attrs); + else + err = __nvgpu_gmmu_update_page_table_sysmem(vm, + sgt, + space_to_skip, + virt_addr, + length, + attrs); unmap_gmmu_pages(g, &vm->pdb); - mb(); - gk20a_dbg_fn("done"); + __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); return err; } @@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm, * have the update_gmmu_lock aquired. 
*/ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, - u64 map_offset, - struct sg_table *sgt, - u64 buffer_offset, - u64 size, - int pgsz_idx, - u8 kind_v, - u32 ctag_offset, - u32 flags, - int rw_flag, - bool clear_ctags, - bool sparse, - bool priv, - struct vm_gk20a_mapping_batch *batch, - enum nvgpu_aperture aperture) + u64 vaddr, + struct sg_table *sgt, + u64 buffer_offset, + u64 size, + int pgsz_idx, + u8 kind_v, + u32 ctag_offset, + u32 flags, + int rw_flag, + bool clear_ctags, + bool sparse, + bool priv, + struct vm_gk20a_mapping_batch *batch, + enum nvgpu_aperture aperture) { + struct gk20a *g = gk20a_from_vm(vm); int err = 0; bool allocated = false; - struct gk20a *g = gk20a_from_vm(vm); int ctag_granularity = g->ops.fb.compression_page_size(g); - u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); + struct nvgpu_gmmu_attrs attrs = { + .pgsz = pgsz_idx, + .kind_v = kind_v, + .ctag = (u64)ctag_offset * (u64)ctag_granularity, + .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, + .rw_flag = rw_flag, + .sparse = sparse, + .priv = priv, + .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE), + .aperture = aperture + }; - /* Allocate (or validate when map_offset != 0) the virtual address. */ - if (!map_offset) { - map_offset = __nvgpu_vm_alloc_va(vm, size, - pgsz_idx); - if (!map_offset) { + /* + * Only allocate a new GPU VA range if we haven't already been passed a + * GPU VA range. This facilitates fixed mappings. + */ + if (!vaddr) { + vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx); + if (!vaddr) { nvgpu_err(g, "failed to allocate va space"); err = -ENOMEM; goto fail_alloc; @@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, allocated = true; } - gmmu_dbg(g, - "gv: 0x%04x_%08x + 0x%-7llx " - "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] " - "pgsz=%-3dKb as=%-2d ctags=%d start=%d " - "kind=0x%x flags=0x%x apt=%s", - u64_hi32(map_offset), u64_lo32(map_offset), size, - sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0, - sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0, - sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0, - sgt ? 
u64_lo32((u64)sg_phys(sgt->sgl)) : 0, - vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), - ctag_lines, ctag_offset, - kind_v, flags, nvgpu_aperture_str(aperture)); - - err = update_gmmu_ptes_locked(vm, pgsz_idx, - sgt, - buffer_offset, - map_offset, map_offset + size, - kind_v, - ctag_offset, - flags & - NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, - flags & - NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, - rw_flag, - sparse, - priv, - aperture); + err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, + vaddr, size, &attrs); if (err) { nvgpu_err(g, "failed to update ptes on map"); goto fail_validate; @@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, else batch->need_tlb_invalidate = true; - return map_offset; + return vaddr; fail_validate: if (allocated) - __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); + __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); fail_alloc: nvgpu_err(g, "%s: failed with err=%d", __func__, err); return 0; } void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, - u64 vaddr, - u64 size, - int pgsz_idx, - bool va_allocated, - int rw_flag, - bool sparse, - struct vm_gk20a_mapping_batch *batch) + u64 vaddr, + u64 size, + int pgsz_idx, + bool va_allocated, + int rw_flag, + bool sparse, + struct vm_gk20a_mapping_batch *batch) { int err = 0; struct gk20a *g = gk20a_from_vm(vm); + struct nvgpu_gmmu_attrs attrs = { + .pgsz = pgsz_idx, + .kind_v = 0, + .ctag = 0, + .cacheable = 0, + .rw_flag = rw_flag, + .sparse = sparse, + .priv = 0, + .valid = 0, + .aperture = APERTURE_INVALID, + }; if (va_allocated) { err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); @@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, } /* unmap here needs to know the page size we assigned at mapping */ - err = update_gmmu_ptes_locked(vm, - pgsz_idx, - NULL, /* n/a for unmap */ - 0, - vaddr, - vaddr + size, - 0, 0, false /* n/a for unmap */, - false, rw_flag, - sparse, 0, - APERTURE_INVALID); /* don't care for unmap */ + err = __nvgpu_gmmu_update_page_table(vm, NULL, 0, + vaddr, size, &attrs); if (err) nvgpu_err(g, "failed to update gmmu ptes on unmap"); - /* flush l2 so any dirty lines are written out *now*. - * also as we could potentially be switching this buffer - * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at - * some point in the future we need to invalidate l2. e.g. switching - * from a render buffer unmap (here) to later using the same memory - * for gmmu ptes. note the positioning of this relative to any smmu - * unmapping (below). 
*/ - if (!batch) { gk20a_mm_l2_flush(g, true); g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index 88622ecad..3aeba5001 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c @@ -36,7 +36,7 @@ int vm_aspace_id(struct vm_gk20a *vm) } static void nvgpu_vm_free_entries(struct vm_gk20a *vm, - struct gk20a_mm_entry *parent, + struct nvgpu_gmmu_pd *parent, int level) { int i; @@ -75,8 +75,6 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, /* Be certain we round up to page_size if needed */ size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); - nvgpu_log(g, gpu_dbg_map, "size=0x%llx @ pgsz=%dKB", size, - vm->gmmu_page_sizes[pgsz_idx] >> 10); addr = nvgpu_alloc(vma, size); if (!addr) { @@ -84,17 +82,14 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, return 0; } - nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); return addr; } int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, enum gmmu_pgsz_gk20a pgsz_idx) { - struct gk20a *g = vm->mm->g; struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; - nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); nvgpu_free(vma, addr); return 0; @@ -127,32 +122,6 @@ void nvgpu_vm_mapping_batch_finish(struct vm_gk20a *vm, nvgpu_mutex_release(&vm->update_gmmu_lock); } -static int nvgpu_vm_init_page_tables(struct vm_gk20a *vm) -{ - u32 pde_lo, pde_hi; - int err; - - pde_range_from_vaddr_range(vm, - 0, vm->va_limit-1, - &pde_lo, &pde_hi); - vm->pdb.entries = nvgpu_vzalloc(vm->mm->g, - sizeof(struct gk20a_mm_entry) * - (pde_hi + 1)); - vm->pdb.num_entries = pde_hi + 1; - - if (!vm->pdb.entries) - return -ENOMEM; - - err = nvgpu_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], - &vm->pdb, NULL); - if (err) { - nvgpu_vfree(vm->mm->g, vm->pdb.entries); - return err; - } - - return 0; -} - /* * Determine if the passed address space can support big pages or not. */ @@ -280,7 +249,8 @@ static int __nvgpu_vm_init(struct mm_gk20a *mm, #endif /* Initialize the page table data structures. 
*/ - err = nvgpu_vm_init_page_tables(vm); + strncpy(vm->name, name, min(strlen(name), sizeof(vm->name))); + err = nvgpu_gmmu_init_page_table(vm); if (err) goto clean_up_vgpu_vm; diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c index 3c76e8173..c5f9c1fd5 100644 --- a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c @@ -67,7 +67,7 @@ void gk20a_fb_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb) if (!g->power_on) return; - addr_lo = u64_lo32(gk20a_mem_get_base_addr(g, pdb, 0) >> 12); + addr_lo = u64_lo32(nvgpu_mem_get_base_addr(g, pdb, 0) >> 12); nvgpu_mutex_acquire(&g->mm.tlb_lock); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index b7b685750..558a1b063 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -777,31 +777,6 @@ int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm) return vm->mmu_levels[0].lo_bit[0]; } -/* given address range (inclusive) determine the pdes crossed */ -void pde_range_from_vaddr_range(struct vm_gk20a *vm, - u64 addr_lo, u64 addr_hi, - u32 *pde_lo, u32 *pde_hi) -{ - int pde_shift = gk20a_mm_pde_coverage_bit_count(vm); - - *pde_lo = (u32)(addr_lo >> pde_shift); - *pde_hi = (u32)(addr_hi >> pde_shift); - gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d", - addr_lo, addr_hi, pde_shift); - gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d", - *pde_lo, *pde_hi); -} - -static u32 pde_from_index(u32 i) -{ - return i * gmmu_pde__size_v() / sizeof(u32); -} - -static u32 pte_from_index(u32 i) -{ - return i * gmmu_pte__size_v() / sizeof(u32); -} - int nvgpu_vm_get_buffers(struct vm_gk20a *vm, struct nvgpu_mapped_buf ***mapped_buffers, int *num_buffers) @@ -1478,7 +1453,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) * If mem is in VIDMEM, return base address in vidmem * else return IOVA address for SYSMEM */ -u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, +u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, u32 flags) { struct nvgpu_page_alloc *alloc; @@ -1580,203 +1555,168 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); } -void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry, - size_t w, size_t data) -{ - nvgpu_mem_wr32(g, &entry->mem, entry->woffset + w, data); -} - -u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry) -{ - u64 base; - - if (g->mm.has_physical_mode) - base = sg_phys(entry->mem.priv.sgt->sgl); - else - base = gk20a_mem_get_base_addr(g, &entry->mem, 0); - - return base + entry->woffset * sizeof(u32); -} - /* for gk20a the "video memory" apertures here are misnomers. 
*/ static inline u32 big_valid_pde0_bits(struct gk20a *g, - struct gk20a_mm_entry *entry) + struct nvgpu_gmmu_pd *pd, u64 addr) { - u64 pte_addr = gk20a_pde_addr(g, entry); u32 pde0_bits = - nvgpu_aperture_mask(g, &entry->mem, + nvgpu_aperture_mask(g, &pd->mem, gmmu_pde_aperture_big_sys_mem_ncoh_f(), gmmu_pde_aperture_big_video_memory_f()) | gmmu_pde_address_big_sys_f( - (u32)(pte_addr >> gmmu_pde_address_shift_v())); + (u32)(addr >> gmmu_pde_address_shift_v())); return pde0_bits; } static inline u32 small_valid_pde1_bits(struct gk20a *g, - struct gk20a_mm_entry *entry) + struct nvgpu_gmmu_pd *pd, u64 addr) { - u64 pte_addr = gk20a_pde_addr(g, entry); u32 pde1_bits = - nvgpu_aperture_mask(g, &entry->mem, + nvgpu_aperture_mask(g, &pd->mem, gmmu_pde_aperture_small_sys_mem_ncoh_f(), gmmu_pde_aperture_small_video_memory_f()) | gmmu_pde_vol_small_true_f() | /* tbd: why? */ gmmu_pde_address_small_sys_f( - (u32)(pte_addr >> gmmu_pde_address_shift_v())); + (u32)(addr >> gmmu_pde_address_shift_v())); return pde1_bits; } -/* Given the current state of the ptes associated with a pde, - determine value and write it out. There's no checking - here to determine whether or not a change was actually - made. So, superfluous updates will cause unnecessary - pde invalidations. -*/ -static int update_gmmu_pde_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - u32 i, u32 gmmu_pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u32 kind_v, u64 *ctag, - bool cacheable, bool unammped_pte, - int rw_flag, bool sparse, bool priv, - enum nvgpu_aperture aperture) +static void update_gmmu_pde_locked(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + u32 pd_idx, + u64 virt_addr, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); bool small_valid, big_valid; - struct gk20a_mm_entry *entry = vm->pdb.entries + i; + u32 pd_offset = pd_offset_from_index(l, pd_idx); u32 pde_v[2] = {0, 0}; - u32 pde; - gk20a_dbg_fn(""); - - small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; - big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; + small_valid = attrs->pgsz == gmmu_page_size_small; + big_valid = attrs->pgsz == gmmu_page_size_big; pde_v[0] = gmmu_pde_size_full_f(); pde_v[0] |= big_valid ? - big_valid_pde0_bits(g, entry) : + big_valid_pde0_bits(g, pd, phys_addr) : gmmu_pde_aperture_big_invalid_f(); - pde_v[1] |= (small_valid ? - small_valid_pde1_bits(g, entry) : + pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) : (gmmu_pde_aperture_small_invalid_f() | gmmu_pde_vol_small_false_f())) - | - (big_valid ? (gmmu_pde_vol_big_true_f()) : - gmmu_pde_vol_big_false_f()); + | + (big_valid ? (gmmu_pde_vol_big_true_f()) : + gmmu_pde_vol_big_false_f()); - pde = pde_from_index(i); + pte_dbg(g, attrs, + "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | " + "GPU %#-12llx phys %#-12llx " + "[0x%08x, 0x%08x]", + pd_idx, l->entry_size, pd_offset, + small_valid ? 'S' : '-', + big_valid ? 
'B' : '-', + virt_addr, phys_addr, + pde_v[1], pde_v[0]); - gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]); - gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]); - - gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", - i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); - return 0; + pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]); + pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]); } -static int update_gmmu_pte_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - u32 i, u32 gmmu_pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u32 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, bool sparse, bool priv, - enum nvgpu_aperture aperture) +static void __update_pte_sparse(u32 *pte_w) +{ + pte_w[0] = gmmu_pte_valid_false_f(); + pte_w[1] |= gmmu_pte_vol_true_f(); +} + +static void __update_pte(struct vm_gk20a *vm, + u32 *pte_w, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); + u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; + u32 pte_valid = attrs->valid ? + gmmu_pte_valid_true_f() : + gmmu_pte_valid_false_f(); + u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v(); + u32 addr = attrs->aperture == APERTURE_SYSMEM ? + gmmu_pte_address_sys_f(phys_shifted) : + gmmu_pte_address_vid_f(phys_shifted); int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); - u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; - u32 pte_w[2] = {0, 0}; /* invalid pte */ - if (*iova) { - u32 pte_valid = unmapped_pte ? - gmmu_pte_valid_false_f() : - gmmu_pte_valid_true_f(); - u32 iova_v = *iova >> gmmu_pte_address_shift_v(); - u32 pte_addr = aperture == APERTURE_SYSMEM ? - gmmu_pte_address_sys_f(iova_v) : - gmmu_pte_address_vid_f(iova_v); + pte_w[0] = pte_valid | addr; - pte_w[0] = pte_valid | pte_addr; + if (attrs->priv) + pte_w[0] |= gmmu_pte_privilege_true_f(); - if (priv) - pte_w[0] |= gmmu_pte_privilege_true_f(); + pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture, + gmmu_pte_aperture_sys_mem_ncoh_f(), + gmmu_pte_aperture_video_memory_f()) | + gmmu_pte_kind_f(attrs->kind_v) | + gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift)); - pte_w[1] = __nvgpu_aperture_mask(g, aperture, - gmmu_pte_aperture_sys_mem_ncoh_f(), - gmmu_pte_aperture_video_memory_f()) | - gmmu_pte_kind_f(kind_v) | - gmmu_pte_comptagline_f((u32)(*ctag >> ctag_shift)); + if (attrs->ctag && vm->mm->use_full_comp_tag_line && + phys_addr & 0x10000) + pte_w[1] |= gmmu_pte_comptagline_f( + 1 << (gmmu_pte_comptagline_s() - 1)); - if (*ctag && vm->mm->use_full_comp_tag_line && *iova & 0x10000) - pte_w[1] |= gmmu_pte_comptagline_f( - 1 << (gmmu_pte_comptagline_s() - 1)); + if (attrs->rw_flag == gk20a_mem_flag_read_only) { + pte_w[0] |= gmmu_pte_read_only_true_f(); + pte_w[1] |= gmmu_pte_write_disable_true_f(); + } else if (attrs->rw_flag == gk20a_mem_flag_write_only) { + pte_w[1] |= gmmu_pte_read_disable_true_f(); + } - if (rw_flag == gk20a_mem_flag_read_only) { - pte_w[0] |= gmmu_pte_read_only_true_f(); - pte_w[1] |= - gmmu_pte_write_disable_true_f(); - } else if (rw_flag == - gk20a_mem_flag_write_only) { - pte_w[1] |= - gmmu_pte_read_disable_true_f(); - } - if (!unmapped_pte) { - if (!cacheable) - pte_w[1] |= - gmmu_pte_vol_true_f(); - } else { - /* Store cacheable value behind - * gmmu_pte_write_disable_true_f */ - if (!cacheable) - pte_w[1] |= - gmmu_pte_write_disable_true_f(); - } - - gk20a_dbg(gpu_dbg_pte, - "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]", - i, *iova, - kind_v, (u32)(*ctag >> ctag_shift), !cacheable, - pte_w[1], pte_w[0]); - - if (*ctag) 
- *ctag += page_size; - } else if (sparse) { - pte_w[0] = gmmu_pte_valid_false_f(); + if (!attrs->cacheable) pte_w[1] |= gmmu_pte_vol_true_f(); - } else { - gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); - } - gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]); - gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]); + if (attrs->ctag) + attrs->ctag += page_size; +} - if (*iova) { - *iova += page_size; - *offset += page_size; - if (*sgl && *offset + page_size > (*sgl)->length) { - u64 new_iova; - *sgl = sg_next(*sgl); - if (*sgl) { - new_iova = sg_phys(*sgl); - gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", - new_iova, (*sgl)->length); - if (new_iova) { - *offset = 0; - *iova = new_iova; - } - } - } - } +static void update_gmmu_pte_locked(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + u32 pd_idx, + u64 virt_addr, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) +{ + struct gk20a *g = gk20a_from_vm(vm); + u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; + u32 pd_offset = pd_offset_from_index(l, pd_idx); + u32 pte_w[2] = {0, 0}; + int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); - return 0; + if (phys_addr) + __update_pte(vm, pte_w, phys_addr, attrs); + else if (attrs->sparse) + __update_pte_sparse(pte_w); + + pte_dbg(g, attrs, + "PTE: i=%-4u size=%-2u offs=%-4u | " + "GPU %#-12llx phys %#-12llx " + "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c " + "ctag=0x%08x " + "[0x%08x, 0x%08x]", + pd_idx, l->entry_size, pd_offset, + virt_addr, phys_addr, + page_size >> 10, + nvgpu_gmmu_perm_str(attrs->rw_flag), + attrs->kind_v, + nvgpu_aperture_str(attrs->aperture), + attrs->valid ? 'V' : '-', + attrs->cacheable ? 'C' : '-', + attrs->sparse ? 'S' : '-', + attrs->priv ? 'P' : '-', + (u32)attrs->ctag >> ctag_shift, + pte_w[1], pte_w[0]); + + pd_write(g, pd, pd_offset + 0, pte_w[0]); + pd_write(g, pd, pd_offset + 1, pte_w[1]); } /* NOTE! 
mapped_buffers lock must be held */ @@ -1809,13 +1749,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer, mapped_buffer->vm_area->sparse : false, batch); - gk20a_dbg(gpu_dbg_map, - "gv: 0x%04x_%08x pgsz=%-3dKb as=%-2d own_mem_ref=%d", - u64_hi32(mapped_buffer->addr), u64_lo32(mapped_buffer->addr), - vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10, - vm_aspace_id(vm), - mapped_buffer->own_mem_ref); - gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, mapped_buffer->sgt); @@ -1942,6 +1875,9 @@ int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch) if (err) ch->vm = NULL; + nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s", + ch->chid, vm->name); + return err; } @@ -2114,7 +2050,7 @@ u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block) if (g->mm.has_physical_mode) addr = gk20a_mem_phys(inst_block); else - addr = gk20a_mem_get_base_addr(g, inst_block, 0); + addr = nvgpu_mem_get_base_addr(g, inst_block, 0); return addr; } @@ -2237,7 +2173,7 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, struct vm_gk20a *vm) { - u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); + u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index cf37640d6..a245d0e0a 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -42,12 +42,6 @@ outer_flush_range(pa, pa + (size_t)(size)); \ } while (0) -enum gk20a_mem_rw_flag { - gk20a_mem_flag_none = 0, - gk20a_mem_flag_read_only = 1, - gk20a_mem_flag_write_only = 2, -}; - struct gpfifo_desc { struct nvgpu_mem mem; u32 entry_num; @@ -347,7 +341,7 @@ int gk20a_mm_suspend(struct gk20a *g); u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, u32 flags); u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); -u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, +u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, u32 flags); void gk20a_mm_ltc_isr(struct gk20a *g); @@ -371,10 +365,6 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem) return 0; } -void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry, - size_t w, size_t data); -u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry); - u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, u64 map_offset, struct sg_table *sgt, @@ -451,8 +441,4 @@ int gk20a_mm_get_buffer_info(struct device *dev, int dmabuf_fd, u64 *buffer_id, u64 *buffer_len); void gk20a_vm_unmap_locked_kref(struct kref *ref); -void gk20a_vm_free_entries(struct vm_gk20a *vm, - struct gk20a_mm_entry *parent, - int level); - #endif /* MM_GK20A_H */ diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index d7391c6d1..c3867e9d3 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c @@ -14,6 +14,7 @@ */ #include +#include #include "gk20a/gk20a.h" #include "gk20a/platform_gk20a.h" @@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); } -static u32 pde3_from_index(u32 i) -{ - return i * gmmu_new_pde__size_v() / sizeof(u32); -} - -static u32 pte3_from_index(u32 i) -{ - return i * gmmu_new_pte__size_v() / sizeof(u32); -} - -static 
int update_gmmu_pde3_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *parent, - u32 i, u32 gmmu_pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u32 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, bool sparse, bool priv, - enum nvgpu_aperture aperture) +static void update_gmmu_pde3_locked(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + u32 pd_idx, + u64 virt_addr, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); - u64 pte_addr = 0; - struct gk20a_mm_entry *pte = parent->entries + i; + u32 pd_offset = pd_offset_from_index(l, pd_idx); u32 pde_v[2] = {0, 0}; - u32 pde; - gk20a_dbg_fn(""); + phys_addr >>= gmmu_new_pde_address_shift_v(); - pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v(); - - pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem, + pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, gmmu_new_pde_aperture_sys_mem_ncoh_f(), gmmu_new_pde_aperture_video_memory_f()); - pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr)); + pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); pde_v[0] |= gmmu_new_pde_vol_true_f(); - pde_v[1] |= pte_addr >> 24; - pde = pde3_from_index(i); + pde_v[1] |= phys_addr >> 24; - gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]); - gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]); + pd_write(g, pd, pd_offset + 0, pde_v[0]); + pd_write(g, pd, pd_offset + 1, pde_v[1]); - gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", - i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); - gk20a_dbg_fn("done"); - return 0; + pte_dbg(g, attrs, + "PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | " + "GPU %#-12llx phys %#-12llx " + "[0x%08x, 0x%08x]", + pd_idx, l->entry_size, pd_offset, + virt_addr, phys_addr, + pde_v[1], pde_v[0]); } -static u32 pde0_from_index(u32 i) -{ - return i * gmmu_new_dual_pde__size_v() / sizeof(u32); -} - -static int update_gmmu_pde0_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - u32 i, u32 gmmu_pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u32 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, bool sparse, bool priv, - enum nvgpu_aperture aperture) +static void update_gmmu_pde0_locked(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + u32 pd_idx, + u64 virt_addr, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); bool small_valid, big_valid; - u32 pte_addr_small = 0, pte_addr_big = 0; - struct gk20a_mm_entry *entry = pte->entries + i; + u32 small_addr = 0, big_addr = 0; + u32 pd_offset = pd_offset_from_index(l, pd_idx); u32 pde_v[4] = {0, 0, 0, 0}; - u32 pde; - gk20a_dbg_fn(""); + small_valid = attrs->pgsz == gmmu_page_size_small; + big_valid = attrs->pgsz == gmmu_page_size_big; - small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; - big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; - - if (small_valid) { - pte_addr_small = gk20a_pde_addr(g, entry) - >> gmmu_new_dual_pde_address_shift_v(); - } + if (small_valid) + small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v(); if (big_valid) - pte_addr_big = gk20a_pde_addr(g, entry) - >> gmmu_new_dual_pde_address_big_shift_v(); + big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v(); if (small_valid) { - pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small); - pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem, + pde_v[2] |= + gmmu_new_dual_pde_address_small_sys_f(small_addr); + pde_v[2] |= 
nvgpu_aperture_mask(g, &pd->mem, gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), gmmu_new_dual_pde_aperture_small_video_memory_f()); pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); - pde_v[3] |= pte_addr_small >> 24; + pde_v[3] |= small_addr >> 24; } if (big_valid) { - pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big); + pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); - pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem, + pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), gmmu_new_dual_pde_aperture_big_video_memory_f()); - pde_v[1] |= pte_addr_big >> 28; + pde_v[1] |= big_addr >> 28; } - pde = pde0_from_index(i); + pd_write(g, pd, pd_offset + 0, pde_v[0]); + pd_write(g, pd, pd_offset + 1, pde_v[1]); + pd_write(g, pd, pd_offset + 2, pde_v[2]); + pd_write(g, pd, pd_offset + 3, pde_v[3]); - gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]); - gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]); - gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]); - gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]); - - gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]", - i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]); - gk20a_dbg_fn("done"); - return 0; + pte_dbg(g, attrs, + "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | " + "GPU %#-12llx phys %#-12llx " + "[0x%08x, 0x%08x, 0x%08x, 0x%08x]", + pd_idx, l->entry_size, pd_offset, + small_valid ? 'S' : '-', + big_valid ? 'B' : '-', + virt_addr, phys_addr, + pde_v[3], pde_v[2], pde_v[1], pde_v[0]); } -static int update_gmmu_pte_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - u32 i, u32 gmmu_pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u32 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, bool sparse, bool priv, - enum nvgpu_aperture aperture) +static void __update_pte(struct vm_gk20a *vm, + u32 *pte_w, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) +{ + struct gk20a *g = gk20a_from_vm(vm); + u64 ctag_granularity = g->ops.fb.compression_page_size(g); + u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; + u32 pte_valid = attrs->valid ? + gmmu_new_pte_valid_true_f() : + gmmu_new_pte_valid_false_f(); + u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v(); + u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ? 
+ gmmu_new_pte_address_sys_f(phys_shifted) : + gmmu_new_pte_address_vid_f(phys_shifted); + u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture, + gmmu_new_pte_aperture_sys_mem_ncoh_f(), + gmmu_new_pte_aperture_video_memory_f()); + + pte_w[0] = pte_valid | pte_addr | pte_tgt; + + if (attrs->priv) + pte_w[0] |= gmmu_new_pte_privilege_true_f(); + + pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) | + gmmu_new_pte_kind_f(attrs->kind_v) | + gmmu_new_pte_comptagline_f((u32)(attrs->ctag / + ctag_granularity)); + + if (attrs->rw_flag == gk20a_mem_flag_read_only) + pte_w[0] |= gmmu_new_pte_read_only_true_f(); + + if (!attrs->valid && !attrs->cacheable) + pte_w[0] |= gmmu_new_pte_read_only_true_f(); + else if (!attrs->cacheable) + pte_w[0] |= gmmu_new_pte_vol_true_f(); + + if (attrs->ctag) + attrs->ctag += page_size; + +} + +static void __update_pte_sparse(u32 *pte_w) +{ + pte_w[0] = gmmu_new_pte_valid_false_f(); + pte_w[0] |= gmmu_new_pte_vol_true_f(); +} + +static void update_gmmu_pte_locked(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + u32 pd_idx, + u64 virt_addr, + u64 phys_addr, + struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = vm->mm->g; - u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; - u64 ctag_granularity = g->ops.fb.compression_page_size(g); - u32 pte_w[2] = {0, 0}; /* invalid pte */ - u32 pte_i; + u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; + u32 pd_offset = pd_offset_from_index(l, pd_idx); + u32 pte_w[2] = {0, 0}; - if (*iova) { - u32 pte_valid = unmapped_pte ? - gmmu_new_pte_valid_false_f() : - gmmu_new_pte_valid_true_f(); - u32 iova_v = *iova >> gmmu_new_pte_address_shift_v(); - u32 pte_addr = aperture == APERTURE_SYSMEM ? - gmmu_new_pte_address_sys_f(iova_v) : - gmmu_new_pte_address_vid_f(iova_v); - u32 pte_tgt = __nvgpu_aperture_mask(g, aperture, - gmmu_new_pte_aperture_sys_mem_ncoh_f(), - gmmu_new_pte_aperture_video_memory_f()); + if (phys_addr) + __update_pte(vm, pte_w, phys_addr, attrs); + else if (attrs->sparse) + __update_pte_sparse(pte_w); - pte_w[0] = pte_valid | pte_addr | pte_tgt; + pte_dbg(g, attrs, + "vm=%s " + "PTE: i=%-4u size=%-2u offs=%-4u | " + "GPU %#-12llx phys %#-12llx " + "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c " + "ctag=0x%08x " + "[0x%08x, 0x%08x]", + vm->name, + pd_idx, l->entry_size, pd_offset, + virt_addr, phys_addr, + page_size >> 10, + nvgpu_gmmu_perm_str(attrs->rw_flag), + attrs->kind_v, + nvgpu_aperture_str(attrs->aperture), + attrs->valid ? 'V' : '-', + attrs->cacheable ? 'C' : '-', + attrs->sparse ? 'S' : '-', + attrs->priv ? 
'P' : '-', + (u32)attrs->ctag / g->ops.fb.compression_page_size(g), + pte_w[1], pte_w[0]); - if (priv) - pte_w[0] |= gmmu_new_pte_privilege_true_f(); - - pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) | - gmmu_new_pte_kind_f(kind_v) | - gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity)); - - if (rw_flag == gk20a_mem_flag_read_only) - pte_w[0] |= gmmu_new_pte_read_only_true_f(); - if (unmapped_pte && !cacheable) - pte_w[0] |= gmmu_new_pte_read_only_true_f(); - else if (!cacheable) - pte_w[0] |= gmmu_new_pte_vol_true_f(); - - gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d" - " ctag=%d vol=%d" - " [0x%08x, 0x%08x]", - i, *iova, - kind_v, (u32)(*ctag / ctag_granularity), !cacheable, - pte_w[1], pte_w[0]); - - if (*ctag) - *ctag += page_size; - } else if (sparse) { - pte_w[0] = gmmu_new_pte_valid_false_f(); - pte_w[0] |= gmmu_new_pte_vol_true_f(); - } else { - gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); - } - - pte_i = pte3_from_index(i); - - gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]); - gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]); - - if (*iova) { - *iova += page_size; - *offset += page_size; - if (*sgl && *offset + page_size > (*sgl)->length) { - u64 new_iova; - *sgl = sg_next(*sgl); - if (*sgl) { - new_iova = sg_phys(*sgl); - gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", - new_iova, (*sgl)->length); - if (new_iova) { - *offset = 0; - *iova = new_iova; - } - } - } - } - return 0; + pd_write(g, pd, pd_offset + 0, pte_w[0]); + pd_write(g, pd, pd_offset + 1, pte_w[1]); } static const struct gk20a_mmu_level gp10b_mm_levels[] = { @@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, struct vm_gk20a *vm) { - u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); + u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index ed152cd84..28a2cb827 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h @@ -38,36 +38,97 @@ enum gmmu_pgsz_gk20a { gmmu_nr_page_sizes = 3, }; -struct gk20a_mm_entry { - /* backing for */ - struct nvgpu_mem mem; - u32 woffset; /* if >0, mem is a shadow copy, owned by another entry */ - int pgsz; - struct gk20a_mm_entry *entries; - int num_entries; +enum gk20a_mem_rw_flag { + gk20a_mem_flag_none = 0, /* RW */ + gk20a_mem_flag_read_only = 1, /* RO */ + gk20a_mem_flag_write_only = 2, /* WO */ +}; + +/* + * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs + * in the GMMU. + */ +struct nvgpu_gmmu_pd { + /* + * DMA memory describing the PTEs or PTEs. + */ + struct nvgpu_mem mem; + + /* + * List of pointers to the next level of page tables. Does not + * need to be populated when this PD is pointing to PTEs. + */ + struct nvgpu_gmmu_pd *entries; + int num_entries; +}; + +/* + * Reduce the number of arguments getting passed through the various levels of + * GMMU mapping functions. + * + * The following fields are set statically and do not change throughout + * mapping call: + * + * pgsz: Index into the page size table. + * kind_v: Kind attributes for mapping. + * cacheable: Cacheability of the mapping. + * rw_flag: Flag from enum gk20a_mem_rw_flag + * sparse: Set if the mapping should be sparse. + * priv: Privilidged mapping. 
+ * valid: Set if the PTE should be marked valid. + * aperture: VIDMEM or SYSMEM. + * debug: When set print debugging info. + * + * These fields are dynamically updated as necessary during the map: + * + * ctag: Comptag line in the comptag cache; + * updated every time we write a PTE. + */ +struct nvgpu_gmmu_attrs { + u32 pgsz; + u32 kind_v; + u64 ctag; + bool cacheable; + int rw_flag; + bool sparse; + bool priv; + bool valid; + enum nvgpu_aperture aperture; + bool debug; }; struct gk20a_mmu_level { int hi_bit[2]; int lo_bit[2]; - int (*update_entry)(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - u32 i, u32 gmmu_pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u32 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, bool sparse, bool priv, - enum nvgpu_aperture aperture); - size_t entry_size; + + /* + * Build map from virt_addr -> phys_addr. + */ + void (*update_entry)(struct vm_gk20a *vm, + const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_pd *pd, + u32 pd_idx, + u64 phys_addr, + u64 virt_addr, + struct nvgpu_gmmu_attrs *attrs); + u32 entry_size; }; -int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, - enum gmmu_pgsz_gk20a pgsz_idx, - const struct gk20a_mmu_level *l, - struct gk20a_mm_entry *entry, - struct gk20a_mm_entry *prev_entry); +static inline const char *nvgpu_gmmu_perm_str(enum gk20a_mem_rw_flag p) +{ + switch (p) { + case gk20a_mem_flag_none: + return "RW"; + case gk20a_mem_flag_write_only: + return "WO"; + case gk20a_mem_flag_read_only: + return "RO"; + default: + return "??"; + } +} + +int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm); /** * nvgpu_gmmu_map - Map memory into the GMMU. @@ -106,6 +167,33 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, u64 gpu_va); void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, - struct gk20a_mm_entry *entry); + struct nvgpu_gmmu_pd *entry); + +/* + * Some useful routines that are shared across chips. + */ +static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l, + u32 pd_idx) +{ + return (pd_idx * l->entry_size) / sizeof(u32); +} + +static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, + size_t w, size_t data) +{ + nvgpu_mem_wr32(g, &pd->mem, w, data); +} + + +/* + * Internal debugging routines. Probably not something you want to use. + */ +#define pte_dbg(g, attrs, fmt, args...) 
\ + do { \ + if (attrs && attrs->debug) \ + nvgpu_info(g, fmt, ##args); \ + else \ + nvgpu_log(g, gpu_dbg_pte, fmt, ##args); \ + } while (0) #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h index 66d04ab8f..4259d40f5 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h @@ -109,9 +109,9 @@ nvgpu_mem_from_clear_list_entry(struct nvgpu_list_node *node) static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) { switch (aperture) { - case APERTURE_INVALID: return "invalid"; - case APERTURE_SYSMEM: return "sysmem"; - case APERTURE_VIDMEM: return "vidmem"; + case APERTURE_INVALID: return "INVAL"; + case APERTURE_SYSMEM: return "SYSMEM"; + case APERTURE_VIDMEM: return "VIDMEM"; }; return "UNKNOWN"; } diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h index f6d88cc3b..255b43615 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/vm.h +++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h @@ -126,6 +126,7 @@ mapped_buffer_from_rbtree_node(struct nvgpu_rbtree_node *node) struct vm_gk20a { struct mm_gk20a *mm; struct gk20a_as_share *as_share; /* as_share this represents */ + char name[20]; u64 va_start; u64 va_limit; @@ -145,7 +146,7 @@ struct vm_gk20a { struct nvgpu_mutex update_gmmu_lock; - struct gk20a_mm_entry pdb; + struct nvgpu_gmmu_pd pdb; /* * These structs define the address spaces. In some cases it's possible