diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu index d02870fbb..6e475fcb2 100644 --- a/drivers/gpu/nvgpu/Makefile.nvgpu +++ b/drivers/gpu/nvgpu/Makefile.nvgpu @@ -55,6 +55,7 @@ nvgpu-y := \ common/mm/pd_cache.o \ common/mm/vm.o \ common/mm/vm_area.o \ + common/mm/nvgpu_mem.o \ common/bus.o \ common/enabled.o \ common/pramin.o \ diff --git a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c index e4991d0da..eb54f3fd6 100644 --- a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c +++ b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -395,3 +396,116 @@ int __nvgpu_mem_create_from_pages(struct gk20a *g, struct nvgpu_mem *dest, return 0; } + +static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_dup(struct gk20a *g, + struct nvgpu_mem_sgl *sgl) +{ + struct nvgpu_mem_sgl *head, *next; + + head = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!head) + return NULL; + + next = head; + while (true) { + nvgpu_log(g, gpu_dbg_sgl, + " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx", + sgl->phys, sgl->dma, sgl->length); + + next->dma = sgl->dma; + next->phys = sgl->phys; + next->length = sgl->length; + next->next = NULL; + + sgl = nvgpu_mem_sgl_next(sgl); + if (!sgl) + break; + + next->next = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!next->next) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + next = next->next; + } + + return head; +} + +static struct nvgpu_mem_sgl *__nvgpu_mem_sgl_create_from_vidmem( + struct gk20a *g, + struct scatterlist *linux_sgl) +{ + struct nvgpu_page_alloc *vidmem_alloc; + + vidmem_alloc = get_vidmem_page_alloc(linux_sgl); + if (!vidmem_alloc) + return NULL; + + nvgpu_log(g, gpu_dbg_sgl, "Vidmem sgl:"); + + return __nvgpu_mem_sgl_dup(g, vidmem_alloc->sgl); +} + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g, + struct sg_table *sgt) +{ + struct nvgpu_mem_sgl *head, *sgl, *next; + struct scatterlist *linux_sgl = sgt->sgl; + + if (is_vidmem_page_alloc(sg_dma_address(linux_sgl))) + return __nvgpu_mem_sgl_create_from_vidmem(g, linux_sgl); + + head = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!head) + return NULL; + + nvgpu_log(g, gpu_dbg_sgl, "Making sgl:"); + + sgl = head; + while (true) { + sgl->dma = sg_dma_address(linux_sgl); + sgl->phys = sg_phys(linux_sgl); + sgl->length = linux_sgl->length; + + /* + * We don't like offsets in the pages here. This will cause + * problems. + */ + if (WARN_ON(linux_sgl->offset)) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + + nvgpu_log(g, gpu_dbg_sgl, + " phys: 0x%-12llx dma: 0x%-12llx len: 0x%llx", + sgl->phys, sgl->dma, sgl->length); + + /* + * When there's no more SGL ents for the Linux SGL we are + * done. Don't bother making any more SGL ents for the nvgpu + * SGL. 
+ */ + linux_sgl = sg_next(linux_sgl); + if (!linux_sgl) + break; + + next = nvgpu_kzalloc(g, sizeof(*sgl)); + if (!next) { + nvgpu_mem_sgl_free(g, head); + return NULL; + } + + sgl->next = next; + sgl = next; + } + + nvgpu_log(g, gpu_dbg_sgl, "Done!"); + return head; +} + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g, + struct nvgpu_mem *mem) +{ + return nvgpu_mem_sgl_create(g, mem->priv.sgt); +} diff --git a/drivers/gpu/nvgpu/common/linux/vm.c b/drivers/gpu/nvgpu/common/linux/vm.c index 86d8bec9c..4a4429dc4 100644 --- a/drivers/gpu/nvgpu/common/linux/vm.c +++ b/drivers/gpu/nvgpu/common/linux/vm.c @@ -21,8 +21,11 @@ #include #include #include +#include #include +#include + #include "gk20a/gk20a.h" #include "gk20a/mm_gk20a.h" #include "gk20a/kind_gk20a.h" @@ -66,17 +69,19 @@ static u64 nvgpu_get_buffer_alignment(struct gk20a *g, struct scatterlist *sgl, if (aperture == APERTURE_VIDMEM) { struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl); - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl_vid = alloc->sgl; - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - chunk_align = 1ULL << __ffs(chunk->base | - chunk->length); + while (sgl_vid) { + chunk_align = 1ULL << + __ffs(nvgpu_mem_sgl_phys(sgl_vid) | + nvgpu_mem_sgl_length(sgl_vid)); if (align) align = min(align, chunk_align); else align = chunk_align; + + sgl_vid = nvgpu_mem_sgl_next(sgl_vid); } return align; @@ -237,6 +242,7 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, struct nvgpu_vm_area *vm_area = NULL; u32 ctag_offset; enum nvgpu_aperture aperture; + struct nvgpu_mem_sgl *nvgpu_sgl; /* * The kind used as part of the key for map caching. HW may @@ -393,9 +399,12 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, ctag_offset += buffer_offset >> ilog2(g->ops.fb.compression_page_size(g)); + nvgpu_sgl = nvgpu_mem_sgl_create(g, bfr.sgt); + /* update gmmu ptes */ - map_offset = g->ops.mm.gmmu_map(vm, map_offset, - bfr.sgt, + map_offset = g->ops.mm.gmmu_map(vm, + map_offset, + nvgpu_sgl, buffer_offset, /* sg offset */ mapping_size, bfr.pgsz_idx, @@ -410,6 +419,8 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, if (!map_offset) goto clean_up; + nvgpu_mem_sgl_free(g, nvgpu_sgl); + mapped_buffer = nvgpu_kzalloc(g, sizeof(*mapped_buffer)); if (!mapped_buffer) { nvgpu_warn(g, "oom allocating tracking buffer"); diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index 7f486d682..41f5acdd7 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -65,11 +65,14 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, struct gk20a *g = gk20a_from_vm(vm); u64 vaddr; - struct sg_table *sgt = mem->priv.sgt; + struct nvgpu_mem_sgl *sgl = nvgpu_mem_sgl_create_from_mem(g, mem); + + if (!sgl) + return -ENOMEM; nvgpu_mutex_acquire(&vm->update_gmmu_lock); vaddr = g->ops.mm.gmmu_map(vm, addr, - sgt, /* sg table */ + sgl, /* sg list */ 0, /* sg offset */ size, gmmu_page_size_kernel, @@ -82,8 +85,11 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, NULL, /* mapping_batch handle */ aperture); nvgpu_mutex_release(&vm->update_gmmu_lock); + + nvgpu_mem_sgl_free(g, sgl); + if (!vaddr) { - nvgpu_err(g, "failed to allocate va space"); + nvgpu_err(g, "failed to map buffer!"); return 0; } @@ -91,7 +97,7 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, } /* - * Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings. + * Map a nvgpu_mem into the GMMU. This is for kernel space to use. 
*/ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, struct nvgpu_mem *mem, @@ -106,7 +112,7 @@ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, } /* - * Like nvgpu_gmmu_map() except it can work on a fixed address instead. + * Like nvgpu_gmmu_map() except this can work on a fixed address. */ u64 nvgpu_gmmu_map_fixed(struct vm_gk20a *vm, struct nvgpu_mem *mem, @@ -407,7 +413,7 @@ static int __set_pd_level(struct vm_gk20a *vm, */ target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) : - g->ops.mm.gpu_phys_addr(g, attrs, phys_addr); + phys_addr; l->update_entry(vm, l, pd, pd_idx, @@ -458,18 +464,16 @@ static int __set_pd_level(struct vm_gk20a *vm, * VIDMEM version of the update_ptes logic. */ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { - struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; u64 phys_addr, chunk_length; int err = 0; - if (!sgt) { + if (!sgl) { /* * This is considered an unmap. Just pass in 0 as the physical * address for the entire GPU range. @@ -482,22 +486,21 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, return err; } - alloc = get_vidmem_page_alloc(sgt->sgl); - /* * Otherwise iterate across all the chunks in this allocation and * map them. */ - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + while (sgl) { if (space_to_skip && - space_to_skip >= chunk->length) { - space_to_skip -= chunk->length; + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); continue; } - phys_addr = chunk->base + space_to_skip; - chunk_length = min(length, (chunk->length - space_to_skip)); + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(length, (nvgpu_mem_sgl_length(sgl) - + space_to_skip)); err = __set_pd_level(vm, &vm->pdb, 0, @@ -518,23 +521,24 @@ static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, if (length == 0) break; + + sgl = nvgpu_mem_sgl_next(sgl); } return err; } static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { int err; - struct scatterlist *sgl; struct gk20a *g = gk20a_from_vm(vm); - if (!sgt) { + if (!sgl) { /* * This is considered an unmap. Just pass in 0 as the physical * address for the entire GPU range. @@ -548,19 +552,15 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, } /* - * At this point we have a Linux scatter-gather list pointing to some - * number of discontiguous chunks of memory. Iterate over that list and + * At this point we have a scatter-gather list pointing to some number + * of discontiguous chunks of memory. We must iterate over that list and * generate a GMMU map call for each chunk. There are two possibilities: - * either the IOMMU is enabled or not. When the IOMMU is enabled the + * either an IOMMU is enabled or not. When an IOMMU is enabled the * mapping is simple since the "physical" address is actually a virtual - * IO address and will be contiguous. The no-IOMMU case is more - * complicated. We will have to iterate over the SGT and do a separate - * map for each chunk of the SGT. + * IO address and will be contiguous. 
*/ - sgl = sgt->sgl; - if (!g->mm.bypass_smmu) { - u64 io_addr = nvgpu_mem_get_addr_sgl(g, sgl); + u64 io_addr = nvgpu_mem_sgl_gpu_addr(g, sgl, attrs); io_addr += space_to_skip; @@ -585,14 +585,16 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, /* * Cut out sgl ents for space_to_skip. */ - if (space_to_skip && space_to_skip >= sgl->length) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); + if (space_to_skip && + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); continue; } - phys_addr = sg_phys(sgl) + space_to_skip; - chunk_length = min(length, sgl->length - space_to_skip); + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(length, + nvgpu_mem_sgl_length(sgl) - space_to_skip); err = __set_pd_level(vm, &vm->pdb, 0, @@ -600,13 +602,11 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, virt_addr, chunk_length, attrs); - if (err) - return err; space_to_skip = 0; virt_addr += chunk_length; length -= chunk_length; - sgl = sg_next(sgl); + sgl = nvgpu_mem_sgl_next(sgl); if (length == 0) break; @@ -624,22 +624,20 @@ static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, * implementations. But the logic around that is generic to all chips. Every * chip has some number of PDE levels and then a PTE level. * - * Each chunk of the incoming SGT is sent to the chip specific implementation + * Each chunk of the incoming SGL is sent to the chip specific implementation * of page table update. * * [*] Note: the "physical" address may actually be an IO virtual address in the * case of SMMU usage. */ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 space_to_skip, u64 virt_addr, u64 length, struct nvgpu_gmmu_attrs *attrs) { struct gk20a *g = gk20a_from_vm(vm); - struct nvgpu_page_alloc *alloc; - u64 phys_addr = 0; u32 page_size; int err; @@ -665,25 +663,16 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, return err; } - if (sgt) { - if (attrs->aperture == APERTURE_VIDMEM) { - alloc = get_vidmem_page_alloc(sgt->sgl); - - phys_addr = alloc->base; - } else - phys_addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl); - } - __gmmu_dbg(g, attrs, "vm=%s " "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " "kind=%#02x APT=%-6s %c%c%c%c%c", vm->name, - sgt ? "MAP" : "UNMAP", + sgl ? "MAP" : "UNMAP", virt_addr, length, - phys_addr, + sgl ? nvgpu_mem_sgl_phys(sgl) : 0, space_to_skip, page_size >> 10, nvgpu_gmmu_perm_str(attrs->rw_flag), @@ -696,19 +685,19 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, attrs->valid ? 'V' : '-'); /* - * Handle VIDMEM progamming. Currently uses a different scatter list - * format. + * For historical reasons these are separate, but soon these will be + * unified. */ if (attrs->aperture == APERTURE_VIDMEM) err = __nvgpu_gmmu_update_page_table_vidmem(vm, - sgt, + sgl, space_to_skip, virt_addr, length, attrs); else err = __nvgpu_gmmu_update_page_table_sysmem(vm, - sgt, + sgl, space_to_skip, virt_addr, length, @@ -717,7 +706,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, unmap_gmmu_pages(g, &vm->pdb); nvgpu_smp_mb(); - __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); + __gmmu_dbg(g, attrs, "%-5s Done!", sgl ? 
"MAP" : "UNMAP"); return err; } @@ -736,7 +725,7 @@ static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, */ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, u64 vaddr, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -785,7 +774,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, allocated = true; } - err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, + err = __nvgpu_gmmu_update_page_table(vm, sgl, buffer_offset, vaddr, size, &attrs); if (err) { nvgpu_err(g, "failed to update ptes on map"); diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c new file mode 100644 index 000000000..7296c6738 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/nvgpu_mem.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "gk20a/gk20a.h" + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl) +{ + return sgl->next; +} + +u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl) +{ + return sgl->phys; +} + +u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl) +{ + return sgl->dma; +} + +u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl) +{ + return sgl->length; +} + +/* + * This builds a GPU address for the %sgl based on whether an IOMMU is present + * or not. It also handles turning the physical address into the true GPU + * physical address that should be programmed into the page tables. + */ +u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl, + struct nvgpu_gmmu_attrs *attrs) +{ + if (nvgpu_mem_sgl_dma(sgl) == 0) + return g->ops.mm.gpu_phys_addr(g, attrs, + nvgpu_mem_sgl_phys(sgl)); + + if (nvgpu_mem_sgl_dma(sgl) == DMA_ERROR_CODE) + return 0; + + return gk20a_mm_smmu_vaddr_translate(g, nvgpu_mem_sgl_dma(sgl)); +} + +void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl) +{ + struct nvgpu_mem_sgl *next; + + /* + * Free each of the elements. We expect each element to have been + * nvgpu_k[mz]alloc()ed. 
+ */ + while (sgl) { + next = nvgpu_mem_sgl_next(sgl); + nvgpu_kfree(g, sgl); + sgl = next; + } +} diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c index 72ff8f2dc..6d92b4570 100644 --- a/drivers/gpu/nvgpu/common/mm/page_allocator.c +++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c @@ -147,19 +147,16 @@ static void __nvgpu_free_pages(struct nvgpu_page_allocator *a, struct nvgpu_page_alloc *alloc, bool free_buddy_alloc) { - struct page_alloc_chunk *chunk; + struct nvgpu_mem_sgl *sgl = alloc->sgl; - while (!nvgpu_list_empty(&alloc->alloc_chunks)) { - chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, - list_entry); - nvgpu_list_del(&chunk->list_entry); - - if (free_buddy_alloc) - nvgpu_free(&a->source_allocator, chunk->base); - nvgpu_kmem_cache_free(a->chunk_cache, chunk); + if (free_buddy_alloc) { + while (sgl) { + nvgpu_free(&a->source_allocator, sgl->phys); + sgl = nvgpu_mem_sgl_next(sgl); + } } + nvgpu_mem_sgl_free(a->owner->g, alloc->sgl); nvgpu_kmem_cache_free(a->alloc_cache, alloc); } @@ -243,15 +240,14 @@ static void free_slab_page(struct nvgpu_page_allocator *a, } /* - * This expects @alloc to have 1 empty page_alloc_chunk already added to the - * alloc_chunks list. + * This expects @alloc to have 1 empty sgl_entry ready for usage. */ static int __do_slab_alloc(struct nvgpu_page_allocator *a, struct page_alloc_slab *slab, struct nvgpu_page_alloc *alloc) { struct page_alloc_slab_page *slab_page = NULL; - struct page_alloc_chunk *chunk; + struct nvgpu_mem_sgl *sgl; unsigned long offs; /* @@ -302,18 +298,19 @@ static int __do_slab_alloc(struct nvgpu_page_allocator *a, BUG(); /* Should be impossible to hit this. */ /* - * Handle building the nvgpu_page_alloc struct. We expect one - * page_alloc_chunk to be present. + * Handle building the nvgpu_page_alloc struct. We expect one sgl + * to be present. 
*/ alloc->slab_page = slab_page; alloc->nr_chunks = 1; alloc->length = slab_page->slab_size; alloc->base = slab_page->page_addr + (offs * slab_page->slab_size); - chunk = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - chunk->base = alloc->base; - chunk->length = alloc->length; + sgl = alloc->sgl; + sgl->phys = alloc->base; + sgl->dma = alloc->base; + sgl->length = alloc->length; + sgl->next = NULL; return 0; } @@ -327,7 +324,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( int err, slab_nr; struct page_alloc_slab *slab; struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl = NULL; /* * Align the length to a page and then divide by the page size (4k for @@ -341,15 +338,13 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n"); goto fail; } - chunk = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!chunk) { - palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n"); + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!sgl) { + palloc_dbg(a, "OOM: could not alloc sgl struct!\n"); goto fail; } - nvgpu_init_list_node(&alloc->alloc_chunks); - nvgpu_list_add(&chunk->list_entry, &alloc->alloc_chunks); - + alloc->sgl = sgl; err = __do_slab_alloc(a, slab, alloc); if (err) goto fail; @@ -363,8 +358,8 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_slab( fail: if (alloc) nvgpu_kmem_cache_free(a->alloc_cache, alloc); - if (chunk) - nvgpu_kmem_cache_free(a->chunk_cache, chunk); + if (sgl) + nvgpu_kfree(a->owner->g, sgl); return NULL; } @@ -426,7 +421,7 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( struct nvgpu_page_allocator *a, u64 pages) { struct nvgpu_page_alloc *alloc; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl, *prev_sgl = NULL; u64 max_chunk_len = pages << a->page_shift; int i = 0; @@ -436,7 +431,6 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( memset(alloc, 0, sizeof(*alloc)); - nvgpu_init_list_node(&alloc->alloc_chunks); alloc->length = pages << a->page_shift; while (pages) { @@ -482,36 +476,48 @@ static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( goto fail_cleanup; } - c = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!c) { + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!sgl) { nvgpu_free(&a->source_allocator, chunk_addr); goto fail_cleanup; } pages -= chunk_pages; - c->base = chunk_addr; - c->length = chunk_len; - nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); + sgl->phys = chunk_addr; + sgl->dma = chunk_addr; + sgl->length = chunk_len; + + /* + * Build the singly linked list with a head node that is part of + * the list. 
+ */ + if (prev_sgl) + prev_sgl->next = sgl; + else + alloc->sgl = sgl; + + prev_sgl = sgl; i++; } alloc->nr_chunks = i; - c = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - alloc->base = c->base; + alloc->base = alloc->sgl->phys; return alloc; fail_cleanup: - while (!nvgpu_list_empty(&alloc->alloc_chunks)) { - c = nvgpu_list_first_entry(&alloc->alloc_chunks, - page_alloc_chunk, list_entry); - nvgpu_list_del(&c->list_entry); - nvgpu_free(&a->source_allocator, c->base); - nvgpu_kmem_cache_free(a->chunk_cache, c); + sgl = alloc->sgl; + while (sgl) { + struct nvgpu_mem_sgl *next = sgl->next; + + nvgpu_free(&a->source_allocator, sgl->phys); + nvgpu_kfree(a->owner->g, sgl); + + sgl = next; } + nvgpu_kmem_cache_free(a->alloc_cache, alloc); fail: return NULL; @@ -521,7 +527,7 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages( struct nvgpu_page_allocator *a, u64 len) { struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; u64 pages; int i = 0; @@ -536,11 +542,15 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages( palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n", pages << a->page_shift, pages, alloc->base); - nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); + i++, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); + sgl = sgl->next; } + palloc_dbg(a, "Alloc done\n"); return alloc; } @@ -638,11 +648,11 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( struct nvgpu_page_allocator *a, u64 base, u64 length, u32 unused) { struct nvgpu_page_alloc *alloc; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; alloc = nvgpu_kmem_cache_alloc(a->alloc_cache); - c = nvgpu_kmem_cache_alloc(a->chunk_cache); - if (!alloc || !c) + sgl = nvgpu_kzalloc(a->owner->g, sizeof(*sgl)); + if (!alloc || !sgl) goto fail; alloc->base = nvgpu_alloc_fixed(&a->source_allocator, base, length, 0); @@ -653,17 +663,18 @@ static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( alloc->nr_chunks = 1; alloc->length = length; - nvgpu_init_list_node(&alloc->alloc_chunks); + alloc->sgl = sgl; - c->base = alloc->base; - c->length = length; - nvgpu_list_add(&c->list_entry, &alloc->alloc_chunks); + sgl->phys = alloc->base; + sgl->dma = alloc->base; + sgl->length = length; + sgl->next = NULL; return alloc; fail: - if (c) - nvgpu_kmem_cache_free(a->chunk_cache, c); + if (sgl) + nvgpu_kfree(a->owner->g, sgl); if (alloc) nvgpu_kmem_cache_free(a->alloc_cache, alloc); return NULL; @@ -677,7 +688,7 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, { struct nvgpu_page_allocator *a = page_allocator(__a); struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; + struct nvgpu_mem_sgl *sgl; u64 aligned_len, pages; int i = 0; @@ -697,10 +708,13 @@ static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n", alloc->base, aligned_len, pages); - nvgpu_list_for_each_entry(c, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); + i++, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); + sgl = sgl->next; } a->nr_fixed_allocs++; @@ -896,11 +910,9 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, a->alloc_cache = nvgpu_kmem_cache_create(g, sizeof(struct 
nvgpu_page_alloc)); - a->chunk_cache = nvgpu_kmem_cache_create(g, - sizeof(struct page_alloc_chunk)); a->slab_page_cache = nvgpu_kmem_cache_create(g, sizeof(struct page_alloc_slab_page)); - if (!a->alloc_cache || !a->chunk_cache || !a->slab_page_cache) { + if (!a->alloc_cache || !a->slab_page_cache) { err = -ENOMEM; goto fail; } @@ -941,8 +953,6 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, fail: if (a->alloc_cache) nvgpu_kmem_cache_destroy(a->alloc_cache); - if (a->chunk_cache) - nvgpu_kmem_cache_destroy(a->chunk_cache); if (a->slab_page_cache) nvgpu_kmem_cache_destroy(a->slab_page_cache); nvgpu_kfree(g, a); diff --git a/drivers/gpu/nvgpu/common/pramin.c b/drivers/gpu/nvgpu/common/pramin.c index 425bfdb47..bb7d930e2 100644 --- a/drivers/gpu/nvgpu/common/pramin.c +++ b/drivers/gpu/nvgpu/common/pramin.c @@ -84,37 +84,40 @@ void nvgpu_pramin_access_batched(struct gk20a *g, struct nvgpu_mem *mem, u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg) { struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl; u32 byteoff, start_reg, until_end, n; alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - if (offset >= chunk->length) - offset -= chunk->length; - else + sgl = alloc->sgl; + while (sgl) { + if (offset >= nvgpu_mem_sgl_length(sgl)) { + offset -= nvgpu_mem_sgl_length(sgl); + sgl = sgl->next; + } else { break; + } } while (size) { - byteoff = g->ops.pramin.enter(g, mem, chunk, + u32 sgl_len = (u32)nvgpu_mem_sgl_length(sgl); + + byteoff = g->ops.pramin.enter(g, mem, sgl, offset / sizeof(u32)); start_reg = g->ops.pramin.data032_r(byteoff / sizeof(u32)); until_end = SZ_1M - (byteoff & (SZ_1M - 1)); - n = min3(size, until_end, (u32)(chunk->length - offset)); + n = min3(size, until_end, (u32)(sgl_len - offset)); loop(g, start_reg, n / sizeof(u32), arg); /* read back to synchronize accesses */ gk20a_readl(g, start_reg); - g->ops.pramin.exit(g, mem, chunk); + g->ops.pramin.exit(g, mem, sgl); size -= n; - if (n == (chunk->length - offset)) { - chunk = nvgpu_list_next_entry(chunk, page_alloc_chunk, - list_entry); + if (n == (sgl_len - offset)) { + sgl = nvgpu_mem_sgl_next(sgl); offset = 0; } else { offset += n; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 7eee2d514..355228dba 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -34,6 +34,7 @@ struct gk20a_debug_output; struct nvgpu_clk_pll_debug_data; struct nvgpu_nvhost_dev; struct nvgpu_cpu_time_correlation_sample; +struct nvgpu_mem_sgl; #include #include @@ -70,8 +71,6 @@ struct nvgpu_cpu_time_correlation_sample; #endif #include "ecc_gk20a.h" -struct page_alloc_chunk; - /* PTIMER_REF_FREQ_HZ corresponds to a period of 32 nanoseconds. 32 ns is the resolution of ptimer. 
*/ #define PTIMER_REF_FREQ_HZ 31250000 @@ -701,7 +700,7 @@ struct gpu_ops { bool (*support_sparse)(struct gk20a *g); u64 (*gmmu_map)(struct vm_gk20a *vm, u64 map_offset, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -761,9 +760,9 @@ struct gpu_ops { size_t size); struct { u32 (*enter)(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk, u32 w); + struct nvgpu_mem_sgl *sgl, u32 w); void (*exit)(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk); + struct nvgpu_mem_sgl *sgl); u32 (*data032_r)(u32 i); } pramin; struct { diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 97b7aa800..cd34e769d 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -1151,7 +1151,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) struct gk20a_fence *gk20a_fence_out = NULL; struct gk20a_fence *gk20a_last_fence = NULL; struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; + struct nvgpu_mem_sgl *sgl = NULL; int err = 0; if (g->mm.vidmem.ce_ctx_id == (u32)~0) @@ -1159,16 +1159,16 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { + sgl = alloc->sgl; + while (sgl) { if (gk20a_last_fence) gk20a_fence_put(gk20a_last_fence); err = gk20a_ce_execute_ops(g, g->mm.vidmem.ce_ctx_id, 0, - chunk->base, - chunk->length, + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl), 0x00000000, NVGPU_CE_DST_LOCATION_LOCAL_FB, NVGPU_CE_MEMSET, @@ -1183,6 +1183,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) } gk20a_last_fence = gk20a_fence_out; + sgl = nvgpu_mem_sgl_next(sgl); } if (gk20a_last_fence) { @@ -1262,10 +1263,10 @@ dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr) return addr; } -u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova) +u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, u64 iova) { /* ensure it is not vidmem allocation */ - WARN_ON(is_vidmem_page_alloc((u64)iova)); + WARN_ON(is_vidmem_page_alloc(iova)); if (device_is_iommuable(dev_from_gk20a(g)) && g->ops.mm.get_physical_addr_bits) @@ -2167,11 +2168,6 @@ u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g) return 34; } -u64 gk20a_mm_gpu_phys_addr(struct gk20a *g, u64 phys, u32 flags) -{ - return phys; -} - const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, u32 big_page_size) { diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index c77bebf8e..2fdc17299 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -336,7 +336,6 @@ void gk20a_mm_dump_vm(struct vm_gk20a *vm, int gk20a_mm_suspend(struct gk20a *g); -u64 gk20a_mm_gpu_phys_addr(struct gk20a *g, u64 phys, u32 flags); u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); void gk20a_mm_ltc_isr(struct gk20a *g); @@ -361,29 +360,29 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem) } u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, - u64 map_offset, - struct sg_table *sgt, - u64 buffer_offset, - u64 size, - int pgsz_idx, - u8 kind_v, - u32 ctag_offset, - u32 flags, - int rw_flag, - bool clear_ctags, - bool sparse, - bool priv, - struct vm_gk20a_mapping_batch *batch, - enum nvgpu_aperture aperture); + u64 map_offset, + struct 
nvgpu_mem_sgl *sgl, + u64 buffer_offset, + u64 size, + int pgsz_idx, + u8 kind_v, + u32 ctag_offset, + u32 flags, + int rw_flag, + bool clear_ctags, + bool sparse, + bool priv, + struct vm_gk20a_mapping_batch *batch, + enum nvgpu_aperture aperture); void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, - u64 vaddr, - u64 size, - int pgsz_idx, - bool va_allocated, - int rw_flag, - bool sparse, - struct vm_gk20a_mapping_batch *batch); + u64 vaddr, + u64 size, + int pgsz_idx, + bool va_allocated, + int rw_flag, + bool sparse, + struct vm_gk20a_mapping_batch *batch); struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf); void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf, diff --git a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c index 9d19e9e53..8a34a63c9 100644 --- a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.c @@ -26,9 +26,9 @@ /* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk, u32 w) + struct nvgpu_mem_sgl *sgl, u32 w) { - u64 bufbase = chunk->base; + u64 bufbase = nvgpu_mem_sgl_phys(sgl); u64 addr = bufbase + w * sizeof(u32); u32 hi = (u32)((addr & ~(u64)0xfffff) >> bus_bar0_window_target_bar0_window_base_shift_v()); @@ -40,8 +40,9 @@ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, gk20a_dbg(gpu_dbg_mem, "0x%08x:%08x begin for %p,%p at [%llx,%llx] (sz %llx)", - hi, lo, mem, chunk, bufbase, - bufbase + chunk->length, chunk->length); + hi, lo, mem, sgl, bufbase, + bufbase + nvgpu_mem_sgl_phys(sgl), + nvgpu_mem_sgl_length(sgl)); WARN_ON(!bufbase); @@ -57,9 +58,9 @@ u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, } void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk) + struct nvgpu_mem_sgl *sgl) { - gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, chunk); + gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, sgl); nvgpu_spinlock_release(&g->mm.pramin_window_lock); } diff --git a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h index 1a1ac8714..fc5ba919e 100644 --- a/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/pramin_gk20a.h @@ -19,10 +19,10 @@ struct gk20a; struct nvgpu_mem; -struct page_alloc_chunk; +struct nvgpu_mem_sgl; u32 gk20a_pramin_enter(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk, u32 w); + struct nvgpu_mem_sgl *sgl, u32 w); void gk20a_pramin_exit(struct gk20a *g, struct nvgpu_mem *mem, - struct page_alloc_chunk *chunk); + struct nvgpu_mem_sgl *sgl); #endif diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index fc27b120a..c276f5a64 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -904,7 +904,7 @@ int gr_gp10b_alloc_buffer(struct vm_gk20a *vm, size_t size, mem->gpu_va = nvgpu_gmmu_map(vm, mem, - size, + mem->aligned_size, NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, gk20a_mem_flag_none, false, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index de129a5f4..11060300b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h @@ -27,8 +27,6 @@ #include #endif -struct scatterlist; - /* * This is the GMMU API visible to blocks outside of the GMMU. 
Basically this * API supports all the different types of mappings that might be done in the diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h index e2d4d3367..f96c28018 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/linux/nvgpu_mem.h @@ -32,6 +32,8 @@ struct nvgpu_mem_priv { }; u64 nvgpu_mem_get_addr_sgl(struct gk20a *g, struct scatterlist *sgl); +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create(struct gk20a *g, + struct sg_table *sgt); /** * __nvgpu_mem_create_from_pages - Create an nvgpu_mem from physical pages. diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h index 4cac3e702..cfce8c5b2 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log.h @@ -71,6 +71,7 @@ enum nvgpu_log_categories { gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */ gpu_dbg_alloc = BIT(21), /* Allocator debugging. */ gpu_dbg_dma = BIT(22), /* DMA allocation prints. */ + gpu_dbg_sgl = BIT(23), /* SGL related traces. */ gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ }; diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h index a112623e0..7d19cf81d 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h @@ -33,6 +33,8 @@ struct gk20a; struct nvgpu_allocator; struct nvgpu_gmmu_attrs; +#define NVGPU_MEM_DMA_ERROR (~0ULL) + /* * Real location of a buffer - nvgpu_aperture_mask() will deduce what will be * told to the gpu about the aperture, but this flag designates where the @@ -44,6 +46,28 @@ enum nvgpu_aperture { APERTURE_VIDMEM }; +/* + * This struct holds the necessary information for describing a struct + * nvgpu_mem's scatter gather list. + * + * These are created in a platform dependent way. As a result the function + * definition for allocating these lives in the file. + */ +struct nvgpu_mem_sgl { + /* + * Internally this is implemented as a singly linked list. + */ + struct nvgpu_mem_sgl *next; + + /* + * There is both a phys address and a DMA address since some systems, + * for example ones with an IOMMU, may see these as different addresses. + */ + u64 phys; + u64 dma; + u64 length; +}; + struct nvgpu_mem { /* * Populated for all nvgpu_mem structs - vidmem or system. @@ -176,6 +200,27 @@ int nvgpu_mem_create_from_mem(struct gk20a *g, struct nvgpu_mem *dest, struct nvgpu_mem *src, int start_page, int nr_pages); +/** + * nvgpu_mem_sgl_create_from_mem - Create a scatter list from an nvgpu_mem. + * + * @g - The GPU. + * @mem - The source memory allocation to use. + * + * Create a scatter gather list from the passed @mem struct. This list lets the + * calling code iterate across each chunk of a DMA allocation for when that DMA + * allocation is not completely contiguous. + */ +struct nvgpu_mem_sgl *nvgpu_mem_sgl_create_from_mem(struct gk20a *g, + struct nvgpu_mem *mem); +void nvgpu_mem_sgl_free(struct gk20a *g, struct nvgpu_mem_sgl *sgl); + +struct nvgpu_mem_sgl *nvgpu_mem_sgl_next(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_phys(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_dma(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_length(struct nvgpu_mem_sgl *sgl); +u64 nvgpu_mem_sgl_gpu_addr(struct gk20a *g, struct nvgpu_mem_sgl *sgl, + struct nvgpu_gmmu_attrs *attrs); + /* * Buffer accessors - wrap between begin() and end() if there is no permanent * kernel mapping for this buffer. 
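
For reference, here is a minimal sketch of how calling code might consume the SGL API declared in nvgpu_mem.h above (nvgpu_mem_sgl_create_from_mem() plus the accessor helpers). This is illustrative only and not part of the change: the helper name print_mem_chunks() is invented, and it assumes the nvgpu_mem has already been allocated through the usual DMA paths so that a scatter list can be built from it.

/*
 * Illustrative only: walk the chunks of an nvgpu_mem via the new SGL API.
 * Assumes @mem has already been allocated/populated by the normal DMA code.
 */
static int print_mem_chunks(struct gk20a *g, struct nvgpu_mem *mem)
{
	struct nvgpu_mem_sgl *head, *sgl;

	head = nvgpu_mem_sgl_create_from_mem(g, mem);
	if (!head)
		return -ENOMEM;

	for (sgl = head; sgl; sgl = nvgpu_mem_sgl_next(sgl))
		nvgpu_log(g, gpu_dbg_sgl, "chunk: phys=0x%llx len=0x%llx",
			  nvgpu_mem_sgl_phys(sgl),
			  nvgpu_mem_sgl_length(sgl));

	/* The duplicated list is owned by the caller and must be freed. */
	nvgpu_mem_sgl_free(g, head);
	return 0;
}

On the Linux side the list is duplicated from the underlying sg_table (or from the vidmem page allocator's SGL), so freeing it does not affect the original allocation.
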
diff --git a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h index 9a5ef8d37..de83ca7f3 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h +++ b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h @@ -18,6 +18,7 @@ #define PAGE_ALLOCATOR_PRIV_H #include +#include #include #include #include @@ -83,27 +84,17 @@ page_alloc_slab_page_from_list_entry(struct nvgpu_list_node *node) ((uintptr_t)node - offsetof(struct page_alloc_slab_page, list_entry)); }; -struct page_alloc_chunk { - struct nvgpu_list_node list_entry; - - u64 base; - u64 length; -}; - -static inline struct page_alloc_chunk * -page_alloc_chunk_from_list_entry(struct nvgpu_list_node *node) -{ - return (struct page_alloc_chunk *) - ((uintptr_t)node - offsetof(struct page_alloc_chunk, list_entry)); -}; - /* * Struct to handle internal management of page allocation. It holds a list * of the chunks of pages that make up the overall allocation - much like a * scatter gather table. */ struct nvgpu_page_alloc { - struct nvgpu_list_node alloc_chunks; + /* + * nvgpu_mem_sgl for describing the actual allocation. Convenient for + * GMMU mapping. + */ + struct nvgpu_mem_sgl *sgl; int nr_chunks; u64 length; @@ -156,7 +147,6 @@ struct nvgpu_page_allocator { int nr_slabs; struct nvgpu_kmem_cache *alloc_cache; - struct nvgpu_kmem_cache *chunk_cache; struct nvgpu_kmem_cache *slab_page_cache; u64 flags; diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c index 85c436e5a..ee9b791af 100644 --- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c +++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_mm_gp10b.c @@ -13,7 +13,6 @@ * more details. */ -#include #include "vgpu/vgpu.h" #include "vgpu_mm_gp10b.h" #include "gk20a/mm_gk20a.h" @@ -41,7 +40,7 @@ static inline int add_mem_desc(struct tegra_vgpu_mem_desc *mem_desc, static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, u64 map_offset, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -61,10 +60,9 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, struct tegra_vgpu_as_map_ex_params *p = &msg.params.as_map_ex; struct tegra_vgpu_mem_desc *mem_desc; u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; + u64 buffer_size = PAGE_ALIGN(size); u64 space_to_skip = buffer_offset; - u64 buffer_size = 0; u32 mem_desc_count = 0, i; - struct scatterlist *sgl; void *handle = NULL; size_t oob_size; u8 prot; @@ -73,7 +71,7 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, /* FIXME: add support for sparse mappings */ - if (WARN_ON(!sgt) || WARN_ON(!g->mm.bypass_smmu)) + if (WARN_ON(!sgl) || WARN_ON(!g->mm.bypass_smmu)) return 0; if (space_to_skip & (page_size - 1)) @@ -100,33 +98,36 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, goto fail; } - sgl = sgt->sgl; - while (space_to_skip && sgl && - (space_to_skip + page_size > sgl->length)) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); - } - WARN_ON(!sgl); + while (sgl) { + u64 phys_addr; + u64 chunk_length; - if (add_mem_desc(&mem_desc[mem_desc_count++], - sg_phys(sgl) + space_to_skip, - sgl->length - space_to_skip, - &oob_size)) { - err = -ENOMEM; - goto fail; - } - buffer_size += sgl->length - space_to_skip; + /* + * Cut out sgl ents for space_to_skip. 
+ */ + if (space_to_skip && + space_to_skip >= nvgpu_mem_sgl_length(sgl)) { + space_to_skip -= nvgpu_mem_sgl_length(sgl); + sgl = nvgpu_mem_sgl_next(sgl); + continue; + } - sgl = sg_next(sgl); - while (sgl && buffer_size < size) { - if (add_mem_desc(&mem_desc[mem_desc_count++], sg_phys(sgl), - sgl->length, &oob_size)) { + phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip; + chunk_length = min(size, + nvgpu_mem_sgl_length(sgl) - space_to_skip); + + if (add_mem_desc(&mem_desc[mem_desc_count++], phys_addr, + chunk_length, &oob_size)) { err = -ENOMEM; goto fail; } - buffer_size += sgl->length; - sgl = sg_next(sgl); + space_to_skip = 0; + size -= chunk_length; + sgl = nvgpu_mem_sgl_next(sgl); + + if (size == 0) + break; } if (rw_flag == gk20a_mem_flag_read_only) @@ -153,7 +154,7 @@ static u64 vgpu_gp10b_locked_gmmu_map(struct vm_gk20a *vm, msg.handle = vgpu_get_handle(g); p->handle = vm->handle; p->gpu_va = map_offset; - p->size = size; + p->size = buffer_size; p->mem_desc_count = mem_desc_count; p->pgsz_idx = pgsz_idx; p->iova = 0; diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c index ef9e00c8c..5da6f158b 100644 --- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c @@ -78,7 +78,7 @@ int vgpu_init_mm_support(struct gk20a *g) static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm, u64 map_offset, - struct sg_table *sgt, + struct nvgpu_mem_sgl *sgl, u64 buffer_offset, u64 size, int pgsz_idx, @@ -98,7 +98,7 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm, struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d); struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_as_map_params *p = &msg.params.as_map; - u64 addr = nvgpu_mem_get_addr_sgl(g, sgt->sgl); + u64 addr = nvgpu_mem_sgl_gpu_addr(g, sgl, NULL); u8 prot; gk20a_dbg_fn("");
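
The vidmem and sysmem GMMU update paths and the vgpu map path above all follow the same chunk walk: step over whole SGL entries until space_to_skip lands inside one, then map partial or full chunks until the requested length is consumed. A condensed sketch of that shared pattern follows; map_chunk() is a stand-in for the real per-chunk work (__set_pd_level() or add_mem_desc()) and is not a function introduced by this patch.

/*
 * Sketch of the SGL walk shared by the map paths in this patch.
 * map_chunk() is a placeholder for the real per-chunk operation.
 */
static int walk_sgl_for_map(struct gk20a *g, struct nvgpu_mem_sgl *sgl,
			    u64 space_to_skip, u64 virt_addr, u64 length)
{
	while (sgl) {
		u64 phys_addr, chunk_length;
		int err;

		/* Entirely skipped chunks are just stepped over. */
		if (space_to_skip &&
		    space_to_skip >= nvgpu_mem_sgl_length(sgl)) {
			space_to_skip -= nvgpu_mem_sgl_length(sgl);
			sgl = nvgpu_mem_sgl_next(sgl);
			continue;
		}

		phys_addr = nvgpu_mem_sgl_phys(sgl) + space_to_skip;
		chunk_length = min(length,
				   nvgpu_mem_sgl_length(sgl) - space_to_skip);

		err = map_chunk(g, phys_addr, virt_addr, chunk_length);
		if (err)
			return err;

		/* Only the first mapped chunk can carry a partial offset. */
		space_to_skip = 0;
		virt_addr += chunk_length;
		length -= chunk_length;
		sgl = nvgpu_mem_sgl_next(sgl);

		if (length == 0)
			break;
	}

	return 0;
}
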