diff --git a/drivers/gpu/nvgpu/common/linux/dma.c b/drivers/gpu/nvgpu/common/linux/dma.c
index c13dae8b3..81aebb7d7 100644
--- a/drivers/gpu/nvgpu/common/linux/dma.c
+++ b/drivers/gpu/nvgpu/common/linux/dma.c
@@ -221,6 +221,16 @@ int nvgpu_dma_alloc_flags_sys(struct gk20a *g, unsigned long flags,
 	NVGPU_DEFINE_DMA_ATTRS(dma_attrs);
 	void *alloc_ret;
 
+	/*
+	 * WAR for IO coherent chips: the DMA API does not seem to generate
+	 * mappings that work correctly. Unclear why - Bug ID: 2040115.
+	 *
+	 * Basically we tell the DMA API not to create a kernel mapping (by
+	 * forcing NO_KERNEL_MAPPING) and then make a vmap() ourselves.
+	 */
+	if (nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM))
+		flags |= NVGPU_DMA_NO_KERNEL_MAPPING;
+
 	/*
 	 * Before the debug print so we see this in the total. But during
 	 * cleanup in the fail path this has to be subtracted.
@@ -255,7 +265,17 @@ int nvgpu_dma_alloc_flags_sys(struct gk20a *g, unsigned long flags,
 					iova, size, flags);
 	}
 	if (err)
-		goto fail_free;
+		goto fail_free_dma;
+
+	if (nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM)) {
+		mem->cpu_va = vmap(mem->priv.pages,
+				   size >> PAGE_SHIFT,
+				   0, PAGE_KERNEL);
+		if (!mem->cpu_va) {
+			err = -ENOMEM;
+			goto fail_free_sgt;
+		}
+	}
 
 	mem->aligned_size = size;
 	mem->aperture = APERTURE_SYSMEM;
@@ -265,12 +285,14 @@ int nvgpu_dma_alloc_flags_sys(struct gk20a *g, unsigned long flags,
 
 	return 0;
 
-fail_free:
-	g->dma_memory_used -= mem->aligned_size;
+fail_free_sgt:
+	nvgpu_free_sgtable(g, &mem->priv.sgt);
+fail_free_dma:
 	dma_free_attrs(d, size, alloc_ret, iova, NVGPU_DMA_ATTR(dma_attrs));
 	mem->cpu_va = NULL;
 	mem->priv.sgt = NULL;
 	mem->size = 0;
+	g->dma_memory_used -= mem->aligned_size;
 	return err;
 }
 
@@ -466,6 +488,12 @@ static void nvgpu_dma_free_sys(struct gk20a *g, struct nvgpu_mem *mem)
 	if (!(mem->mem_flags & NVGPU_MEM_FLAG_SHADOW_COPY) &&
 	    !(mem->mem_flags & __NVGPU_MEM_FLAG_NO_DMA) &&
 	    (mem->cpu_va || mem->priv.pages)) {
+		/*
+		 * Free side of WAR for bug 2040115.
+		 */
+		if (nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM))
+			vunmap(mem->cpu_va);
+
 		if (mem->priv.flags) {
 			NVGPU_DEFINE_DMA_ATTRS(dma_attrs);
 
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index 52348db00..741c86e77 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -1149,6 +1149,12 @@ static int gk20a_probe(struct platform_device *dev)
 	if (err)
 		goto return_err;
 
+	np = nvgpu_get_node(gk20a);
+	if (of_dma_is_coherent(np)) {
+		__nvgpu_set_enabled(gk20a, NVGPU_USE_COHERENT_SYSMEM, true);
+		__nvgpu_set_enabled(gk20a, NVGPU_SUPPORT_IO_COHERENCE, true);
+	}
+
 	if (nvgpu_platform_is_simulation(gk20a))
 		__nvgpu_set_enabled(gk20a, NVGPU_IS_FMODEL, true);
 
@@ -1208,12 +1214,6 @@ static int gk20a_probe(struct platform_device *dev)
 
 	gk20a->mm.has_physical_mode = !nvgpu_is_hypervisor_mode(gk20a);
 
-	np = nvgpu_get_node(gk20a);
-	if (of_dma_is_coherent(np)) {
-		__nvgpu_set_enabled(gk20a, NVGPU_USE_COHERENT_SYSMEM, true);
-		__nvgpu_set_enabled(gk20a, NVGPU_SUPPORT_IO_COHERENCE, true);
-	}
-
 	return 0;
 
 return_err:
diff --git a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
index c859520d0..698976940 100644
--- a/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
+++ b/drivers/gpu/nvgpu/common/linux/nvgpu_mem.c
@@ -44,6 +44,14 @@ int nvgpu_mem_begin(struct gk20a *g, struct nvgpu_mem *mem)
 	if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin)
 		return 0;
 
+	/*
+	 * WAR for bug 2040115: every sysmem buffer already has a coherent
+	 * vmap(), so no mapping is made here. The prot settings are left
+	 * alone since eventually this WAR should be deleted.
+	 */
+	if (nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM))
+		return 0;
+
 	/*
 	 * A CPU mapping is implicitly made for all SYSMEM DMA allocations that
 	 * don't have NVGPU_DMA_NO_KERNEL_MAPPING. Thus we don't need to make
@@ -73,6 +81,13 @@ void nvgpu_mem_end(struct gk20a *g, struct nvgpu_mem *mem)
 	if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin)
 		return;
 
+	/*
+	 * WAR for bug 2040115: skip the unmap since the vmap() is released
+	 * when the buffer is freed through the DMA API.
+	 */
+	if (nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM))
+		return;
+
 	/*
 	 * Similar to nvgpu_mem_begin() we don't need to unmap the CPU mapping
 	 * already made by the DMA API.
@@ -393,8 +408,12 @@ int nvgpu_mem_create_from_mem(struct gk20a *g,
 
 	/*
 	 * Re-use the CPU mapping only if the mapping was made by the DMA API.
+	 *
+	 * Bug 2040115: the DMA API wrapper makes the mapping that we should
+	 * re-use.
 	 */
-	if (!(src->priv.flags & NVGPU_DMA_NO_KERNEL_MAPPING))
+	if (!(src->priv.flags & NVGPU_DMA_NO_KERNEL_MAPPING) ||
+	    nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM))
 		dest->cpu_va = src->cpu_va + (PAGE_SIZE * start_page);
 
 	dest->priv.pages = src->priv.pages + start_page;
diff --git a/drivers/gpu/nvgpu/common/linux/pci.c b/drivers/gpu/nvgpu/common/linux/pci.c
index 4ba839c40..973da9ca1 100644
--- a/drivers/gpu/nvgpu/common/linux/pci.c
+++ b/drivers/gpu/nvgpu/common/linux/pci.c
@@ -566,6 +566,12 @@ static int nvgpu_pci_probe(struct pci_dev *pdev,
 	platform->g = g;
 	l->dev = &pdev->dev;
 
+	np = nvgpu_get_node(g);
+	if (of_dma_is_coherent(np)) {
+		__nvgpu_set_enabled(g, NVGPU_USE_COHERENT_SYSMEM, true);
+		__nvgpu_set_enabled(g, NVGPU_SUPPORT_IO_COHERENCE, true);
+	}
+
 	err = pci_enable_device(pdev);
 	if (err)
 		return err;
@@ -644,13 +650,6 @@ static int nvgpu_pci_probe(struct pci_dev *pdev,
 
 	g->mm.has_physical_mode = false;
 
-	np = nvgpu_get_node(g);
-
-	if (of_dma_is_coherent(np)) {
-		__nvgpu_set_enabled(g, NVGPU_USE_COHERENT_SYSMEM, true);
-		__nvgpu_set_enabled(g, NVGPU_SUPPORT_IO_COHERENCE, true);
-	}
-
 	return 0;
 }