diff --git a/drivers/gpu/nvgpu/common/mm/gmmu/page_table.c b/drivers/gpu/nvgpu/common/mm/gmmu/page_table.c
index 583c27694..ca77f00de 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu/page_table.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/page_table.c
@@ -197,13 +197,18 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
 	 *
 	 * Currently PAGE_SIZE is used, even when 64K, to work around an issue
 	 * with the PDB TLB invalidate code not being pd_cache aware yet.
+	 *
+	 * Similarly, we can't use nvgpu_pd_alloc() here, because the top-level
+	 * PD must have a mem_offs of 0 for the invalidate code to work, so we
+	 * can't use the PD cache.
 	 */
 	pdb_size = ALIGN(pd_get_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
 
-	err = nvgpu_pd_alloc(vm, &vm->pdb, pdb_size);
+	err = nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
 	if (err != 0) {
 		return err;
 	}
+	vm->pdb.pd_size = pdb_size;
 
 	/*
 	 * One nvgpu_mb() is done after all mapping operations. Don't need
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache.c b/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache.c
index f1681a3d6..33272d4b5 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache.c
@@ -62,7 +62,8 @@ static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
 {
 	BUG_ON(pentry->pd_size == 0);
 
-	return PAGE_SIZE / pentry->pd_size;
+	return (nvgpu_safe_cast_u64_to_u32(NVGPU_PD_CACHE_SIZE)) /
+		pentry->pd_size;
 }
 
 /*
@@ -155,7 +156,7 @@ void nvgpu_pd_cache_fini(struct gk20a *g)
  * Note: this does not need the cache lock since it does not modify any of the
  * PD cache data structures.
  */
-static int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
 				       struct nvgpu_gmmu_pd *pd, u32 bytes)
 {
 	int err;
@@ -206,6 +207,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 				    u32 bytes)
 {
 	struct nvgpu_pd_mem_entry *pentry;
+	u64 flags = 0UL;
+	int32_t err;
 
 	pd_dbg(g, "PD-Alloc [C] New: offs=0");
 
@@ -215,8 +218,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 		return -ENOMEM;
 	}
 
-	if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem) != 0) {
+	if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
+		flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
+	}
+
+	err = nvgpu_dma_alloc_flags(g, flags,
+				    NVGPU_PD_CACHE_SIZE, &pentry->mem);
+	if (err != 0) {
 		nvgpu_kfree(g, pentry);
+
+		/* Not enough contiguous space, but a direct
+		 * allocation may work.
+		 */
+		if (err == -ENOMEM) {
+			return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+		}
 		nvgpu_err(g, "Unable to DMA alloc!");
 		return -ENOMEM;
 	}
@@ -330,7 +346,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
 		return -EINVAL;
 	}
 
-	nvgpu_assert(bytes < PAGE_SIZE);
+	nvgpu_assert(bytes < NVGPU_PD_CACHE_SIZE);
 
 	pentry = nvgpu_pd_cache_get_partial(cache, bytes);
 	if (pentry == NULL) {
@@ -360,7 +376,7 @@ int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
 	 * Simple case: PD is bigger than a page so just do a regular DMA
 	 * alloc.
 	 */
-	if (bytes >= PAGE_SIZE) {
+	if (bytes >= NVGPU_PD_CACHE_SIZE) {
 		err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
 		if (err != 0) {
 			return err;
@@ -424,7 +440,21 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
 	/*
 	 * Partially full still. If it was already on the partial list
 	 * this just re-adds it.
+	 *
+	 * Since the memory used for the entries is still mapped, on iGPU
+	 * make sure the entries are invalidated so that the hw doesn't
+	 * accidentally try to prefetch non-existent fb memory.
+	 *
+	 * Since IOMMU prefetching of invalid pd entries causes an IOMMU
+	 * fault, fill them with zeros.
 	 */
+	if ((nvgpu_iommuable(g)) &&
+	    (NVGPU_PD_CACHE_SIZE > PAGE_SIZE) &&
+	    (pd->mem->cpu_va != NULL)) {
+		(void)memset(((u8 *)pd->mem->cpu_va + pd->mem_offs), 0,
+			pd->pd_size);
+	}
+
 	nvgpu_list_del(&pentry->list_entry);
 	nvgpu_list_add(&pentry->list_entry,
 		&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache_priv.h b/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache_priv.h
index 2e2d6562c..742b2f9dd 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache_priv.h
+++ b/drivers/gpu/nvgpu/common/mm/gmmu/pd_cache_priv.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -80,23 +80,37 @@
  * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
  * structure is of course depending on this.
  */
-#define NVGPU_PD_CACHE_MIN		256U
+#define NVGPU_PD_CACHE_MIN		256UL
 /**
  * MIN_SHIFT is the right number of bits to shift to determine
  * which list to use in the array of lists.
  */
-#define NVGPU_PD_CACHE_MIN_SHIFT	9U
+#define NVGPU_PD_CACHE_MIN_SHIFT	9UL
+
 /**
- * Maximum PD cache count. This value varies depending on PAGE_SIZE.
+ * Maximum PD cache count. This specifies the number of slabs; since each
+ * slab represents a PO2 increase in size, a count of 8 leads to:
+ *
+ *   NVGPU_PD_CACHE_SIZE = 256B * 2^8 = 64KB
+ *
+ * For Linux with 4K pages, if the cache size is larger than 4KB then we
+ * need to allocate from CMA. This puts a lot of pressure on the CMA space.
+ * For kernels with a PAGE_SIZE of 64K this isn't the case, so allow the
+ * PD cache size to be 64K when PAGE_SIZE > 4K (i.e. PAGE_SIZE == 64K).
  */
-#if PAGE_SIZE == 4096
-#define NVGPU_PD_CACHE_COUNT		4U
-#elif PAGE_SIZE == 65536
-#define NVGPU_PD_CACHE_COUNT		8U
+#ifdef __KERNEL__
+# if PAGE_SIZE > 4096
+#  define NVGPU_PD_CACHE_COUNT		8UL
+# else
+#  define NVGPU_PD_CACHE_COUNT		4UL
+# endif
 #else
-#error "Unsupported page size."
+#define NVGPU_PD_CACHE_COUNT		8UL
 #endif
 
+#define NVGPU_PD_CACHE_SIZE		(NVGPU_PD_CACHE_MIN * \
+					 (1UL << NVGPU_PD_CACHE_COUNT))
+
 /**
  * This structure describes a slab within the slab allocator.
  */
@@ -115,7 +129,7 @@ struct nvgpu_pd_mem_entry {
 	 * The size of mem will always
 	 * be one page. pd_size will always be a power of 2.
 	 */
-	DECLARE_BITMAP(alloc_map, PAGE_SIZE / NVGPU_PD_CACHE_MIN);
+	DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
 	/**
 	 * Total number of allocations in this PD.
 	 */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/pd_cache.h b/drivers/gpu/nvgpu/include/nvgpu/pd_cache.h
index 1426db648..16b166775 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/pd_cache.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/pd_cache.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -189,4 +189,6 @@ void nvgpu_pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
  */
 u64 nvgpu_pd_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
 
+int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+		struct nvgpu_gmmu_pd *pd, u32 bytes);
 #endif
diff --git a/drivers/gpu/nvgpu/os/linux/linux-dma.c b/drivers/gpu/nvgpu/os/linux/linux-dma.c
index 83e238d83..8eebb6f7f 100644
--- a/drivers/gpu/nvgpu/os/linux/linux-dma.c
+++ b/drivers/gpu/nvgpu/os/linux/linux-dma.c
@@ -111,7 +111,7 @@ static void nvgpu_dma_print_err(struct gk20a *g, size_t size,
 
 	nvgpu_dma_flags_to_str(g, flags, flags_str);
 
-	nvgpu_err(g,
+	nvgpu_info(g,
 		"DMA %s FAILED: [%s] size=%-7zu "
 		"aligned=%-7zu flags:%s",
 		what, type,
diff --git a/userspace/units/acr/nvgpu-acr.c b/userspace/units/acr/nvgpu-acr.c
index edeb8a6d9..6b3ddf5ce 100644
--- a/userspace/units/acr/nvgpu-acr.c
+++ b/userspace/units/acr/nvgpu-acr.c
@@ -308,6 +308,11 @@ static int init_test_env(struct unit_module *m, struct gk20a *g)
 		unit_return_fail(m, "ecc init failed\n");
 	}
 
+	err = nvgpu_pd_cache_init(g);
+	if (err != 0) {
+		unit_return_fail(m, "failed to init pd cache");
+	}
+
 	err = g->ops.mm.init_mm_support(g);
 	if (err != 0) {
 		unit_return_fail(m, "failed to init gk20a mm");
diff --git a/userspace/units/fifo/nvgpu-fifo-common.c b/userspace/units/fifo/nvgpu-fifo-common.c
index fe9327359..81c604ec1 100644
--- a/userspace/units/fifo/nvgpu-fifo-common.c
+++ b/userspace/units/fifo/nvgpu-fifo-common.c
@@ -174,6 +174,10 @@ int test_fifo_init_support(struct unit_module *m, struct gk20a *g, void *args)
 	g->ops.userd.setup_sw = stub_userd_setup_sw;
 #endif
 	g->ops.ecc.ecc_init_support(g);
+
+	/* PD cache must be initialized prior to mm init */
+	err = nvgpu_pd_cache_init(g);
+
 	g->ops.mm.init_mm_support(g);
 
 	nvgpu_device_init(g);
diff --git a/userspace/units/gr/nvgpu-gr.c b/userspace/units/gr/nvgpu-gr.c
index 2f1de6383..efbef32e1 100644
--- a/userspace/units/gr/nvgpu-gr.c
+++ b/userspace/units/gr/nvgpu-gr.c
@@ -84,6 +84,12 @@ int test_gr_init_setup(struct unit_module *m, struct gk20a *g, void *args)
 		return -ENOMEM;
 	}
 
+	err = nvgpu_pd_cache_init(g);
+	if (err != 0) {
+		unit_err(m, "PD cache initialization failed\n");
+		return -ENOMEM;
+	}
+
 	return UNIT_SUCCESS;
 
 fail:
diff --git a/userspace/units/mm/as/as.c b/userspace/units/mm/as/as.c
index a14ce17e3..8cd7239de 100644
--- a/userspace/units/mm/as/as.c
+++ b/userspace/units/mm/as/as.c
@@ -158,6 +158,11 @@ int test_init_mm(struct unit_module *m, struct gk20a *g, void *args)
 	g->ops.fb.intr.enable = gv11b_fb_intr_enable;
 	g->ops.fb.ecc.init = NULL;
 
+	err = nvgpu_pd_cache_init(g);
+	if (err != 0) {
+		unit_return_fail(m, "pd cache initialization failed\n");
+	}
+
 	err = nvgpu_init_mm_support(g);
 	if (err != 0) {
 		unit_return_fail(m, "nvgpu_init_mm_support failed err=%d\n",
diff --git a/userspace/units/mm/dma/dma.c b/userspace/units/mm/dma/dma.c
index beb1f92a1..3485f73cc 100644
--- a/userspace/units/mm/dma/dma.c
+++ b/userspace/units/mm/dma/dma.c
@@ -204,6 +204,10 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
 		unit_return_fail(m, "nvgpu_vm_init failed\n");
 	}
 
+	if (nvgpu_pd_cache_init(g) != 0) {
+		unit_return_fail(m, "pd cache initialization failed\n");
+	}
+
 	return UNIT_SUCCESS;
 }
 
diff --git a/userspace/units/mm/gmmu/page_table/page_table.c b/userspace/units/mm/gmmu/page_table/page_table.c
index 41adb4349..fae1d7a8a 100644
--- a/userspace/units/mm/gmmu/page_table/page_table.c
+++ b/userspace/units/mm/gmmu/page_table/page_table.c
@@ -383,6 +383,10 @@ int test_nvgpu_gmmu_init(struct unit_module *m, struct gk20a *g, void *args)
 
 	init_platform(m, g, true);
 
+	if (nvgpu_pd_cache_init(g) != 0) {
+		unit_return_fail(m, "PD cache initialization failed\n");
+	}
+
 	if (init_mm(m, g) != 0) {
 		unit_return_fail(m, "nvgpu_init_mm_support failed\n");
 	}
diff --git a/userspace/units/mm/gmmu/pd_cache/pd_cache.c b/userspace/units/mm/gmmu/pd_cache/pd_cache.c
index de067e4a0..9be380470 100644
--- a/userspace/units/mm/gmmu/pd_cache/pd_cache.c
+++ b/userspace/units/mm/gmmu/pd_cache/pd_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -321,7 +321,7 @@ int test_pd_free_empty_pd(struct unit_module *m, struct gk20a *g,
 
 	/* And now direct frees. */
 	memset(&pd, 0U, sizeof(pd));
-	err = nvgpu_pd_alloc(&vm, &pd, PAGE_SIZE);
+	err = nvgpu_pd_alloc(&vm, &pd, NVGPU_PD_CACHE_SIZE);
 	if (err != 0) {
 		unit_return_fail(m, "PD alloc failed");
 	}
@@ -610,7 +610,7 @@ static int do_test_pd_cache_packing_size(struct unit_module *m, struct gk20a *g,
 {
 	int err;
 	u32 i;
-	u32 n = PAGE_SIZE / pd_size;
+	u32 n = NVGPU_PD_CACHE_SIZE / pd_size;
 	struct nvgpu_gmmu_pd pds[n], pd;
 	struct nvgpu_posix_fault_inj *dma_fi =
 		nvgpu_dma_alloc_get_fault_injection();
@@ -667,7 +667,7 @@ static int do_test_pd_reusability(struct unit_module *m, struct gk20a *g,
 {
 	int err = UNIT_SUCCESS;
 	u32 i;
-	u32 n = PAGE_SIZE / pd_size;
+	u32 n = NVGPU_PD_CACHE_SIZE / pd_size;
 	struct nvgpu_gmmu_pd pds[n];
 	struct nvgpu_posix_fault_inj *dma_fi =
 		nvgpu_dma_alloc_get_fault_injection();
diff --git a/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c b/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c
index dd1dc2151..42980f291 100644
--- a/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c
+++ b/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c
@@ -126,6 +126,7 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
 	u64 low_hole, aperture_size;
 	struct nvgpu_os_posix *p = nvgpu_os_posix_from_gk20a(g);
 	struct mm_gk20a *mm = &g->mm;
+	int err;
 
 	p->mm_is_iommuable = true;
 
@@ -193,6 +194,11 @@ static int init_mm(struct unit_module *m, struct gk20a *g)
 		unit_return_fail(m, "'bar2' nvgpu_vm_init failed\n");
 	}
 
+	err = nvgpu_pd_cache_init(g);
+	if (err != 0) {
+		unit_return_fail(m, "PD cache init failed\n");
+	}
+
 	/*
 	 * This initialization will make sure that correct aperture mask
 	 * is returned */
diff --git a/userspace/units/pmu/nvgpu-pmu.c b/userspace/units/pmu/nvgpu-pmu.c
index f99c81d10..b757e45ce 100644
--- a/userspace/units/pmu/nvgpu-pmu.c
+++ b/userspace/units/pmu/nvgpu-pmu.c
@@ -229,6 +229,12 @@ static int init_pmu_falcon_test_env(struct unit_module *m, struct gk20a *g)
 		return -ENOMEM;
 	}
 
+	err = nvgpu_pd_cache_init(g);
+	if (err != 0) {
+		unit_err(m, "PD cache allocation failed!\n");
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
diff --git a/userspace/units/rc/nvgpu-rc.c b/userspace/units/rc/nvgpu-rc.c
index c41faa70c..6fe968073 100644
--- a/userspace/units/rc/nvgpu-rc.c
+++ b/userspace/units/rc/nvgpu-rc.c
@@ -89,6 +89,11 @@ int test_rc_init(struct unit_module *m, struct gk20a *g, void *args)
 		unit_return_fail(m, "fifo reg_space failure");
 	}
 
+	ret = nvgpu_pd_cache_init(g);
+	if (ret != 0) {
+		unit_return_fail(m, "PD cache initialization failure");
+	}
+
 	nvgpu_device_init(g);
 
 	g->ops.gr.init.get_no_of_sm = stub_gv11b_gr_init_get_no_of_sm;
diff --git a/userspace/units/sync/nvgpu-sync.c b/userspace/units/sync/nvgpu-sync.c
index df1542e20..59437d448 100644
--- a/userspace/units/sync/nvgpu-sync.c
+++ b/userspace/units/sync/nvgpu-sync.c
@@ -98,6 +98,10 @@ static int init_channel_vm(struct unit_module *m, struct nvgpu_channel *ch)
 
 	ch->vm = mm->pmu.vm;
 
+	if (nvgpu_pd_cache_init(g) != 0) {
+		unit_return_fail(m, "pd cache initialization failed\n");
+	}
+
 	return UNIT_SUCCESS;
 }
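
A quick standalone illustration (not part of the patch) of the slab sizing the new NVGPU_PD_CACHE_SIZE macro encodes: with NVGPU_PD_CACHE_MIN = 256 bytes and NVGPU_PD_CACHE_COUNT = 8, one slab is 256B * 2^8 = 64KB, and a power-of-two sized PD packs NVGPU_PD_CACHE_SIZE / pd_size entries per slab, which is what nvgpu_pd_cache_get_nr_entries now returns. The macro names below mirror the patch; the rest of the program is illustrative only and builds with any C compiler.

/*
 * Standalone sketch of the PD cache slab arithmetic; not nvgpu code.
 * The three macros mirror the values the patch selects for the 64KB case.
 */
#include <stdio.h>

#define NVGPU_PD_CACHE_MIN	256UL
#define NVGPU_PD_CACHE_COUNT	8UL
#define NVGPU_PD_CACHE_SIZE	(NVGPU_PD_CACHE_MIN * (1UL << NVGPU_PD_CACHE_COUNT))

int main(void)
{
	unsigned long pd_size;

	/* 256 * 2^8 = 65536 bytes per slab. */
	printf("PD cache slab size: %lu bytes\n", NVGPU_PD_CACHE_SIZE);

	/* Each slab holds NVGPU_PD_CACHE_SIZE / pd_size sub-allocations. */
	for (pd_size = NVGPU_PD_CACHE_MIN; pd_size < NVGPU_PD_CACHE_SIZE;
	     pd_size <<= 1UL) {
		printf("pd_size %6lu -> %3lu entries per slab\n",
		       pd_size, NVGPU_PD_CACHE_SIZE / pd_size);
	}

	return 0;
}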