diff --git a/drivers/gpu/nvgpu/common/linux/cde.c b/drivers/gpu/nvgpu/common/linux/cde.c index 5b0fb9102..0c52271a8 100644 --- a/drivers/gpu/nvgpu/common/linux/cde.c +++ b/drivers/gpu/nvgpu/common/linux/cde.c @@ -975,7 +975,7 @@ __releases(&l->cde_app->mutex) u64 big_page_mask = 0; u32 flags; int err, i; - const s32 compbits_kind = 0; + const s16 compbits_kind = 0; gk20a_dbg(gpu_dbg_cde, "compbits_byte_offset=%llu scatterbuffer_byte_offset=%llu", compbits_byte_offset, scatterbuffer_byte_offset); @@ -1038,8 +1038,11 @@ __releases(&l->cde_app->mutex) /* map the destination buffer */ get_dma_buf(compbits_scatter_buf); /* a ref for nvgpu_vm_map */ map_vaddr = nvgpu_vm_map(cde_ctx->vm, compbits_scatter_buf, 0, - NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, - compbits_kind, true, + NVGPU_AS_MAP_BUFFER_FLAGS_CACHEABLE | + NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL, + NV_KIND_INVALID, + compbits_kind, /* incompressible kind */ + true, gk20a_mem_flag_none, map_offset, map_size, NULL); diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_as.c b/drivers/gpu/nvgpu/common/linux/ioctl_as.c index d4242955e..cfc4e7ef5 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_as.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_as.c @@ -79,11 +79,22 @@ static int gk20a_as_ioctl_map_buffer_ex( struct gk20a_as_share *as_share, struct nvgpu_as_map_buffer_ex_args *args) { + s16 compressible_kind; + s16 incompressible_kind; + gk20a_dbg_fn(""); + if (args->flags & NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL) { + compressible_kind = args->compr_kind; + incompressible_kind = args->incompr_kind; + } else { + compressible_kind = args->kind; + incompressible_kind = NV_KIND_INVALID; + } + return nvgpu_vm_map_buffer(as_share->vm, args->dmabuf_fd, &args->offset, args->flags, - args->kind, + compressible_kind, incompressible_kind, args->buffer_offset, args->mapping_size, NULL); @@ -97,6 +108,7 @@ static int gk20a_as_ioctl_map_buffer( return nvgpu_vm_map_buffer(as_share->vm, args->dmabuf_fd, &args->o_a.offset, 
args->flags, NV_KIND_DEFAULT, + NV_KIND_DEFAULT, 0, 0, NULL); /* args->o_a.offset will be set if !err */ } @@ -158,6 +170,9 @@ static int gk20a_as_ioctl_map_buffer_batch( } for (i = 0; i < args->num_maps; ++i) { + s16 compressible_kind; + s16 incompressible_kind; + struct nvgpu_as_map_buffer_ex_args map_args; memset(&map_args, 0, sizeof(map_args)); @@ -167,10 +182,19 @@ static int gk20a_as_ioctl_map_buffer_batch( break; } + if (map_args.flags & + NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL) { + compressible_kind = map_args.compr_kind; + incompressible_kind = map_args.incompr_kind; + } else { + compressible_kind = map_args.kind; + incompressible_kind = NV_KIND_INVALID; + } + err = nvgpu_vm_map_buffer( as_share->vm, map_args.dmabuf_fd, &map_args.offset, map_args.flags, - map_args.kind, + compressible_kind, incompressible_kind, map_args.buffer_offset, map_args.mapping_size, &batch); diff --git a/drivers/gpu/nvgpu/common/linux/vm.c b/drivers/gpu/nvgpu/common/linux/vm.c index 58e2da130..86d8bec9c 100644 --- a/drivers/gpu/nvgpu/common/linux/vm.c +++ b/drivers/gpu/nvgpu/common/linux/vm.c @@ -177,11 +177,46 @@ static u64 __nvgpu_vm_find_mapping(struct vm_gk20a *vm, return mapped_buffer->addr; } +static int setup_bfr_kind_fields(struct buffer_attrs *bfr, s16 compr_kind, + s16 incompr_kind, u32 flags) +{ + if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL) { + /* were we supplied with a kind in either parameter? 
*/ + if ((compr_kind < 0 || compr_kind >= NV_KIND_ATTR_SIZE) && + (incompr_kind < 0 || incompr_kind >= NV_KIND_ATTR_SIZE)) + return -EINVAL; + + if (compr_kind != NV_KIND_INVALID) { + bfr->use_kind_v = true; + bfr->kind_v = (u8)compr_kind; + } + + if (incompr_kind != NV_KIND_INVALID) { + bfr->use_uc_kind_v = true; + bfr->uc_kind_v = (u8)incompr_kind; + } + } else { + if (compr_kind < 0 || compr_kind >= NV_KIND_ATTR_SIZE) + return -EINVAL; + + bfr->use_kind_v = true; + bfr->kind_v = (u8)compr_kind; + + /* + * Note: setup_buffer_kind_and_compression() will + * figure out uc_kind_v or return an error + */ + } + + return 0; +} + u64 nvgpu_vm_map(struct vm_gk20a *vm, struct dma_buf *dmabuf, u64 offset_align, u32 flags, - int kind, + s16 compr_kind, + s16 incompr_kind, bool user_mapped, int rw_flag, u64 buffer_offset, @@ -203,6 +238,22 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, u32 ctag_offset; enum nvgpu_aperture aperture; + /* + * The kind used as part of the key for map caching. HW may + * actually be programmed with the fallback kind in case the + * key kind is compressible but we're out of comptags. 
+ */ + s16 map_key_kind; + + if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL) { + if (compr_kind != NV_KIND_INVALID) + map_key_kind = compr_kind; + else + map_key_kind = incompr_kind; + } else { + map_key_kind = compr_kind; + } + if (user_mapped && vm->userspace_managed && !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET)) { nvgpu_err(g, "non-fixed-offset mapping not available on " @@ -216,7 +267,7 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, if (!vm->userspace_managed) { map_offset = __nvgpu_vm_find_mapping( vm, dmabuf, offset_align, - flags, kind, + flags, map_key_kind, user_mapped, rw_flag); if (map_offset) { nvgpu_mutex_release(&vm->update_gmmu_lock); @@ -239,12 +290,10 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, goto clean_up; } - if (kind >= NV_KIND_ATTR_SIZE) { - err = -EINVAL; + err = setup_bfr_kind_fields(&bfr, compr_kind, incompr_kind, flags); + if (err) goto clean_up; - } else { - bfr.kind_v = (u8)kind; - } + bfr.size = dmabuf->size; sgl = bfr.sgt->sgl; @@ -306,10 +355,15 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, err = gk20a_alloc_comptags(g, dev, dmabuf, ctag_allocator, bfr.ctag_lines); - if (err) { - /* ok to fall back here if we ran out */ + if (unlikely(err)) { /* TBD: we can partially alloc ctags as well... */ - bfr.kind_v = bfr.uc_kind_v; + if (bfr.use_uc_kind_v) { + /* no comptags, but fallback kind available */ + bfr.kind_v = bfr.uc_kind_v; + } else { + nvgpu_err(g, "comptag alloc failed and no fallback kind specified"); + goto clean_up; + } } else { gk20a_get_comptags(dev, dmabuf, &comptags); @@ -371,7 +425,7 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, mapped_buffer->ctag_allocated_lines = bfr.ctag_allocated_lines; mapped_buffer->vm = vm; mapped_buffer->flags = flags; - mapped_buffer->kind = kind; + mapped_buffer->kind = map_key_kind; mapped_buffer->va_allocated = va_allocated; mapped_buffer->user_mapped = user_mapped ? 
1 : 0; mapped_buffer->own_mem_ref = user_mapped; diff --git a/drivers/gpu/nvgpu/common/linux/vm_priv.h b/drivers/gpu/nvgpu/common/linux/vm_priv.h index 4f6b10bbd..1eadf1d0e 100644 --- a/drivers/gpu/nvgpu/common/linux/vm_priv.h +++ b/drivers/gpu/nvgpu/common/linux/vm_priv.h @@ -34,7 +34,9 @@ struct buffer_attrs { u32 ctag_allocated_lines; int pgsz_idx; u8 kind_v; + bool use_kind_v; u8 uc_kind_v; + bool use_uc_kind_v; bool ctag_user_mappable; }; @@ -42,19 +44,43 @@ u64 nvgpu_vm_map(struct vm_gk20a *vm, struct dma_buf *dmabuf, u64 offset_align, u32 flags, - int kind, + + /* + * compressible kind if + * NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL is + * specified, otherwise just the kind + */ + s16 compr_kind, + + /* + * incompressible kind if + * NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL is + * specified, otherwise ignored + */ + s16 incompr_kind, + bool user_mapped, int rw_flag, u64 buffer_offset, u64 mapping_size, struct vm_gk20a_mapping_batch *mapping_batch); -/* Note: batch may be NULL if map op is not part of a batch */ +/* + * Notes: + * - Batch may be NULL if map op is not part of a batch. + * - If NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL is set, + * compr_kind and incompr_kind work as explained in nvgpu.h. + * - If NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL is NOT set, + * compr_kind holds the kind and kernel will figure out whether + * it is a compressible or incompressible kind. If compressible, kernel will + * also figure out the incompressible counterpart or return an error. 
+ */ int nvgpu_vm_map_buffer(struct vm_gk20a *vm, int dmabuf_fd, u64 *offset_align, u32 flags, /* NVGPU_AS_MAP_BUFFER_FLAGS_ */ - int kind, + s16 compr_kind, + s16 incompr_kind, u64 buffer_offset, u64 mapping_size, struct vm_gk20a_mapping_batch *batch); diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c index 19433df9a..2b37a62af 100644 --- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c @@ -1929,6 +1929,7 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s, 0, 0, 0, + 0, args->mapping_size, NULL); if (err) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 455fa2382..e9948c161 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -411,7 +411,8 @@ int gk20a_init_gpu_characteristics(struct gk20a *g) gpu->available_big_page_sizes |= g->ops.mm.get_big_page_sizes(); } - gpu->flags = NVGPU_GPU_FLAGS_SUPPORT_PARTIAL_MAPPINGS; + gpu->flags = NVGPU_GPU_FLAGS_SUPPORT_PARTIAL_MAPPINGS | + NVGPU_GPU_FLAGS_SUPPORT_MAP_DIRECT_KIND_CTRL; if (IS_ENABLED(CONFIG_SYNC)) gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_SYNC_FENCE_FDS; diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index a6507d2dc..97b7aa800 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -822,14 +822,11 @@ static void nvgpu_vm_unmap_user(struct vm_gk20a *vm, u64 offset, nvgpu_mutex_release(&vm->update_gmmu_lock); } -int setup_buffer_kind_and_compression(struct vm_gk20a *vm, - u32 flags, - struct buffer_attrs *bfr, - enum gmmu_pgsz_gk20a pgsz_idx) +static int setup_kind_legacy(struct vm_gk20a *vm, struct buffer_attrs *bfr, + bool *pkind_compressible) { - bool kind_compressible; struct gk20a *g = gk20a_from_vm(vm); - int ctag_granularity = g->ops.fb.compression_page_size(g); + bool kind_compressible; if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v())) bfr->kind_v = 
gmmu_pte_kind_pitch_v(); @@ -840,7 +837,7 @@ int setup_buffer_kind_and_compression(struct vm_gk20a *vm, } bfr->uc_kind_v = gmmu_pte_kind_invalid_v(); - /* find a suitable uncompressed kind if it becomes necessary later */ + /* find a suitable incompressible kind if it becomes necessary later */ kind_compressible = gk20a_kind_is_compressible(bfr->kind_v); if (kind_compressible) { bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v); @@ -852,6 +849,36 @@ int setup_buffer_kind_and_compression(struct vm_gk20a *vm, return -EINVAL; } } + + *pkind_compressible = kind_compressible; + return 0; +} + +int setup_buffer_kind_and_compression(struct vm_gk20a *vm, + u32 flags, + struct buffer_attrs *bfr, + enum gmmu_pgsz_gk20a pgsz_idx) +{ + bool kind_compressible; + struct gk20a *g = gk20a_from_vm(vm); + int ctag_granularity = g->ops.fb.compression_page_size(g); + + if (!bfr->use_kind_v) + bfr->kind_v = gmmu_pte_kind_invalid_v(); + if (!bfr->use_uc_kind_v) + bfr->uc_kind_v = gmmu_pte_kind_invalid_v(); + + if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL) { + kind_compressible = (bfr->kind_v != gmmu_pte_kind_invalid_v()); + if (!kind_compressible) + bfr->kind_v = bfr->uc_kind_v; + } else { + int err = setup_kind_legacy(vm, bfr, &kind_compressible); + + if (err) + return err; + } + /* comptags only supported for suitable kinds, 128KB pagesize */ if (kind_compressible && vm->gmmu_page_sizes[pgsz_idx] < g->ops.fb.compressible_page_size(g)) { @@ -865,6 +892,9 @@ int setup_buffer_kind_and_compression(struct vm_gk20a *vm, else bfr->ctag_lines = 0; + bfr->use_kind_v = (bfr->kind_v != gmmu_pte_kind_invalid_v()); + bfr->use_uc_kind_v = (bfr->uc_kind_v != gmmu_pte_kind_invalid_v()); + return 0; } @@ -1649,7 +1679,8 @@ int nvgpu_vm_map_buffer(struct vm_gk20a *vm, int dmabuf_fd, u64 *offset_align, u32 flags, /*NVGPU_AS_MAP_BUFFER_FLAGS_*/ - int kind, + s16 compr_kind, + s16 incompr_kind, u64 buffer_offset, u64 mapping_size, struct vm_gk20a_mapping_batch *batch) @@ -1690,7 +1721,7 
@@ int nvgpu_vm_map_buffer(struct vm_gk20a *vm, } ret_va = nvgpu_vm_map(vm, dmabuf, *offset_align, - flags, kind, true, + flags, compr_kind, incompr_kind, true, gk20a_mem_flag_none, buffer_offset, mapping_size, diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index 5b1d606a5..9c883a93b 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h @@ -146,6 +146,9 @@ struct nvgpu_gpu_zbc_query_table_args { #define NVGPU_GPU_FLAGS_SUPPORT_IO_COHERENCE (1ULL << 20) /* NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST is available */ #define NVGPU_GPU_FLAGS_SUPPORT_RESCHEDULE_RUNLIST (1ULL << 21) +/* Direct PTE kind control is supported (map_buffer_ex) */ +#define NVGPU_GPU_FLAGS_SUPPORT_MAP_DIRECT_KIND_CTRL (1ULL << 23) + struct nvgpu_gpu_characteristics { __u32 arch; @@ -1751,6 +1754,7 @@ struct nvgpu_as_map_buffer_args { #define NVGPU_AS_MAP_BUFFER_FLAGS_IO_COHERENT (1 << 4) #define NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE (1 << 5) #define NVGPU_AS_MAP_BUFFER_FLAGS_MAPPABLE_COMPBITS (1 << 6) +#define NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL (1 << 8) __u32 reserved; /* in */ __u32 dmabuf_fd; /* in */ __u32 page_size; /* inout, 0:= best fit to buffer */ @@ -1760,7 +1764,7 @@ struct nvgpu_as_map_buffer_args { } o_a; }; - /* +/* * Mapping dmabuf fds into an address space: * * The caller requests a mapping to a particular page 'kind'. @@ -1772,7 +1776,37 @@ struct nvgpu_as_map_buffer_args { struct nvgpu_as_map_buffer_ex_args { __u32 flags; /* in/out */ #define NV_KIND_DEFAULT -1 - __s32 kind; /* in (-1 represents default) */ + union { + /* + * Used if NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL + * is not set. + */ + __s32 kind; /* in (-1 represents default) */ + + /* + * If NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL is + * set, this is used, instead. The rules are: + * + * - If both compr_kind and incompr_kind are set + * (i.e., value is other than NV_KIND_INVALID), + * kernel attempts to use compr_kind first. 
+ * + * - If compr_kind is set, kernel attempts to allocate + * comptags for the buffer. If successful, + * compr_kind is used as the PTE kind. + * + * - If incompr_kind is set, kernel uses incompr_kind + * as the PTE kind. Comptags are not allocated. + * + * - If neither compr_kind nor incompr_kind is set, the + * map call will fail. + */ +#define NV_KIND_INVALID -1 + struct { + __s16 compr_kind; + __s16 incompr_kind; + }; + }; __u32 dmabuf_fd; /* in */ __u32 page_size; /* inout, 0:= best fit to buffer */