diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml index 240d2ede8..1cf9395e7 100644 --- a/arch/nvgpu-common.yaml +++ b/arch/nvgpu-common.yaml @@ -446,6 +446,12 @@ fifo: include/nvgpu/gops/tsg.h, include/nvgpu/tsg.h ] deps: [ ] + tsg_subctx: + safe: yes + sources: [ common/fifo/tsg_subctx.c, + common/fifo/tsg_subctx_priv.h, + include/nvgpu/tsg_subctx.h ] + deps: [ ] submit: safe: yes sources: [ common/fifo/submit.c, diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 718f910f2..60b1f02cb 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -647,6 +647,7 @@ nvgpu-y += \ common/fifo/job.o \ common/fifo/priv_cmdbuf.o \ common/fifo/tsg.o \ + common/fifo/tsg_subctx.o \ common/fifo/runlist.o \ common/fifo/engine_status.o \ common/fifo/engines.o \ diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index eaf74ace9..37e2a3040 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -155,6 +155,7 @@ srcs += common/device.c \ common/fifo/fifo.c \ common/fifo/pbdma.c \ common/fifo/tsg.c \ + common/fifo/tsg_subctx.c \ common/fifo/runlist.c \ common/fifo/engine_status.c \ common/fifo/engines.c \ diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c index 113e924ee..3051135fd 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c @@ -983,11 +983,6 @@ static void channel_free(struct nvgpu_channel *ch, bool force) g->ops.gr.fecs_trace.unbind_channel(g, &ch->inst_block); #endif - if (g->ops.gr.setup.free_subctx != NULL) { - g->ops.gr.setup.free_subctx(ch); - ch->subctx = NULL; - } - g->ops.gr.intr.flush_channel_tlb(g); if (ch->usermode_submit_enabled) { @@ -1803,6 +1798,7 @@ int nvgpu_channel_init_support(struct gk20a *g, u32 chid) nvgpu_mutex_init(&c->dbg_s_lock); #endif nvgpu_init_list_node(&c->ch_entry); + nvgpu_init_list_node(&c->subctx_entry); nvgpu_list_add(&c->free_chs, &g->fifo.free_chs); return 0; diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c index 40e30191e..357292631 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg.c +++ b/drivers/gpu/nvgpu/common/fifo/tsg.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -142,6 +143,13 @@ int nvgpu_tsg_bind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch) } nvgpu_rwsem_down_write(&tsg->ch_list_lock); + err = nvgpu_tsg_subctx_bind_channel(tsg, ch); + if (err != 0) { + nvgpu_err(g, "Subcontext %u bind failed", ch->subctx_id); + nvgpu_rwsem_up_write(&tsg->ch_list_lock); + return err; + } + nvgpu_list_add_tail(&ch->ch_entry, &tsg->ch_list); tsg->ch_count = nvgpu_safe_add_u32(tsg->ch_count, 1U); ch->tsgid = tsg->tsgid; @@ -284,8 +292,15 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg, } #endif - /* Remove channel from TSG and re-enable rest of the channels */ + /** + * Remove channel from TSG and re-enable rest of the channels. + * Since channel removal can lead to subctx removal and/or + * VM mappings removal, acquire ctx_init_lock. 
+ */ + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + nvgpu_rwsem_down_write(&tsg->ch_list_lock); + nvgpu_tsg_subctx_unbind_channel(tsg, ch); nvgpu_list_del(&ch->ch_entry); tsg->ch_count = nvgpu_safe_sub_u32(tsg->ch_count, 1U); ch->tsgid = NVGPU_INVALID_TSG_ID; @@ -296,6 +311,8 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg, g->ops.channel.disable(ch); nvgpu_rwsem_up_write(&tsg->ch_list_lock); + nvgpu_mutex_release(&tsg->ctx_init_lock); + /* * Don't re-enable all channels if TSG has timed out already * @@ -396,12 +413,17 @@ fail_common: } #endif + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + nvgpu_rwsem_down_write(&tsg->ch_list_lock); + nvgpu_tsg_subctx_unbind_channel(tsg, ch); nvgpu_list_del(&ch->ch_entry); ch->tsgid = NVGPU_INVALID_TSG_ID; tsg->ch_count = nvgpu_safe_sub_u32(tsg->ch_count, 1U); nvgpu_rwsem_up_write(&tsg->ch_list_lock); + nvgpu_mutex_release(&tsg->ctx_init_lock); + nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release); return err; @@ -512,6 +534,8 @@ static void nvgpu_tsg_init_support(struct gk20a *g, u32 tsgid) tsg->abortable = true; nvgpu_init_list_node(&tsg->ch_list); + nvgpu_init_list_node(&tsg->subctx_list); + nvgpu_init_list_node(&tsg->gr_ctx_mappings_list); nvgpu_rwsem_init(&tsg->ch_list_lock); nvgpu_mutex_init(&tsg->ctx_init_lock); @@ -869,7 +893,6 @@ int nvgpu_tsg_open_common(struct gk20a *g, struct nvgpu_tsg *tsg, pid_t pid) tsg->ch_count = 0U; nvgpu_ref_init(&tsg->refcount); - tsg->vm = NULL; tsg->interleave_level = NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW; tsg->timeslice_us = g->ops.tsg.default_timeslice_us(g); tsg->runlist = NULL; @@ -963,11 +986,6 @@ void nvgpu_tsg_release_common(struct gk20a *g, struct nvgpu_tsg *tsg) tsg->rl_domain = NULL; } - if (tsg->vm != NULL) { - nvgpu_vm_put(tsg->vm); - tsg->vm = NULL; - } - if(tsg->sm_error_states != NULL) { nvgpu_kfree(g, tsg->sm_error_states); tsg->sm_error_states = NULL; diff --git a/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c b/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c new file mode 100644 index 000000000..4864ccf72 --- /dev/null +++ b/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tsg_subctx_priv.h" + +static inline struct nvgpu_tsg_subctx * +nvgpu_tsg_subctx_from_tsg_entry(struct nvgpu_list_node *node) +{ + return (struct nvgpu_tsg_subctx *) + ((uintptr_t)node - offsetof(struct nvgpu_tsg_subctx, tsg_entry)); +}; + +static struct nvgpu_tsg_subctx *nvgpu_tsg_subctx_from_id(struct nvgpu_tsg *tsg, + u32 subctx_id) +{ + struct nvgpu_tsg_subctx *subctx = NULL; + + nvgpu_list_for_each_entry(subctx, &tsg->subctx_list, + nvgpu_tsg_subctx, tsg_entry) { + if (subctx->subctx_id == subctx_id) { + return subctx; + } + } + + return NULL; +} + +int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch) +{ + struct nvgpu_tsg_subctx *subctx = NULL; + struct gk20a *g = tsg->g; + + nvgpu_log(g, gpu_dbg_gr, " "); + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return 0; + } + + subctx = nvgpu_tsg_subctx_from_id(tsg, ch->subctx_id); + if (subctx != NULL) { + if (subctx->vm != ch->vm) { + nvgpu_err(g, "subctx vm mismatch"); + return -EINVAL; + } + + goto add_ch_subctx; + } + + nvgpu_log(g, gpu_dbg_gr, "Allocating subctx %u", ch->subctx_id); + + subctx = nvgpu_kzalloc(g, sizeof(struct nvgpu_tsg_subctx)); + if (subctx == NULL) { + nvgpu_err(g, "Failed to allocate subctx"); + return -ENOMEM; + } + + subctx->subctx_id = ch->subctx_id; + subctx->tsg = tsg; + subctx->vm = ch->vm; + nvgpu_init_list_node(&subctx->ch_list); + nvgpu_init_list_node(&subctx->tsg_entry); + + nvgpu_list_add_tail(&subctx->tsg_entry, &tsg->subctx_list); + +add_ch_subctx: + ch->subctx = subctx; + nvgpu_list_add_tail(&ch->subctx_entry, &subctx->ch_list); + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return 0; +} + +void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch) +{ + struct nvgpu_tsg_subctx *subctx; + struct gk20a *g = tsg->g; + + nvgpu_log(g, gpu_dbg_gr, " "); + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return; + } + + subctx = ch->subctx; + nvgpu_assert(subctx != NULL); + + nvgpu_list_del(&ch->subctx_entry); + + if (nvgpu_list_empty(&subctx->ch_list)) { + if (g->ops.gr.setup.free_subctx != NULL) { + g->ops.gr.setup.free_subctx(ch); + subctx->gr_subctx = NULL; + } + + nvgpu_list_del(&subctx->tsg_entry); + nvgpu_kfree(tsg->g, subctx); + } + + ch->subctx = NULL; + + nvgpu_log(g, gpu_dbg_gr, "done"); +} + +int nvgpu_tsg_subctx_alloc_gr_subctx(struct gk20a *g, struct nvgpu_channel *ch) +{ + struct nvgpu_tsg_subctx *subctx; + + nvgpu_log(g, gpu_dbg_gr, " "); + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return 0; + } + + subctx = ch->subctx; + if (subctx == NULL) { + nvgpu_err(g, "channel not bound to TSG subctx"); + return -EINVAL; + } + + if (subctx->gr_subctx == NULL) { + subctx->gr_subctx = nvgpu_gr_subctx_alloc(g); + if (subctx->gr_subctx == NULL) { + nvgpu_err(g, "gr_subctx alloc failed"); + return -ENOMEM; + } + } + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return 0; +} + +int nvgpu_tsg_subctx_setup_subctx_header(struct gk20a *g, + struct nvgpu_channel *ch) +{ + struct nvgpu_tsg_subctx *subctx; + int err; + + nvgpu_log(g, gpu_dbg_gr, " "); + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return 0; + } + + subctx = ch->subctx; + if ((subctx == NULL) || (subctx->gr_subctx == NULL)) { + nvgpu_err(g, "channel not bound to TSG/GR subctx"); + return -EINVAL; + } + + err = nvgpu_gr_subctx_setup_header(g, subctx->gr_subctx, subctx->vm); + if (err != 0) { + nvgpu_err(g, 
"gr_subctx header setup failed %d", err); + return err; + } + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return 0; +} + +struct nvgpu_gr_subctx *nvgpu_tsg_subctx_get_gr_subctx( + struct nvgpu_tsg_subctx *subctx) +{ + return subctx->gr_subctx; +} + +u32 nvgpu_tsg_subctx_get_id(struct nvgpu_tsg_subctx *subctx) +{ + return subctx->subctx_id; +} + +struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_alloc_or_get_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch) +{ + struct nvgpu_gr_ctx_mappings *mappings = NULL; + struct nvgpu_gr_subctx *gr_subctx = NULL; + struct vm_gk20a *vm = ch->vm; + + nvgpu_log(g, gpu_dbg_gr, " "); + + nvgpu_assert(ch->subctx != NULL); + nvgpu_assert(ch->subctx->vm == vm); + + mappings = nvgpu_gr_ctx_mappings_get_subctx_mappings(g, tsg, vm); + if (mappings != NULL) { + goto add_gr_subctx; + } + + mappings = nvgpu_gr_ctx_mappings_create_subctx_mappings(g, tsg, vm); + if (mappings == NULL) { + nvgpu_err(g, "failed to allocate gr_ctx mappings"); + return NULL; + } + +add_gr_subctx: + gr_subctx = nvgpu_tsg_subctx_get_gr_subctx(ch->subctx); + nvgpu_assert(gr_subctx != NULL); + + nvgpu_gr_ctx_mappings_add_gr_subctx(mappings, gr_subctx); + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return mappings; +} + +#ifdef CONFIG_NVGPU_GFXP +static struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_get_veid0_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg) +{ + struct nvgpu_gr_ctx_mappings *mappings = NULL; + struct nvgpu_tsg_subctx *subctx = NULL; + + subctx = nvgpu_tsg_subctx_from_id(tsg, CHANNEL_INFO_VEID0); + if (subctx == NULL) { + nvgpu_log(g, gpu_dbg_gr, "VEID0 subctx not available"); + return NULL; + } + + mappings = nvgpu_gr_subctx_get_mappings(subctx->gr_subctx); + if (mappings == NULL) { + nvgpu_log(g, gpu_dbg_gr, "VEID0 mappings not available"); + return NULL; + } + + return mappings; +} + +void nvgpu_tsg_subctxs_set_preemption_buffer_va( + struct nvgpu_tsg_subctx *tsg_subctx) +{ + struct nvgpu_gr_ctx_mappings *veid0_mappings; + struct nvgpu_tsg_subctx *subctx = NULL; + struct nvgpu_tsg *tsg = tsg_subctx->tsg; + struct gk20a *g = tsg->g; + + nvgpu_log(g, gpu_dbg_gr, " "); + + nvgpu_rwsem_down_read(&tsg->ch_list_lock); + + veid0_mappings = nvgpu_tsg_subctx_get_veid0_mappings(g, tsg); + if (veid0_mappings == NULL) { + nvgpu_rwsem_up_read(&tsg->ch_list_lock); + return; + } + + nvgpu_list_for_each_entry(subctx, &tsg->subctx_list, + nvgpu_tsg_subctx, tsg_entry) { + if (subctx->gr_subctx != NULL) { + nvgpu_gr_subctx_set_preemption_buffer_va(g, + subctx->gr_subctx, veid0_mappings); + } + } + + nvgpu_rwsem_up_read(&tsg->ch_list_lock); + + nvgpu_log(g, gpu_dbg_gr, "done"); +} + +void nvgpu_tsg_subctxs_clear_preemption_buffer_va( + struct nvgpu_tsg_subctx *tsg_subctx) +{ + struct nvgpu_tsg_subctx *subctx = NULL; + struct nvgpu_tsg *tsg = tsg_subctx->tsg; + struct gk20a *g = tsg->g; + + nvgpu_log(g, gpu_dbg_gr, " "); + + nvgpu_list_for_each_entry(subctx, &tsg->subctx_list, + nvgpu_tsg_subctx, tsg_entry) { + if (subctx->gr_subctx != NULL) { + nvgpu_gr_subctx_clear_preemption_buffer_va(g, + subctx->gr_subctx); + } + } + + nvgpu_log(g, gpu_dbg_gr, "done"); +} +#endif /* CONFIG_NVGPU_GFXP */ + +#ifdef CONFIG_NVGPU_DEBUGGER +void nvgpu_tsg_subctxs_set_pm_buffer_va(struct nvgpu_tsg *tsg, + bool set_pm_ctx_gpu_va) +{ + struct nvgpu_tsg_subctx *subctx = NULL; + struct gk20a *g = tsg->g; + + nvgpu_log(g, gpu_dbg_gr, " "); + + nvgpu_rwsem_down_read(&tsg->ch_list_lock); + nvgpu_list_for_each_entry(subctx, &tsg->subctx_list, + nvgpu_tsg_subctx, tsg_entry) { + if 
(subctx->gr_subctx != NULL) { + nvgpu_gr_subctx_set_hwpm_ptr(g, subctx->gr_subctx, + set_pm_ctx_gpu_va); + } + } + nvgpu_rwsem_up_read(&tsg->ch_list_lock); + + nvgpu_log(g, gpu_dbg_gr, "done"); +} +#endif /* CONFIG_NVGPU_DEBUGGER */ diff --git a/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h b/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h new file mode 100644 index 000000000..ed6376148 --- /dev/null +++ b/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_COMMON_FIFO_TSG_SUBCTX_PRIV_H +#define NVGPU_COMMON_FIFO_TSG_SUBCTX_PRIV_H + +#include +#include + +struct nvgpu_tsg; +struct vm_gk20a; +struct nvgpu_gr_subctx; + +struct nvgpu_tsg_subctx { + + /** Subcontext Id (aka. veid). */ + u32 subctx_id; + + /** TSG to which this subcontext belongs. */ + struct nvgpu_tsg *tsg; + + /** Subcontext's address space. */ + struct vm_gk20a *vm; + + /** Subcontext's GR ctx header and GR ctx buffers mappings. */ + struct nvgpu_gr_subctx *gr_subctx; + + /** + * Subcontext's entry in TSG's (#nvgpu_tsg) subcontexts list + * #subctx_list. + */ + struct nvgpu_list_node tsg_entry; + + /** + * List of channels (#nvgpu_channel) bound to this TSG subcontext. + * Accessed by holding #ch_list_lock from TSG. 
+ */ + struct nvgpu_list_node ch_list; +}; + +#endif /* NVGPU_COMMON_FIFO_TSG_SUBCTX_PRIV_H */ diff --git a/drivers/gpu/nvgpu/common/gr/ctx.c b/drivers/gpu/nvgpu/common/gr/ctx.c index 541e5a234..5616d5432 100644 --- a/drivers/gpu/nvgpu/common/gr/ctx.c +++ b/drivers/gpu/nvgpu/common/gr/ctx.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "common/gr/ctx_priv.h" @@ -116,9 +117,7 @@ int nvgpu_gr_ctx_alloc_ctx_buffers(struct gk20a *g, } #endif - if (desc->size[i] != 0U) { - nvgpu_assert(!nvgpu_mem_is_valid(&ctx->mem[i])); - + if (desc->size[i] != 0U && !nvgpu_mem_is_valid(&ctx->mem[i])) { err = nvgpu_dma_alloc_sys(g, desc->size[i], &ctx->mem[i]); if (err != 0) { @@ -126,10 +125,14 @@ int nvgpu_gr_ctx_alloc_ctx_buffers(struct gk20a *g, nvgpu_gr_ctx_free_ctx_buffers(g, ctx); return err; } + + nvgpu_log(g, gpu_dbg_gr, "ctx buffer %u allocated", i); } } - ctx->ctx_id_valid = false; + if (!nvgpu_gr_ctx_get_ctx_initialized(ctx)) { + ctx->ctx_id_valid = false; + } nvgpu_log(g, gpu_dbg_gr, "done"); @@ -206,6 +209,8 @@ int nvgpu_gr_ctx_alloc_ctx_preemption_buffers(struct gk20a *g, nvgpu_gr_ctx_free_ctx_preemption_buffers(g, ctx); return err; } + + nvgpu_log(g, gpu_dbg_gr, "ctx preemption buffer %u allocated", i); } } @@ -219,32 +224,54 @@ void nvgpu_gr_ctx_free(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer) { + struct nvgpu_tsg *tsg; + nvgpu_log(g, gpu_dbg_gr, " "); - if ((gr_ctx != NULL) && (gr_ctx->mappings != NULL)) { - nvgpu_gr_ctx_unmap_buffers(g, - gr_ctx, global_ctx_buffer, gr_ctx->mappings); + if (gr_ctx != NULL) { + tsg = nvgpu_tsg_get_from_id(g, gr_ctx->tsgid); - nvgpu_gr_ctx_free_mappings(g, gr_ctx); + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + nvgpu_assert(nvgpu_list_empty(&tsg->ch_list)); + nvgpu_assert(nvgpu_list_empty(&tsg->subctx_list)); + nvgpu_assert(nvgpu_list_empty(&tsg->gr_ctx_mappings_list)); + } else { + if (gr_ctx->mappings != NULL) { + nvgpu_gr_ctx_unmap_buffers(g, + gr_ctx, NULL, global_ctx_buffer, + gr_ctx->mappings); + + nvgpu_gr_ctx_free_mappings(g, gr_ctx); + } + } nvgpu_gr_ctx_set_patch_ctx_data_count(gr_ctx, 0); nvgpu_gr_ctx_free_ctx_buffers(g, gr_ctx); (void) memset(gr_ctx, 0, sizeof(*gr_ctx)); + + nvgpu_mutex_release(&tsg->ctx_init_lock); } nvgpu_log(g, gpu_dbg_gr, "done"); } struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_alloc_or_get_mappings(struct gk20a *g, - struct nvgpu_tsg *tsg, struct vm_gk20a *vm) + struct nvgpu_tsg *tsg, struct nvgpu_channel *ch) { struct nvgpu_gr_ctx_mappings *mappings = NULL; struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx; + struct vm_gk20a *vm = ch->vm; nvgpu_log(g, gpu_dbg_gr, " "); + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return nvgpu_tsg_subctx_alloc_or_get_mappings(g, tsg, ch); + } + mappings = gr_ctx->mappings; if (mappings != NULL) { return mappings; @@ -278,11 +305,16 @@ void nvgpu_gr_ctx_free_mappings(struct gk20a *g, nvgpu_log(g, gpu_dbg_gr, "done"); } -struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg) +struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch) { - struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx; + struct gk20a *g = tsg->g; - return gr_ctx->mappings; + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return nvgpu_gr_ctx_mappings_get_subctx_mappings(g, tsg, ch->vm); + } + + return tsg->gr_ctx->mappings; } void nvgpu_gr_ctx_set_patch_ctx_data_count(struct nvgpu_gr_ctx *gr_ctx, @@ 
-639,9 +671,9 @@ int nvgpu_gr_ctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, #ifdef CONFIG_NVGPU_GFXP void nvgpu_gr_ctx_set_preemption_buffer_va(struct gk20a *g, - struct nvgpu_gr_ctx *gr_ctx, - struct nvgpu_gr_ctx_mappings *mappings) + struct nvgpu_gr_ctx *gr_ctx) { + struct nvgpu_gr_ctx_mappings *mappings = gr_ctx->mappings; struct nvgpu_mem *mem = &gr_ctx->mem[NVGPU_GR_CTX_CTX]; u64 preempt_ctxsw_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_PREEMPT_CTXSW); @@ -744,12 +776,6 @@ int nvgpu_gr_ctx_alloc_map_pm_ctx(struct gk20a *g, return 0; } - mappings = nvgpu_gr_ctx_get_mappings(tsg); - if (mappings == NULL) { - nvgpu_err(g, "gr_ctx mappings struct not allocated"); - return -ENOMEM; - } - nvgpu_gr_ctx_set_size(gr_ctx_desc, NVGPU_GR_CTX_PM_CTX, nvgpu_gr_hwpm_map_get_size(hwpm_map)); @@ -761,8 +787,25 @@ int nvgpu_gr_ctx_alloc_map_pm_ctx(struct gk20a *g, return ret; } - ret = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, gr_ctx, - NVGPU_GR_CTX_PM_CTX, mappings); + /* + * Commit NVGPU_GR_CTX_PM_CTX gpu va for all subcontexts + * when subcontexts are enabled. + */ + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + ret = nvgpu_gr_ctx_mappings_map_buffer_all_subctx(tsg, + NVGPU_GR_CTX_PM_CTX); + } else { + mappings = nvgpu_gr_ctx_get_mappings(tsg, NULL); + if (mappings == NULL) { + nvgpu_err(g, "gr_ctx mappings struct not allocated"); + nvgpu_gr_ctx_free_pm_ctx(g, gr_ctx); + return -ENOMEM; + } + + ret = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, gr_ctx, + NVGPU_GR_CTX_PM_CTX, mappings); + } + if (ret != 0) { nvgpu_err(g, "gr_ctx pm_ctx buffer map failed %d", ret); nvgpu_gr_ctx_free_pm_ctx(g, gr_ctx); @@ -839,13 +882,13 @@ int nvgpu_gr_ctx_set_smpc_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, - u32 mode, u64 *pm_ctx_gpu_va, bool *skip_update) + u32 mode, bool *set_pm_ctx_gpu_va, bool *skip_update) { - struct nvgpu_gr_ctx_mappings *mappings = gr_ctx->mappings; struct nvgpu_mem *mem = &gr_ctx->mem[NVGPU_GR_CTX_CTX]; struct pm_ctx_desc *pm_ctx = &gr_ctx->pm_ctx; int ret = 0; + *set_pm_ctx_gpu_va = false; *skip_update = false; if (!nvgpu_mem_is_valid(mem)) { @@ -868,8 +911,7 @@ int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g, return 0; } pm_ctx->pm_mode = g->ops.gr.ctxsw_prog.hw_get_pm_mode_ctxsw(); - *pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings, - NVGPU_GR_CTX_PM_CTX); + *set_pm_ctx_gpu_va = true; break; case NVGPU_GR_CTX_HWPM_CTXSW_MODE_NO_CTXSW: if (pm_ctx->pm_mode == @@ -879,7 +921,7 @@ int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g, } pm_ctx->pm_mode = g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw(); - *pm_ctx_gpu_va = 0; + *set_pm_ctx_gpu_va = false; break; case NVGPU_GR_CTX_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW: if (pm_ctx->pm_mode == @@ -889,8 +931,7 @@ int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g, } pm_ctx->pm_mode = g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw(); - *pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings, - NVGPU_GR_CTX_PM_CTX); + *set_pm_ctx_gpu_va = true; break; default: nvgpu_err(g, "invalid hwpm context switch mode"); @@ -909,9 +950,16 @@ void nvgpu_gr_ctx_set_hwpm_pm_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx) } void nvgpu_gr_ctx_set_hwpm_ptr(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, - u64 pm_ctx_gpu_va) + bool set_pm_ctx_gpu_va) { struct nvgpu_mem *mem = &gr_ctx->mem[NVGPU_GR_CTX_CTX]; + u64 pm_ctx_gpu_va = 0ULL; + + if (set_pm_ctx_gpu_va) { + pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va( + 
gr_ctx->mappings, + NVGPU_GR_CTX_PM_CTX); + } g->ops.gr.ctxsw_prog.set_pm_ptr(g, mem, pm_ctx_gpu_va); } @@ -921,3 +969,47 @@ void nvgpu_gr_ctx_set_pm_ctx_mapped(struct nvgpu_gr_ctx *ctx, bool mapped) ctx->pm_ctx.mapped = mapped; } #endif /* CONFIG_NVGPU_DEBUGGER */ + +bool nvgpu_gr_obj_ctx_global_ctx_buffers_patched(struct nvgpu_gr_ctx *gr_ctx) +{ + return gr_ctx->global_ctx_buffers_patched; +} + +void nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched) +{ + gr_ctx->global_ctx_buffers_patched = patched; +} + +bool nvgpu_gr_obj_ctx_preempt_buffers_patched(struct nvgpu_gr_ctx *gr_ctx) +{ + return gr_ctx->preempt_buffers_patched; +} + +void nvgpu_gr_obj_ctx_set_preempt_buffers_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched) +{ + gr_ctx->preempt_buffers_patched = patched; +} + +bool nvgpu_gr_obj_ctx_default_compute_regs_patched(struct nvgpu_gr_ctx *gr_ctx) +{ + return gr_ctx->default_compute_regs_patched; +} + +void nvgpu_gr_obj_ctx_set_default_compute_regs_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched) +{ + gr_ctx->default_compute_regs_patched = patched; +} + +bool nvgpu_gr_obj_ctx_default_gfx_regs_patched(struct nvgpu_gr_ctx *gr_ctx) +{ + return gr_ctx->default_gfx_regs_patched; +} + +void nvgpu_gr_obj_ctx_set_default_gfx_regs_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched) +{ + gr_ctx->default_gfx_regs_patched = patched; +} diff --git a/drivers/gpu/nvgpu/common/gr/ctx_mappings.c b/drivers/gpu/nvgpu/common/gr/ctx_mappings.c index 541066a11..cfa60afe5 100644 --- a/drivers/gpu/nvgpu/common/gr/ctx_mappings.c +++ b/drivers/gpu/nvgpu/common/gr/ctx_mappings.c @@ -22,17 +22,38 @@ #include #include +#include +#include #include #include +#include #include #include #include #include #include #include +#include +#include +#include #include #include "common/gr/ctx_mappings_priv.h" +#include "common/gr/subctx_priv.h" + +static inline struct nvgpu_gr_ctx_mappings * +nvgpu_gr_ctx_mappings_from_tsg_entry(struct nvgpu_list_node *node) +{ + return (struct nvgpu_gr_ctx_mappings *) + ((uintptr_t)node - offsetof(struct nvgpu_gr_ctx_mappings, tsg_entry)); +}; + +static inline struct nvgpu_gr_subctx * +nvgpu_gr_subctx_from_gr_ctx_mappings_entry(struct nvgpu_list_node *node) +{ + return (struct nvgpu_gr_subctx *) + ((uintptr_t)node - offsetof(struct nvgpu_gr_subctx, gr_ctx_mappings_entry)); +}; struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_create(struct gk20a *g, struct nvgpu_tsg *tsg, struct vm_gk20a *vm) @@ -83,11 +104,14 @@ int nvgpu_gr_ctx_mappings_map_ctx_buffer(struct gk20a *g, nvgpu_log(g, gpu_dbg_gr, " "); + if (mappings->ctx_buffer_va[index] != 0ULL) { + nvgpu_log_info(g, "buffer %u already mapped", index); + return 0; + } + mem = nvgpu_gr_ctx_get_ctx_mem(ctx, index); mapping_flags = nvgpu_gr_ctx_get_ctx_mapping_flags(ctx, index); - nvgpu_assert(mappings->ctx_buffer_va[index] == 0ULL); - if (nvgpu_mem_is_valid(mem)) { gpu_va = nvgpu_gmmu_map(vm, mem, @@ -138,28 +162,149 @@ static void nvgpu_gr_ctx_mappings_unmap_ctx_buffer(struct nvgpu_gr_ctx *ctx, } } +static void nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx( + struct nvgpu_tsg *tsg, u32 index) +{ + struct nvgpu_gr_ctx_mappings *mappings = NULL; + struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx; + + nvgpu_assert(index < NVGPU_GR_CTX_COUNT); + + nvgpu_list_for_each_entry(mappings, &tsg->gr_ctx_mappings_list, + nvgpu_gr_ctx_mappings, tsg_entry) { + nvgpu_gr_ctx_mappings_unmap_ctx_buffer(gr_ctx, + index, mappings); + } +} + +int nvgpu_gr_ctx_mappings_map_buffer_all_subctx( + struct nvgpu_tsg *tsg, 
u32 index) +{ + struct nvgpu_gr_ctx_mappings *mappings = NULL; + struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx; + struct gk20a *g = tsg->g; + int err; + + nvgpu_assert(index < NVGPU_GR_CTX_COUNT); + + nvgpu_list_for_each_entry(mappings, &tsg->gr_ctx_mappings_list, + nvgpu_gr_ctx_mappings, tsg_entry) { + err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, gr_ctx, + index, mappings); + if (err != 0) { + nvgpu_err(g, "gr_ctx buffer %u map failed %d", index, err); + nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg, index); + return err; + } + + } + + return 0; +} + static void nvgpu_gr_ctx_mappings_unmap_ctx_buffers(struct nvgpu_gr_ctx *ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings) { + u32 buffers_count = NVGPU_GR_CTX_COUNT; u32 i; +#ifdef CONFIG_NVGPU_GFXP + struct nvgpu_tsg *tsg = mappings->tsg; + struct gk20a *g = tsg->g; + bool is_sync_veid; + bool gfxp_active; +#endif - for (i = 0; i < NVGPU_GR_CTX_COUNT; i++) { + (void) subctx; + +#ifdef CONFIG_NVGPU_GFXP + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) == + CHANNEL_INFO_VEID0; + gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) == + NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP); + + if (is_sync_veid && gfxp_active) { + nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg, + NVGPU_GR_CTX_PREEMPT_CTXSW); + nvgpu_tsg_subctxs_clear_preemption_buffer_va(subctx); + nvgpu_gr_ctx_init_graphics_preemption_mode(ctx, + NVGPU_PREEMPTION_MODE_GRAPHICS_WFI); + } + + if (!is_sync_veid) { + if (gfxp_active) { + nvgpu_gr_subctx_clear_preemption_buffer_va(g, + nvgpu_tsg_subctx_get_gr_subctx(subctx)); + buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW + 1U; + } else { + buffers_count = NVGPU_GR_CTX_PATCH_CTX + 1U; + } + } + } +#endif + + for (i = 0; i < buffers_count; i++) { nvgpu_gr_ctx_mappings_unmap_ctx_buffer(ctx, i, mappings); } } static int nvgpu_gr_ctx_mappings_map_ctx_buffers(struct gk20a *g, struct nvgpu_gr_ctx *ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings) { + u32 buffers_count = NVGPU_GR_CTX_COUNT; int err = 0; u32 i; +#ifdef CONFIG_NVGPU_GFXP + struct nvgpu_tsg *tsg = mappings->tsg; + bool is_sync_veid; + bool gfxp_active; +#endif - for (i = 0; i < NVGPU_GR_CTX_COUNT; i++) { + (void) subctx; + +#ifdef CONFIG_NVGPU_GFXP + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) == + CHANNEL_INFO_VEID0; + gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) == + NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP); + + if (is_sync_veid && gfxp_active) { + err = nvgpu_gr_ctx_mappings_map_buffer_all_subctx(tsg, + NVGPU_GR_CTX_PREEMPT_CTXSW); + if (err != 0) { + nvgpu_err(g, "preempt buffer mapping failed %d", + err); + nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx( + tsg, NVGPU_GR_CTX_PREEMPT_CTXSW); + return err; + } + } + + /* + * Only NVGPU_GR_CTX_PREEMPT_CTXSW is to be mapped for + * all VEIDs. + * Don't map other preemption buffers for ASYNC VEIDs. 
+ */ + if (!is_sync_veid) { + if (gfxp_active) { + buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW + 1U; + } else { + buffers_count = NVGPU_GR_CTX_PATCH_CTX + 1U; + } + } + } +#endif + + for (i = 0; i < buffers_count; i++) { err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, ctx, i, mappings); if (err != 0) { nvgpu_err(g, "gr_ctx buffer %u map failed %d", i, err); - nvgpu_gr_ctx_mappings_unmap_ctx_buffers(ctx, mappings); + nvgpu_gr_ctx_mappings_unmap_ctx_buffers(ctx, + subctx, mappings); return err; } } @@ -170,35 +315,96 @@ static int nvgpu_gr_ctx_mappings_map_ctx_buffers(struct gk20a *g, #ifdef CONFIG_NVGPU_GFXP static void nvgpu_gr_ctx_mappings_unmap_ctx_preemption_buffers( struct nvgpu_gr_ctx *ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings) { + u32 buffers_count = NVGPU_GR_CTX_GFXP_RTVCB_CTXSW; + struct nvgpu_tsg *tsg = mappings->tsg; + struct gk20a *g = tsg->g; + bool is_sync_veid; + bool gfxp_active; u32 i; - for (i = NVGPU_GR_CTX_PREEMPT_CTXSW; - i <= NVGPU_GR_CTX_GFXP_RTVCB_CTXSW; i++) { + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) == + CHANNEL_INFO_VEID0; + gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) == + NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP); + + if (is_sync_veid && gfxp_active) { + nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg, + NVGPU_GR_CTX_PREEMPT_CTXSW); + + nvgpu_rwsem_down_read(&tsg->ch_list_lock); + nvgpu_tsg_subctxs_clear_preemption_buffer_va(subctx); + nvgpu_rwsem_up_read(&tsg->ch_list_lock); + + nvgpu_gr_ctx_init_graphics_preemption_mode(ctx, + NVGPU_PREEMPTION_MODE_GRAPHICS_WFI); + } + + if (!is_sync_veid) { + if (gfxp_active) { + nvgpu_gr_subctx_clear_preemption_buffer_va(g, + nvgpu_tsg_subctx_get_gr_subctx(subctx)); + buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW; + } else { + return; + } + } + } + + for (i = NVGPU_GR_CTX_PREEMPT_CTXSW; i <= buffers_count; i++) { nvgpu_gr_ctx_mappings_unmap_ctx_buffer(ctx, i, mappings); } } int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g, struct nvgpu_gr_ctx *ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings) { + u32 buffers_count = NVGPU_GR_CTX_GFXP_RTVCB_CTXSW; + struct nvgpu_tsg *tsg = mappings->tsg; + bool is_sync_veid; + bool gfxp_active; int err = 0; u32 i; - nvgpu_log(g, gpu_dbg_gr, " "); + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) == + CHANNEL_INFO_VEID0; + gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) == + NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP); - for (i = NVGPU_GR_CTX_PREEMPT_CTXSW; - i <= NVGPU_GR_CTX_GFXP_RTVCB_CTXSW; i++) { - if (mappings->ctx_buffer_va[i] == 0ULL) { - err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, ctx, i, mappings); + if (is_sync_veid && gfxp_active) { + err = nvgpu_gr_ctx_mappings_map_buffer_all_subctx(tsg, + NVGPU_GR_CTX_PREEMPT_CTXSW); if (err != 0) { - nvgpu_err(g, "gr_ctx buffer %u map failed %d", i, err); - nvgpu_gr_ctx_mappings_unmap_ctx_preemption_buffers(ctx, mappings); + nvgpu_err(g, "preempt buffer mapping failed %d", err); + nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg, + NVGPU_GR_CTX_PREEMPT_CTXSW); return err; } } + + if (!is_sync_veid) { + if (gfxp_active) { + buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW; + } else { + return 0; + } + } + } + + for (i = NVGPU_GR_CTX_PREEMPT_CTXSW; i <= buffers_count; i++) { + err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, ctx, i, mappings); + if (err != 0) { + nvgpu_err(g, "gr_ctx buffer %u map failed %d", i, err); + 
nvgpu_gr_ctx_mappings_unmap_ctx_preemption_buffers(ctx, + subctx, mappings); + return err; + } } nvgpu_log(g, gpu_dbg_gr, "done"); @@ -273,7 +479,9 @@ static void nvgpu_gr_ctx_mappings_unmap_global_ctx_buffers( static int nvgpu_gr_ctx_mappings_map_global_ctx_buffers(struct gk20a *g, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, - struct nvgpu_gr_ctx_mappings *mappings, bool vpr) + struct nvgpu_tsg_subctx *subctx, + struct nvgpu_gr_ctx_mappings *mappings, + bool vpr) { int err; @@ -282,7 +490,7 @@ static int nvgpu_gr_ctx_mappings_map_global_ctx_buffers(struct gk20a *g, * Allocate BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB * if 2D/3D/I2M classes(graphics) are supported. */ - if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) { + if (nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) { /* Circular Buffer */ err = nvgpu_gr_ctx_mappings_map_global_ctx_buffer( global_ctx_buffer, @@ -388,7 +596,7 @@ fail: } int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g, - struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_ctx_mappings *mappings, bool vpr) @@ -403,17 +611,17 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g, return -EINVAL; } - err = nvgpu_gr_ctx_mappings_map_ctx_buffers(g, gr_ctx, mappings); + err = nvgpu_gr_ctx_mappings_map_ctx_buffers(g, gr_ctx, subctx, mappings); if (err != 0) { nvgpu_err(g, "fail to map ctx buffers"); return err; } err = nvgpu_gr_ctx_mappings_map_global_ctx_buffers(g, - global_ctx_buffer, mappings, vpr); + global_ctx_buffer, subctx, mappings, vpr); if (err != 0) { nvgpu_err(g, "fail to map global ctx buffer"); - nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, mappings); + nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, subctx, mappings); return err; } @@ -424,6 +632,7 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g, void nvgpu_gr_ctx_unmap_buffers(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_ctx_mappings *mappings) { @@ -432,7 +641,7 @@ void nvgpu_gr_ctx_unmap_buffers(struct gk20a *g, nvgpu_gr_ctx_mappings_unmap_global_ctx_buffers(global_ctx_buffer, mappings); - nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, mappings); + nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, subctx, mappings); nvgpu_log(g, gpu_dbg_gr, "done"); } @@ -450,3 +659,118 @@ u64 nvgpu_gr_ctx_mappings_get_ctx_va(struct nvgpu_gr_ctx_mappings *mappings, nvgpu_assert(index < NVGPU_GR_CTX_COUNT); return mappings->ctx_buffer_va[index]; } + +struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_get_subctx_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg, + struct vm_gk20a *vm) +{ + struct nvgpu_gr_ctx_mappings *mappings = NULL; + + nvgpu_log(g, gpu_dbg_gr, " "); + + nvgpu_list_for_each_entry(mappings, &tsg->gr_ctx_mappings_list, + nvgpu_gr_ctx_mappings, tsg_entry) { + if (mappings->vm == vm) { + return mappings; + } + } + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return NULL; +} + +struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_create_subctx_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg, + struct vm_gk20a *vm) +{ + struct nvgpu_gr_ctx_mappings *mappings = NULL; + + nvgpu_log(g, gpu_dbg_gr, " "); + + mappings = (struct nvgpu_gr_ctx_mappings *) + nvgpu_kzalloc(g, sizeof(struct nvgpu_gr_ctx_mappings)); + if (mappings == NULL) { + nvgpu_err(g, "failed to alloc mappings"); + return NULL; + } + + nvgpu_vm_get(vm); + mappings->tsg = tsg; + 
mappings->vm = vm; + + nvgpu_init_list_node(&mappings->tsg_entry); + nvgpu_init_list_node(&mappings->subctx_list); + + /* add mappings to the list in the tsg */ + nvgpu_list_add_tail(&mappings->tsg_entry, + &tsg->gr_ctx_mappings_list); + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return mappings; +} + +void nvgpu_gr_ctx_mappings_add_gr_subctx(struct nvgpu_gr_ctx_mappings *mappings, + struct nvgpu_gr_subctx *subctx) +{ + struct nvgpu_gr_subctx *subctx_iter = NULL; + struct nvgpu_tsg *tsg = mappings->tsg; + struct gk20a *g = tsg->g; + bool found = false; + + nvgpu_log(g, gpu_dbg_gr, " "); + + nvgpu_list_for_each_entry(subctx_iter, &mappings->subctx_list, + nvgpu_gr_subctx, gr_ctx_mappings_entry) { + if (subctx_iter == subctx) { + found = true; + goto out; + } + } + +out: + if (!found) { + subctx->mappings = mappings; + nvgpu_list_add_tail(&subctx->gr_ctx_mappings_entry, + &mappings->subctx_list); + } + + nvgpu_log(g, gpu_dbg_gr, "done"); +} + +void nvgpu_gr_ctx_mappings_free_subctx_mappings(struct nvgpu_tsg_subctx *subctx, + struct nvgpu_gr_ctx_mappings *mappings, bool unmap) +{ + struct nvgpu_tsg *tsg = mappings->tsg; + struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx; + struct gk20a *g = tsg->g; + struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer = + nvgpu_gr_get_global_ctx_buffer_ptr(g); + bool is_sync_veid; + + nvgpu_log(g, gpu_dbg_gr, " "); + + if (nvgpu_list_empty(&mappings->subctx_list)) { + if (unmap) { + nvgpu_gr_ctx_unmap_buffers(g, + gr_ctx, subctx, global_ctx_buffer, mappings); + } + + /* remove mappings from the list in the tsg */ + nvgpu_list_del(&mappings->tsg_entry); + + nvgpu_gr_ctx_mappings_free(g, mappings); + } + + is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) == CHANNEL_INFO_VEID0; + + if (is_sync_veid) { + nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched(gr_ctx, false); + nvgpu_gr_obj_ctx_set_preempt_buffers_patched(gr_ctx, false); + } + + nvgpu_log(g, gpu_dbg_gr, "done"); +} diff --git a/drivers/gpu/nvgpu/common/gr/ctx_mappings_priv.h b/drivers/gpu/nvgpu/common/gr/ctx_mappings_priv.h index 34b3e6722..fe857d830 100644 --- a/drivers/gpu/nvgpu/common/gr/ctx_mappings_priv.h +++ b/drivers/gpu/nvgpu/common/gr/ctx_mappings_priv.h @@ -53,5 +53,16 @@ struct nvgpu_gr_ctx_mappings { * corresponding to GPU virtual addresses above. */ u32 global_ctx_buffer_index[NVGPU_GR_GLOBAL_CTX_VA_COUNT]; + + /** + * GR ctx mappings' entry in TSG's (#nvgpu_tsg) mappings list + * #gr_ctx_mappings_list. + */ + struct nvgpu_list_node tsg_entry; + + /** + * List of GR subcontexts (#nvgpu_gr_subctx) using this mapping. 
+ */ + struct nvgpu_list_node subctx_list; }; #endif /* NVGPU_GR_CTX_MAPPINGS_PRIV_H */ diff --git a/drivers/gpu/nvgpu/common/gr/ctx_priv.h b/drivers/gpu/nvgpu/common/gr/ctx_priv.h index 887fe4731..6ef818470 100644 --- a/drivers/gpu/nvgpu/common/gr/ctx_priv.h +++ b/drivers/gpu/nvgpu/common/gr/ctx_priv.h @@ -160,6 +160,11 @@ struct nvgpu_gr_ctx { */ u32 sm_diversity_config; #endif + + bool global_ctx_buffers_patched; + bool preempt_buffers_patched; + bool default_compute_regs_patched; + bool default_gfx_regs_patched; }; #endif /* NVGPU_GR_CTX_PRIV_H */ diff --git a/drivers/gpu/nvgpu/common/gr/gr_setup.c b/drivers/gpu/nvgpu/common/gr/gr_setup.c index 3b0f0daac..6c5c9005d 100644 --- a/drivers/gpu/nvgpu/common/gr/gr_setup.c +++ b/drivers/gpu/nvgpu/common/gr/gr_setup.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "gr_priv.h" @@ -140,22 +141,6 @@ static int nvgpu_gr_setup_validate_channel_and_class(struct gk20a *g, return err; } -static int nvgpu_gr_setup_alloc_subctx(struct gk20a *g, struct nvgpu_channel *c) -{ - int err = 0; - - if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { - if (c->subctx == NULL) { - c->subctx = nvgpu_gr_subctx_alloc(g, c->vm); - if (c->subctx == NULL) { - err = -ENOMEM; - } - } - } - - return err; -} - int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags) { @@ -165,6 +150,9 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, int err = 0; struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g); struct nvgpu_gr_ctx_mappings *mappings = NULL; +#ifdef CONFIG_NVGPU_FECS_TRACE + struct nvgpu_gr_subctx *gr_subctx = NULL; +#endif nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "GR%u: allocate object context for channel %u", @@ -195,54 +183,53 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, return -EINVAL; } - err = nvgpu_gr_setup_alloc_subctx(g, c); + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + + err = nvgpu_tsg_subctx_alloc_gr_subctx(g, c); if (err != 0) { - nvgpu_err(g, "failed to allocate gr subctx buffer"); + nvgpu_err(g, "failed to alloc gr subctx"); + nvgpu_mutex_release(&tsg->ctx_init_lock); goto out; } - nvgpu_mutex_acquire(&tsg->ctx_init_lock); + err = nvgpu_tsg_subctx_setup_subctx_header(g, c); + if (err != 0) { + nvgpu_err(g, "failed to setup subctx header"); + nvgpu_mutex_release(&tsg->ctx_init_lock); + goto out; + } gr_ctx = tsg->gr_ctx; - mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, c->vm); + mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, c); if (mappings == NULL) { nvgpu_err(g, "fail to allocate/get ctx mappings struct"); nvgpu_mutex_release(&tsg->ctx_init_lock); goto out; } - if (!nvgpu_mem_is_valid(nvgpu_gr_ctx_get_ctx_mem(gr_ctx, - NVGPU_GR_CTX_CTX))) { - tsg->vm = c->vm; - nvgpu_vm_get(tsg->vm); - - err = nvgpu_gr_obj_ctx_alloc(g, gr->golden_image, - gr->global_ctx_buffer, gr->gr_ctx_desc, - gr->config, gr_ctx, c->subctx, - mappings, &c->inst_block, class_num, flags, - c->cde, c->vpr); - if (err != 0) { - nvgpu_err(g, - "failed to allocate gr ctx buffer"); - nvgpu_gr_ctx_free_mappings(g, gr_ctx); - nvgpu_mutex_release(&tsg->ctx_init_lock); - nvgpu_vm_put(tsg->vm); - tsg->vm = NULL; - goto out; - } - - nvgpu_gr_ctx_set_tsgid(gr_ctx, tsg->tsgid); - } else { - /* commit gr ctx buffer */ - nvgpu_gr_obj_ctx_commit_inst(g, &c->inst_block, gr_ctx, - c->subctx, mappings); + err = nvgpu_gr_obj_ctx_alloc(g, gr->golden_image, + gr->global_ctx_buffer, gr->gr_ctx_desc, + gr->config, gr_ctx, c->subctx, + mappings, &c->inst_block, class_num, flags, + c->cde, 
c->vpr); + if (err != 0) { + nvgpu_err(g, + "failed to allocate gr ctx buffer"); + nvgpu_mutex_release(&tsg->ctx_init_lock); + goto out; } + nvgpu_gr_ctx_set_tsgid(gr_ctx, tsg->tsgid); + #ifdef CONFIG_NVGPU_FECS_TRACE if (g->ops.gr.fecs_trace.bind_channel && !c->vpr) { + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + gr_subctx = nvgpu_tsg_subctx_get_gr_subctx(c->subctx); + } + err = g->ops.gr.fecs_trace.bind_channel(g, &c->inst_block, - c->subctx, gr_ctx, mappings, tsg->tgid, 0); + gr_subctx, gr_ctx, mappings, tsg->tgid, 0); if (err != 0) { nvgpu_warn(g, "fail to bind channel for ctxsw trace"); @@ -274,11 +261,6 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done"); return 0; out: - if (c->subctx != NULL) { - nvgpu_gr_subctx_free(g, c->subctx, c->vm); - c->subctx = NULL; - } - /* 1. gr_ctx, patch_ctx and global ctx buffer mapping can be reused so no need to release them. 2. golden image init and load is a one time thing so if @@ -320,13 +302,12 @@ void nvgpu_gr_setup_free_subctx(struct nvgpu_channel *c) return; } - if (c->subctx != NULL) { - nvgpu_gr_subctx_free(c->g, c->subctx, c->vm); - c->subctx = NULL; - } + nvgpu_gr_subctx_free(c->g, c->subctx, c->vm, true); + + nvgpu_log_fn(c->g, "done"); } -static bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode, +bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode, u32 *compute_preempt_mode, struct nvgpu_gr_ctx *gr_ctx) { @@ -383,9 +364,19 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch, gr_ctx = tsg->gr_ctx; + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + + g->ops.tsg.disable(tsg); + + err = nvgpu_preempt_channel(g, ch); + if (err != 0) { + nvgpu_err(g, "failed to preempt channel/TSG"); + goto enable_ch; + } + if (nvgpu_gr_setup_validate_preemption_mode(&graphics_preempt_mode, &compute_preempt_mode, gr_ctx) == false) { - return 0; + goto enable_ch; } nvgpu_log(g, gpu_dbg_gr | gpu_dbg_sched, "chid=%d tsgid=%d pid=%d " @@ -398,13 +389,14 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch, graphics_preempt_mode, compute_preempt_mode); if (err != 0) { nvgpu_err(g, "set_ctxsw_preemption_mode failed"); - return err; + goto enable_ch; } - mappings = nvgpu_gr_ctx_get_mappings(tsg); + mappings = nvgpu_gr_ctx_get_mappings(tsg, ch); if (mappings == NULL) { nvgpu_err(g, "failed to get gr_ctx mappings"); - return -EINVAL; + err = -EINVAL; + goto enable_ch; } #ifdef CONFIG_NVGPU_GFXP @@ -412,29 +404,21 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch, gr->gr_ctx_desc, gr_ctx); if (err != 0) { nvgpu_err(g, "fail to allocate ctx preemption buffers"); - return err; + goto enable_ch; } err = nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(g, - gr_ctx, mappings); + gr_ctx, ch->subctx, mappings); if (err != 0) { nvgpu_err(g, "fail to map ctx preemption buffers"); - return err; - } - #endif - - g->ops.tsg.disable(tsg); - - err = nvgpu_preempt_channel(g, ch); - if (err != 0) { - nvgpu_err(g, "failed to preempt channel/TSG"); goto enable_ch; } + #endif nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(g, gr->config, gr_ctx, ch->subctx, mappings); - if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) { + if (nvgpu_gr_obj_ctx_is_gfx_engine(g, ch->subctx)) { nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true); g->ops.gr.init.commit_global_cb_manager(g, gr->config, gr_ctx, true); @@ -443,9 +427,12 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch, g->ops.tsg.enable(tsg); + 
nvgpu_mutex_release(&tsg->ctx_init_lock); + return err; enable_ch: g->ops.tsg.enable(tsg); + nvgpu_mutex_release(&tsg->ctx_init_lock); return err; } diff --git a/drivers/gpu/nvgpu/common/gr/gr_utils.c b/drivers/gpu/nvgpu/common/gr/gr_utils.c index 6239b3f29..85a27a941 100644 --- a/drivers/gpu/nvgpu/common/gr/gr_utils.c +++ b/drivers/gpu/nvgpu/common/gr/gr_utils.c @@ -85,14 +85,12 @@ struct nvgpu_gr_zbc *nvgpu_gr_get_zbc_ptr(struct gk20a *g) } #endif -#ifdef CONFIG_NVGPU_FECS_TRACE struct nvgpu_gr_global_ctx_buffer_desc *nvgpu_gr_get_global_ctx_buffer_ptr( struct gk20a *g) { struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g); return gr->global_ctx_buffer; } -#endif #ifdef CONFIG_NVGPU_CILP u32 nvgpu_gr_get_cilp_preempt_pending_chid(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/common/gr/obj_ctx.c b/drivers/gpu/nvgpu/common/gr/obj_ctx.c index 3f4206307..ebed24c31 100644 --- a/drivers/gpu/nvgpu/common/gr/obj_ctx.c +++ b/drivers/gpu/nvgpu/common/gr/obj_ctx.c @@ -31,10 +31,13 @@ #endif #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -54,16 +57,46 @@ void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g, g->ops.ramin.set_gr_ptr(g, inst_block, gpu_va); } +#ifdef CONFIG_NVGPU_DEBUGGER +static void nvgpu_gr_obj_ctx_set_pm_ctx_gpu_va(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *tsg_subctx) +{ + struct nvgpu_gr_subctx *subctx; + bool set_pm_ctx_gpu_va; + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " "); + + set_pm_ctx_gpu_va = nvgpu_gr_ctx_get_pm_ctx_pm_mode(gr_ctx) != + g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw(); + + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + subctx = nvgpu_tsg_subctx_get_gr_subctx(tsg_subctx); + nvgpu_gr_subctx_set_hwpm_ptr(g, subctx, + set_pm_ctx_gpu_va); + } else { + nvgpu_gr_ctx_set_hwpm_ptr(g, gr_ctx, set_pm_ctx_gpu_va); + } + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done"); +} +#endif + void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block, - struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *tsg_subctx, struct nvgpu_gr_ctx_mappings *mappings) { + struct nvgpu_gr_subctx *subctx; struct nvgpu_mem *ctxheader; u64 gpu_va; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " "); +#ifdef CONFIG_NVGPU_DEBUGGER + nvgpu_gr_obj_ctx_set_pm_ctx_gpu_va(g, gr_ctx, tsg_subctx); +#endif + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + subctx = nvgpu_tsg_subctx_get_gr_subctx(tsg_subctx); nvgpu_gr_subctx_load_ctx_header(g, subctx, gr_ctx, mappings); ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx); @@ -78,7 +111,50 @@ void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block, } #if defined(CONFIG_NVGPU_GFXP) || defined(CONFIG_NVGPU_CILP) -static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g, +static void nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g, + struct nvgpu_gr_ctx_desc *gr_ctx_desc, + u32 class_num, u32 flags, + u32 *graphics_preempt_mode, u32 *compute_preempt_mode) +{ + u32 default_graphics_preempt_mode = 0U; + u32 default_compute_preempt_mode = 0U; + + g->ops.gr.init.get_default_preemption_modes( + &default_graphics_preempt_mode, + &default_compute_preempt_mode); + +#ifdef CONFIG_NVGPU_GFXP + if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) != 0U) { + *graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; + } + + if (g->ops.gpu_class.is_valid_gfx(class_num) && + nvgpu_gr_ctx_desc_force_preemption_gfxp(gr_ctx_desc)) 
{ + *graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; + } +#endif + +#ifdef CONFIG_NVGPU_CILP + if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) != 0U) { + *compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP; + } + + if (g->ops.gpu_class.is_valid_compute(class_num) && + nvgpu_gr_ctx_desc_force_preemption_cilp(gr_ctx_desc)) { + *compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP; + } +#endif + + if (*compute_preempt_mode == 0U) { + *compute_preempt_mode = default_compute_preempt_mode; + } + + if (*graphics_preempt_mode == 0U) { + *graphics_preempt_mode = default_graphics_preempt_mode; + } +} + +static int nvgpu_gr_obj_ctx_init_ctxsw_preemption(struct gk20a *g, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx_desc *gr_ctx_desc, struct nvgpu_gr_ctx *gr_ctx, u32 class_num, u32 flags) @@ -86,8 +162,6 @@ static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g, int err; u32 graphics_preempt_mode = 0U; u32 compute_preempt_mode = 0U; - u32 default_graphics_preempt_mode = 0U; - u32 default_compute_preempt_mode = 0U; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " "); @@ -97,38 +171,26 @@ static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g, return 0; } - g->ops.gr.init.get_default_preemption_modes( - &default_graphics_preempt_mode, - &default_compute_preempt_mode); - + if (nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) { #ifdef CONFIG_NVGPU_GFXP - if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) != 0U) { - graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; - } - - if (g->ops.gpu_class.is_valid_gfx(class_num) && - nvgpu_gr_ctx_desc_force_preemption_gfxp(gr_ctx_desc)) { - graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; - } + if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) != 0U) { + graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; + } #endif #ifdef CONFIG_NVGPU_CILP - if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) != 0U) { - compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP; - } - - if (g->ops.gpu_class.is_valid_compute(class_num) && - nvgpu_gr_ctx_desc_force_preemption_cilp(gr_ctx_desc)) { - compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP; - } + if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) != 0U) { + compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP; + } #endif - - if (compute_preempt_mode == 0U) { - compute_preempt_mode = default_compute_preempt_mode; - } - - if (graphics_preempt_mode == 0U) { - graphics_preempt_mode = default_graphics_preempt_mode; + if (nvgpu_gr_setup_validate_preemption_mode(&graphics_preempt_mode, + &compute_preempt_mode, gr_ctx) == false) { + return 0; + } + } else { + nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(g, gr_ctx_desc, + class_num, flags, &graphics_preempt_mode, + &compute_preempt_mode); } err = nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(g, config, @@ -266,43 +328,15 @@ fail: return err; } -void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g, +#ifdef CONFIG_NVGPU_GFXP +static void nvgpu_gr_obj_ctx_commit_veid0_preemption_buffers(struct gk20a *g, struct nvgpu_gr_config *config, - struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_ctx_mappings *mappings) { -#ifdef CONFIG_NVGPU_GFXP u64 addr; u32 size; struct nvgpu_mem *mem; -#endif - - (void)config; - (void)subctx; - (void)mappings; - - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " "); - - nvgpu_gr_ctx_set_preemption_modes(g, gr_ctx); - -#ifdef CONFIG_NVGPU_GFXP - if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_PREEMPTION_GFXP)) { - goto done; - } - - if 
(!nvgpu_mem_is_valid( - nvgpu_gr_ctx_get_ctx_mem(gr_ctx, - NVGPU_GR_CTX_PREEMPT_CTXSW))) { - goto done; - } - - if (subctx != NULL) { - nvgpu_gr_subctx_set_preemption_buffer_va(g, subctx, mappings); - } else { - nvgpu_gr_ctx_set_preemption_buffer_va(g, gr_ctx, mappings); - } - - nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true); addr = nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_BETACB_CTXSW); g->ops.gr.init.commit_global_attrib_cb(g, gr_ctx, mappings, @@ -324,6 +358,75 @@ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g, size = (u32)mem->size; g->ops.gr.init.commit_ctxsw_spill(g, gr_ctx, addr, size, true); +} +#endif + +bool nvgpu_gr_obj_ctx_is_gfx_engine(struct gk20a *g, struct nvgpu_tsg_subctx *subctx) +{ + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG) && + nvgpu_tsg_subctx_get_id(subctx) == CHANNEL_INFO_VEID0) { + return true; + } + } else if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) { + return true; + } + + return false; +} + +void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g, + struct nvgpu_gr_config *config, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx, + struct nvgpu_gr_ctx_mappings *mappings) +{ + (void)config; + (void)subctx; + (void)mappings; + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " "); + + nvgpu_gr_ctx_set_preemption_modes(g, gr_ctx); + +#ifdef CONFIG_NVGPU_GFXP + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_PREEMPTION_GFXP)) { + goto done; + } + + if (!nvgpu_mem_is_valid( + nvgpu_gr_ctx_get_ctx_mem(gr_ctx, + NVGPU_GR_CTX_PREEMPT_CTXSW))) { + goto done; + } + + /* + * Commit NVGPU_GR_CTX_PREEMPT_CTXSW gpu va for all subcontexts + * considering VEID0 gpu va when subcontexts are enabled. + */ + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + nvgpu_tsg_subctxs_set_preemption_buffer_va(subctx); + } else { + nvgpu_gr_ctx_set_preemption_buffer_va(g, gr_ctx); + } + + if (!nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) { + goto done; + } + + if (nvgpu_gr_obj_ctx_preempt_buffers_patched(gr_ctx)) { + goto done; + } + + nvgpu_gr_obj_ctx_set_preempt_buffers_patched(gr_ctx, true); + + /* + * Commit other preemption buffers only for VEID0 when subcontexts are + * enabled. Commit always when subcontext are disabled. + */ + nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true); + + nvgpu_gr_obj_ctx_commit_veid0_preemption_buffers(g, config, + gr_ctx, mappings); g->ops.gr.init.commit_cbes_reserve(g, gr_ctx, true); @@ -346,6 +449,7 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings, bool patch) { @@ -363,7 +467,11 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g, * Skip BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB * if 2D/3D/I2M classes(graphics) are not supported. 
*/ - if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) { + if (nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) { + if (patch && nvgpu_gr_obj_ctx_global_ctx_buffers_patched(gr_ctx)) { + goto commit_sm_id; + } + /* global pagepool buffer */ addr = nvgpu_gr_ctx_mappings_get_global_ctx_va(mappings, NVGPU_GR_GLOBAL_CTX_PAGEPOOL_VA); @@ -403,6 +511,16 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g, g->ops.gr.init.commit_rtv_cb(g, addr, gr_ctx, patch); } #endif + + if (patch) { + nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched(gr_ctx, + true); + } + } + +commit_sm_id: + if (patch && nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) { + goto out; } #ifdef CONFIG_NVGPU_SM_DIVERSITY @@ -427,6 +545,7 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g, } #endif +out: if (patch) { nvgpu_gr_ctx_patch_write_end(g, gr_ctx, false); } @@ -561,7 +680,7 @@ clean_up: static int nvgpu_gr_obj_ctx_commit_hw_state(struct gk20a *g, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, - struct nvgpu_gr_ctx_mappings *mappings) + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings) { int err = 0; struct netlist_av_list *sw_method_init = @@ -577,7 +696,7 @@ static int nvgpu_gr_obj_ctx_commit_hw_state(struct gk20a *g, g->ops.gr.init.fe_go_idle_timeout(g, false); nvgpu_gr_obj_ctx_commit_global_ctx_buffers(g, global_ctx_buffer, - config, gr_ctx, mappings, false); + config, gr_ctx, subctx, mappings, false); if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) { /* override a few ctx state registers */ @@ -722,6 +841,7 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings, struct nvgpu_mem *inst_block) { @@ -745,13 +865,13 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g, } err = nvgpu_gr_obj_ctx_commit_hw_state(g, global_ctx_buffer, - config, gr_ctx, mappings); + config, gr_ctx, subctx, mappings); if (err != 0) { goto clean_up; } #ifdef CONFIG_NVGPU_GRAPHICS - if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) { + if (nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) { err = nvgpu_gr_ctx_init_zcull(g, gr_ctx); if (err != 0) { goto clean_up; @@ -832,14 +952,14 @@ static int nvgpu_gr_obj_ctx_alloc_buffers(struct gk20a *g, nvgpu_log(g, gpu_dbg_gr, " "); - nvgpu_gr_obj_ctx_gr_ctx_set_size(g, golden_image, gr_ctx_desc); - - nvgpu_gr_obj_ctx_patch_ctx_set_size(g, config, gr_ctx_desc); - - nvgpu_gr_ctx_set_patch_ctx_data_count(gr_ctx, 0); + if (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) { + nvgpu_gr_obj_ctx_gr_ctx_set_size(g, golden_image, gr_ctx_desc); + nvgpu_gr_obj_ctx_patch_ctx_set_size(g, config, gr_ctx_desc); + nvgpu_gr_ctx_set_patch_ctx_data_count(gr_ctx, 0); + } #if defined(CONFIG_NVGPU_GFXP) || defined(CONFIG_NVGPU_CILP) - err = nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(g, config, + err = nvgpu_gr_obj_ctx_init_ctxsw_preemption(g, config, gr_ctx_desc, gr_ctx, class_num, flags); if (err != 0) { nvgpu_err(g, "fail to init preemption mode"); @@ -982,13 +1102,54 @@ out: return err; } +static int nvgpu_gr_obj_ctx_load_golden_image(struct gk20a *g, + struct nvgpu_gr_obj_ctx_golden_image *golden_image, + struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, + struct nvgpu_gr_config *config, + struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_tsg_subctx *subctx, + struct nvgpu_gr_ctx_mappings *mappings, + struct nvgpu_mem 
*inst_block, + bool cde) +{ + int err; + + /* init golden image */ + err = nvgpu_gr_obj_ctx_alloc_golden_ctx_image(g, golden_image, + global_ctx_buffer, config, gr_ctx, subctx, + mappings, inst_block); + if (err != 0) { + nvgpu_err(g, "fail to init golden ctx image"); + return err; + } + +#ifdef CONFIG_NVGPU_POWER_PG + /* Re-enable ELPG now that golden image has been initialized. + * The PMU PG init code may already have tried to enable elpg, but + * would not have been able to complete this action since the golden + * image hadn't been initialized yet, so do this now. + */ + err = nvgpu_pmu_reenable_elpg(g); + if (err != 0) { + nvgpu_err(g, "fail to re-enable elpg"); + return err; + } +#endif + + /* load golden image */ + nvgpu_gr_ctx_load_golden_ctx_image(g, gr_ctx, mappings, + golden_image->local_golden_image, cde); + + return 0; +} + int nvgpu_gr_obj_ctx_alloc(struct gk20a *g, struct nvgpu_gr_obj_ctx_golden_image *golden_image, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_ctx_desc *gr_ctx_desc, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, - struct nvgpu_gr_subctx *subctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings, struct nvgpu_mem *inst_block, u32 class_num, u32 flags, @@ -1005,9 +1166,11 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g, goto out; } - nvgpu_gr_ctx_init_ctx_buffers_mapping_flags(g, gr_ctx); + if (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) { + nvgpu_gr_ctx_init_ctx_buffers_mapping_flags(g, gr_ctx); + } - err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, + err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, subctx, global_ctx_buffer, mappings, vpr); if (err != 0) { nvgpu_err(g, "failed to map ctx buffers"); @@ -1015,52 +1178,42 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g, } nvgpu_gr_obj_ctx_commit_global_ctx_buffers(g, global_ctx_buffer, - config, gr_ctx, mappings, true); + config, gr_ctx, subctx, mappings, true); /* commit gr ctx buffer */ nvgpu_gr_obj_ctx_commit_inst(g, inst_block, gr_ctx, subctx, mappings); - /* init golden image */ - err = nvgpu_gr_obj_ctx_alloc_golden_ctx_image(g, golden_image, - global_ctx_buffer, config, gr_ctx, mappings, inst_block); - if (err != 0) { - nvgpu_err(g, "fail to init golden ctx image"); - goto out; + if (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) { + err = nvgpu_gr_obj_ctx_load_golden_image(g, golden_image, + global_ctx_buffer, config, gr_ctx, subctx, + mappings, inst_block, cde); + if (err != 0) { + nvgpu_err(g, "fail to load golden ctx image"); + goto out; + } } -#ifdef CONFIG_NVGPU_POWER_PG - /* Re-enable ELPG now that golden image has been initialized. - * The PMU PG init code may already have tried to enable elpg, but - * would not have been able to complete this action since the golden - * image hadn't been initialized yet, so do this now. 
- */ - err = nvgpu_pmu_reenable_elpg(g); - if (err != 0) { - nvgpu_err(g, "fail to re-enable elpg"); - goto out; - } -#endif - - /* load golden image */ - nvgpu_gr_ctx_load_golden_ctx_image(g, gr_ctx, mappings, - golden_image->local_golden_image, cde); - nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(g, config, gr_ctx, subctx, mappings); #ifndef CONFIG_NVGPU_NON_FUSA if (g->ops.gpu_class.is_valid_compute(class_num) && - g->ops.gr.init.set_default_compute_regs != NULL) { + (g->ops.gr.init.set_default_compute_regs != NULL) && + (!nvgpu_gr_obj_ctx_default_compute_regs_patched(gr_ctx))) { g->ops.gr.init.set_default_compute_regs(g, gr_ctx); + nvgpu_gr_obj_ctx_set_default_compute_regs_patched(gr_ctx, true); } - if (g->ops.ltc.set_default_l2_max_ways_evict_last != NULL) { + if ((g->ops.ltc.set_default_l2_max_ways_evict_last != NULL) && + (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx))) { g->ops.ltc.set_default_l2_max_ways_evict_last(g, gr_ctx); } #endif #ifdef CONFIG_NVGPU_NON_FUSA - if (g->ops.gr.init.enable_mme_config_ptimer != NULL) { + if ((g->ops.gr.init.enable_mme_config_ptimer != NULL) && + (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx))) { + err = nvgpu_pg_elpg_protected_call(g, g->ops.gr.init.enable_mme_config_ptimer(g, gr_ctx)); @@ -1076,8 +1229,10 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g, * required for graphics contexts. */ if (g->ops.gpu_class.is_valid_gfx(class_num) && - g->ops.gr.init.set_default_gfx_regs != NULL) { + (g->ops.gr.init.set_default_gfx_regs != NULL) && + (!nvgpu_gr_obj_ctx_default_gfx_regs_patched(gr_ctx))) { g->ops.gr.init.set_default_gfx_regs(g, gr_ctx, &golden_image->gfx_regs); + nvgpu_gr_obj_ctx_set_default_gfx_regs_patched(gr_ctx, true); } nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done"); diff --git a/drivers/gpu/nvgpu/common/gr/subctx.c b/drivers/gpu/nvgpu/common/gr/subctx.c index 6cc6d5773..09c930b4c 100644 --- a/drivers/gpu/nvgpu/common/gr/subctx.c +++ b/drivers/gpu/nvgpu/common/gr/subctx.c @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -30,17 +31,16 @@ #include "common/gr/subctx_priv.h" -struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g, +int nvgpu_gr_subctx_setup_header(struct gk20a *g, + struct nvgpu_gr_subctx *subctx, struct vm_gk20a *vm) { - struct nvgpu_gr_subctx *subctx; int err = 0; - nvgpu_log_fn(g, " "); + nvgpu_log(g, gpu_dbg_gr, " "); - subctx = nvgpu_kzalloc(g, sizeof(*subctx)); - if (subctx == NULL) { - return NULL; + if (subctx->ctx_header.gpu_va != 0ULL) { + return 0; } err = nvgpu_dma_alloc_sys(g, @@ -48,7 +48,7 @@ struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g, &subctx->ctx_header); if (err != 0) { nvgpu_err(g, "failed to allocate sub ctx header"); - goto err_free_subctx; + return err; } subctx->ctx_header.gpu_va = nvgpu_gmmu_map(vm, @@ -58,26 +58,65 @@ struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g, subctx->ctx_header.aperture); if (subctx->ctx_header.gpu_va == 0ULL) { nvgpu_err(g, "failed to map ctx header"); + err = -ENOMEM; goto err_free_ctx_header; } - return subctx; + nvgpu_log(g, gpu_dbg_gr, "done"); + + return 0; err_free_ctx_header: nvgpu_dma_free(g, &subctx->ctx_header); -err_free_subctx: - nvgpu_kfree(g, subctx); - return NULL; + return err; +} + +struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g) +{ + struct nvgpu_gr_subctx *subctx; + + nvgpu_log(g, gpu_dbg_gr, " "); + + subctx = nvgpu_kzalloc(g, sizeof(*subctx)); + if (subctx == NULL) { + return NULL; + } + + nvgpu_init_list_node(&subctx->gr_ctx_mappings_entry); + + nvgpu_log(g, gpu_dbg_gr, "done"); + + return 
subctx; } void nvgpu_gr_subctx_free(struct gk20a *g, - struct nvgpu_gr_subctx *subctx, - struct vm_gk20a *vm) + struct nvgpu_tsg_subctx *subctx, + struct vm_gk20a *vm, + bool unmap) { - nvgpu_log_fn(g, " "); + struct nvgpu_gr_subctx *gr_subctx = + nvgpu_tsg_subctx_get_gr_subctx(subctx); - nvgpu_dma_unmap_free(vm, &subctx->ctx_header); - nvgpu_kfree(g, subctx); + nvgpu_log(g, gpu_dbg_gr, " "); + + if (gr_subctx == NULL) { + return; + } + + if (gr_subctx->mappings != NULL) { + nvgpu_list_del(&gr_subctx->gr_ctx_mappings_entry); + nvgpu_gr_ctx_mappings_free_subctx_mappings(subctx, + gr_subctx->mappings, unmap); + gr_subctx->mappings = NULL; + } + + if (unmap) { + nvgpu_dma_unmap_free(vm, &gr_subctx->ctx_header); + } + + nvgpu_kfree(g, gr_subctx); + + nvgpu_log(g, gpu_dbg_gr, "done"); } void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g, @@ -100,11 +139,6 @@ void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g, g->ops.gr.ctxsw_prog.set_patch_addr(g, ctxheader, nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_PATCH_CTX)); -#ifdef CONFIG_NVGPU_DEBUGGER - g->ops.gr.ctxsw_prog.set_pm_ptr(g, ctxheader, - nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_PM_CTX)); -#endif - #ifdef CONFIG_NVGPU_GRAPHICS g->ops.gr.ctxsw_prog.set_zcull_ptr(g, ctxheader, nvgpu_gr_ctx_get_zcull_ctx_va(gr_ctx)); @@ -120,6 +154,16 @@ struct nvgpu_mem *nvgpu_gr_subctx_get_ctx_header(struct nvgpu_gr_subctx *subctx) return &subctx->ctx_header; } +struct nvgpu_gr_ctx_mappings *nvgpu_gr_subctx_get_mappings( + struct nvgpu_gr_subctx *subctx) +{ + if (subctx == NULL) { + return NULL; + } + + return subctx->mappings; +} + #ifdef CONFIG_NVGPU_GRAPHICS void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx) @@ -134,25 +178,59 @@ void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx #ifdef CONFIG_NVGPU_GFXP void nvgpu_gr_subctx_set_preemption_buffer_va(struct gk20a *g, - struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings) + struct nvgpu_gr_subctx *subctx, + struct nvgpu_gr_ctx_mappings *veid0_mappings) { - u64 preempt_ctxsw_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings, - NVGPU_GR_CTX_PREEMPT_CTXSW); + u64 preempt_ctxsw_veid0_gpu_va; + u64 preempt_ctxsw_gpu_va; + struct nvgpu_mem *ctxheader; - g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, &subctx->ctx_header, + ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx); + + preempt_ctxsw_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va( + subctx->mappings, + NVGPU_GR_CTX_PREEMPT_CTXSW); + + preempt_ctxsw_veid0_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va( + veid0_mappings, + NVGPU_GR_CTX_PREEMPT_CTXSW); + + g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, ctxheader, preempt_ctxsw_gpu_va); if (g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0 != NULL) { g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0(g, - &subctx->ctx_header, preempt_ctxsw_gpu_va); + ctxheader, preempt_ctxsw_veid0_gpu_va); + } +} + +void nvgpu_gr_subctx_clear_preemption_buffer_va(struct gk20a *g, + struct nvgpu_gr_subctx *subctx) +{ + struct nvgpu_mem *ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx); + + g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, ctxheader, 0ULL); + + if (g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0 != NULL) { + g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0(g, + ctxheader, 0ULL); } } #endif /* CONFIG_NVGPU_GFXP */ #ifdef CONFIG_NVGPU_DEBUGGER void nvgpu_gr_subctx_set_hwpm_ptr(struct gk20a *g, - struct nvgpu_gr_subctx *subctx, u64 pm_ctx_gpu_va) + struct nvgpu_gr_subctx 
*subctx, + bool set_pm_ctx_gpu_va) { + u64 pm_ctx_gpu_va = 0ULL; + + if (set_pm_ctx_gpu_va) { + pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va( + subctx->mappings, + NVGPU_GR_CTX_PM_CTX); + } + g->ops.gr.ctxsw_prog.set_pm_ptr(g, &subctx->ctx_header, pm_ctx_gpu_va); } diff --git a/drivers/gpu/nvgpu/common/gr/subctx_priv.h b/drivers/gpu/nvgpu/common/gr/subctx_priv.h index 5737aedd8..ff2e6dadd 100644 --- a/drivers/gpu/nvgpu/common/gr/subctx_priv.h +++ b/drivers/gpu/nvgpu/common/gr/subctx_priv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -27,14 +27,23 @@ struct nvgpu_mem; /** * GR subcontext data structure. - * - * One subcontext is allocated per GPU channel. */ struct nvgpu_gr_subctx { /** * Memory to hold subcontext header image. */ struct nvgpu_mem ctx_header; + + /** + * GPU mappings of the GR ctx buffers for this subcontext. + */ + struct nvgpu_gr_ctx_mappings *mappings; + + /** + * GR subcontext's entry in gr ctx mappings' (#nvgpu_gr_ctx_mappings) + * subcontexts list #subctx_list. + */ + struct nvgpu_list_node gr_ctx_mappings_entry; }; #endif /* NVGPU_GR_SUBCTX_PRIV_H */ diff --git a/drivers/gpu/nvgpu/common/gr/zcull.c b/drivers/gpu/nvgpu/common/gr/zcull.c index f745ea9dd..c374e2586 100644 --- a/drivers/gpu/nvgpu/common/gr/zcull.c +++ b/drivers/gpu/nvgpu/common/gr/zcull.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -159,15 +160,17 @@ int nvgpu_gr_zcull_init_hw(struct gk20a *g, return 0; } -int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx, +int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx) { + struct nvgpu_gr_subctx *gr_subctx; int ret = 0; - if (subctx != NULL) { + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + gr_subctx = nvgpu_tsg_subctx_get_gr_subctx(subctx); ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, false); if (ret == 0) { - nvgpu_gr_subctx_zcull_setup(g, subctx, gr_ctx); + nvgpu_gr_subctx_zcull_setup(g, gr_subctx, gr_ctx); } } else { ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, true); diff --git a/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c b/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c index c50e95720..7d5d127f7 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c +++ b/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c @@ -50,6 +50,8 @@ #include #include +#include + #include "gr_vgpu.h" #include "ctx_vgpu.h" #include "subctx_vgpu.h" @@ -173,6 +175,7 @@ int vgpu_gr_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags) struct nvgpu_tsg *tsg = NULL; struct tegra_vgpu_cmd_msg msg = {}; struct tegra_vgpu_alloc_obj_ctx_params *p = &msg.params.alloc_obj_ctx; + struct nvgpu_gr_ctx_mappings *mappings = NULL; int err = 0; nvgpu_log_fn(g, " "); @@ -211,11 +214,27 @@ int vgpu_gr_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags) gr_ctx = tsg->gr_ctx; nvgpu_mutex_acquire(&tsg->ctx_init_lock); - if (tsg->vm == NULL) { - tsg->vm = c->vm; - nvgpu_vm_get(tsg->vm); - gr_ctx->tsgid = tsg->tsgid; + + /* + * gr_subctx and mappings are allocated/setup here just to track the + * VM references. When a new mapping is created VM reference is taken. + * It will be dropped when the last channel in the subcontext is + * released. 
+ */ + err = nvgpu_tsg_subctx_alloc_gr_subctx(g, c); + if (err != 0) { + nvgpu_err(g, "failed to alloc gr subctx"); + nvgpu_mutex_release(&tsg->ctx_init_lock); + return err; } + + mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, c); + if (mappings == NULL) { + nvgpu_err(g, "fail to allocate/get ctx mappings struct"); + nvgpu_mutex_release(&tsg->ctx_init_lock); + return -ENOMEM; + } + nvgpu_mutex_release(&tsg->ctx_init_lock); msg.cmd = TEGRA_VGPU_CMD_ALLOC_OBJ_CTX; @@ -234,6 +253,7 @@ int vgpu_gr_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags) err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); err = err ? err : msg.ret; if (err == 0) { + gr_ctx->tsgid = tsg->tsgid; nvgpu_gr_ctx_mark_ctx_initialized(gr_ctx); } else { nvgpu_err(g, "alloc obj ctx failed err %d", err); diff --git a/drivers/gpu/nvgpu/common/vgpu/gr/subctx_vgpu.c b/drivers/gpu/nvgpu/common/vgpu/gr/subctx_vgpu.c index 850ee875d..ac47bf163 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gr/subctx_vgpu.c +++ b/drivers/gpu/nvgpu/common/vgpu/gr/subctx_vgpu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,35 +20,22 @@ * DEALINGS IN THE SOFTWARE. */ -#include -#include -#include +#include #include - -#include "common/gr/subctx_priv.h" +#include +#include #include "subctx_vgpu.h" -#include "common/vgpu/ivc/comm_vgpu.h" void vgpu_gr_setup_free_subctx(struct nvgpu_channel *c) { - struct tegra_vgpu_cmd_msg msg = {}; - struct tegra_vgpu_free_ctx_header_params *p = - &msg.params.free_ctx_header; - struct gk20a *g = c->g; - int err; + nvgpu_log(c->g, gpu_dbg_gr, " "); - msg.cmd = TEGRA_VGPU_CMD_FREE_CTX_HEADER; - msg.handle = vgpu_get_handle(g); - p->ch_handle = c->virt_ctx; - err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); - err = err ? 
err : msg.ret; - if (unlikely(err != 0)) { - nvgpu_err(g, "free ctx_header failed err %d", err); + if (!nvgpu_is_enabled(c->g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return; } - if (c->subctx != NULL) { - nvgpu_kfree(g, c->subctx); - c->subctx = NULL; - } + nvgpu_gr_subctx_free(c->g, c->subctx, c->vm, false); + + nvgpu_log(c->g, gpu_dbg_gr, "done"); } diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c index 4258f2ff7..fbbc6d968 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "gr_gk20a.h" #include "gr_pri_gk20a.h" @@ -82,15 +83,16 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, struct nvgpu_tsg *tsg, u32 mode) { - struct nvgpu_channel *ch; + bool set_pm_ctx_gpu_va = false; struct nvgpu_gr_ctx *gr_ctx; bool skip_update = false; - u64 pm_ctx_gpu_va = 0ULL; int ret; struct nvgpu_gr *gr = nvgpu_gr_get_instance_ptr(g, gr_instance_id); nvgpu_log_fn(g, " "); + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + gr_ctx = tsg->gr_ctx; if (mode != NVGPU_GR_CTX_HWPM_CTXSW_MODE_NO_CTXSW) { @@ -99,6 +101,7 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, if (ret != 0) { nvgpu_err(g, "failed to allocate and map pm ctxt buffer"); + nvgpu_mutex_release(&tsg->ctx_init_lock); return ret; } @@ -109,11 +112,14 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, } ret = nvgpu_gr_ctx_prepare_hwpm_mode(g, gr_ctx, mode, - &pm_ctx_gpu_va, &skip_update); + &set_pm_ctx_gpu_va, &skip_update); if (ret != 0) { + nvgpu_mutex_release(&tsg->ctx_init_lock); return ret; } + if (skip_update) { + nvgpu_mutex_release(&tsg->ctx_init_lock); return 0; } @@ -128,20 +134,16 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, nvgpu_gr_ctx_set_hwpm_pm_mode(g, gr_ctx); if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { - nvgpu_rwsem_down_read(&tsg->ch_list_lock); - nvgpu_list_for_each_entry(ch, &tsg->ch_list, - nvgpu_channel, ch_entry) { - nvgpu_gr_subctx_set_hwpm_ptr(g, ch->subctx, - pm_ctx_gpu_va); - } - nvgpu_rwsem_up_read(&tsg->ch_list_lock); + nvgpu_tsg_subctxs_set_pm_buffer_va(tsg, set_pm_ctx_gpu_va); } else { - nvgpu_gr_ctx_set_hwpm_ptr(g, gr_ctx, pm_ctx_gpu_va); + nvgpu_gr_ctx_set_hwpm_ptr(g, gr_ctx, set_pm_ctx_gpu_va); } out: g->ops.tsg.enable(tsg); + nvgpu_mutex_release(&tsg->ctx_init_lock); + return ret; } diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index 0243b44d9..573a2a5a8 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -43,7 +43,6 @@ struct nvgpu_fence_type; struct nvgpu_swprofiler; struct nvgpu_channel_sync; struct nvgpu_gpfifo_userdata; -struct nvgpu_gr_subctx; struct nvgpu_gr_ctx; struct nvgpu_debug_context; struct priv_cmd_queue; @@ -363,6 +362,12 @@ struct nvgpu_channel { /** Channel's entry in TSG's channel list. */ struct nvgpu_list_node ch_entry; + /** + * Channel's entry in TSG Subcontext's (#nvgpu_tsg_subctx) channels list + * #ch_list. + */ + struct nvgpu_list_node subctx_entry; + #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT struct nvgpu_channel_joblist joblist; struct gpfifo_desc gpfifo; @@ -440,8 +445,8 @@ struct nvgpu_channel { u64 virt_ctx; #endif - /** Channel's graphics subcontext. */ - struct nvgpu_gr_subctx *subctx; + /** Channel's subcontext. */ + struct nvgpu_tsg_subctx *subctx; /** Lock to access unserviceable state. 
*/ struct nvgpu_spinlock unserviceable_lock; diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/ctx.h b/drivers/gpu/nvgpu/include/nvgpu/gr/ctx.h index 990c68cf6..f50e985f0 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/ctx.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/ctx.h @@ -42,6 +42,7 @@ struct gk20a; struct vm_gk20a; struct nvgpu_tsg; struct nvgpu_gr_ctx; +struct nvgpu_channel; struct nvgpu_gr_ctx_mappings; struct nvgpu_gr_global_ctx_buffer_desc; struct nvgpu_gr_global_ctx_local_golden_image; @@ -470,30 +471,33 @@ void nvgpu_gr_ctx_init_ctx_buffers_mapping_flags(struct gk20a *g, struct nvgpu_gr_ctx *ctx); /** - * @brief Allocate or get GR ctx buffers mappings for a TSG. + * @brief Allocate or get GR ctx buffers mappings for a TSG/Subcontext. * * @param g [in] Pointer to GPU driver struct. * @param tsg [in] Pointer to TSG struct. - * @param vm [in] Pointer to vm struct. + * @param ch [in] Pointer to Channel struct. * - * This function allocates the mappings struct for TSG corresponding to - * given vm if not available already else returns the same. + * This function allocates the mappings struct for TSG/subcontext corresponding + * to given Channel's VM if not available already else returns the same. * * @return mappings struct in case of success, null in case of failure. */ struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_alloc_or_get_mappings(struct gk20a *g, - struct nvgpu_tsg *tsg, struct vm_gk20a *vm); + struct nvgpu_tsg *tsg, struct nvgpu_channel *ch); /** - * @brief Get GR ctx buffers mappings for a TSG. + * @brief Get GR ctx buffers mappings for a TSG or Subcontext corresponding to + * a channel. * * @param tsg [in] Pointer to TSG struct. + * @param ch [in] Pointer to Channel struct. * * This function returns the mappings struct for TSG. * * @return mappings struct. */ -struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg); +struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch); /** * @brief Free the gr ctx mapping struct. 
@@ -564,8 +568,7 @@ bool nvgpu_gr_ctx_desc_force_preemption_cilp( #ifdef CONFIG_NVGPU_GFXP void nvgpu_gr_ctx_set_preemption_buffer_va(struct gk20a *g, - struct nvgpu_gr_ctx *gr_ctx, - struct nvgpu_gr_ctx_mappings *mappings); + struct nvgpu_gr_ctx *gr_ctx); bool nvgpu_gr_ctx_desc_force_preemption_gfxp( struct nvgpu_gr_ctx_desc *gr_ctx_desc); @@ -608,10 +611,10 @@ int nvgpu_gr_ctx_set_smpc_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, - u32 mode, u64 *pm_ctx_gpu_va, bool *skip_update); + u32 mode, bool *set_pm_ctx_gpu_va, bool *skip_update); void nvgpu_gr_ctx_set_hwpm_pm_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx); void nvgpu_gr_ctx_set_hwpm_ptr(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, - u64 pm_ctx_gpu_va); + bool set_pm_ctx_gpu_va); void nvgpu_gr_ctx_set_pm_ctx_mapped(struct nvgpu_gr_ctx *ctx, bool mapped); #ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING @@ -625,4 +628,17 @@ bool nvgpu_gr_ctx_desc_dump_ctxsw_stats_on_channel_close( struct nvgpu_gr_ctx_desc *gr_ctx_desc); #endif +bool nvgpu_gr_obj_ctx_global_ctx_buffers_patched(struct nvgpu_gr_ctx *gr_ctx); +void nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched); +bool nvgpu_gr_obj_ctx_preempt_buffers_patched(struct nvgpu_gr_ctx *gr_ctx); +void nvgpu_gr_obj_ctx_set_preempt_buffers_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched); +bool nvgpu_gr_obj_ctx_default_compute_regs_patched(struct nvgpu_gr_ctx *gr_ctx); +void nvgpu_gr_obj_ctx_set_default_compute_regs_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched); +bool nvgpu_gr_obj_ctx_default_gfx_regs_patched(struct nvgpu_gr_ctx *gr_ctx); +void nvgpu_gr_obj_ctx_set_default_gfx_regs_patched( + struct nvgpu_gr_ctx *gr_ctx, bool patched); + #endif /* NVGPU_GR_CTX_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/ctx_mappings.h b/drivers/gpu/nvgpu/include/nvgpu/gr/ctx_mappings.h index 0ce3e54a4..4f99f715b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/ctx_mappings.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/ctx_mappings.h @@ -27,6 +27,7 @@ struct gk20a; struct nvgpu_tsg; struct vm_gk20a; struct nvgpu_gr_ctx; +struct nvgpu_gr_subctx; struct nvgpu_gr_ctx_mappings; struct nvgpu_gr_global_ctx_buffer_desc; @@ -78,15 +79,18 @@ int nvgpu_gr_ctx_mappings_map_ctx_buffer(struct gk20a *g, * * @param g [in] Pointer to GPU driver struct. * @param ctx [in] Pointer to GR context struct. + * @param subctx [in] Pointer to TSG subcontext struct. * @param mappings [in] Pointer to GR context buffer mappings struct. * * This function will map the GR context preemption buffers in #mappings->vm - * and stores the mapped address. + * and stores the mapped address. For subcontext case NVGPU_GR_CTX_PREEMPT_CTXSW + * buffer is mapped to all subcontexts. * * @return 0 in case of success, < 0 in case of failure. */ int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g, struct nvgpu_gr_ctx *ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings); /** @@ -94,6 +98,7 @@ int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g, * * @param g [in] Pointer to GPU driver struct. * @param gr_ctx [in] Pointer to GR context struct. + * @param subctx [in] Pointer to TSG subcontext struct. * @param global_ctx_buffer [in] Pointer global context buffer desc. * @param mappings [in] Pointer to GR context buffer * mappings struct. 
@@ -106,7 +111,7 @@ int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g, * @return 0 in case of success, < 0 in case of failure. */ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g, - struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_ctx_mappings *mappings, bool vpr); @@ -116,6 +121,7 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g, * * @param g [in] Pointer to GPU driver struct. * @param gr_ctx [in] Pointer to GR context struct. + * @param subctx [in] Pointer to TSG subcontext struct. * @param global_ctx_buffer [in] Pointer global context buffer desc. * @param mappings [in] Pointer to GR context buffer * mappings struct. @@ -124,6 +130,7 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g, */ void nvgpu_gr_ctx_unmap_buffers(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_ctx_mappings *mappings); @@ -157,4 +164,80 @@ u64 nvgpu_gr_ctx_mappings_get_global_ctx_va(struct nvgpu_gr_ctx_mappings *mappin u64 nvgpu_gr_ctx_mappings_get_ctx_va(struct nvgpu_gr_ctx_mappings *mappings, u32 index); +/** + * @brief Get GR ctx buffers mappings for a TSG corresponding to VM. + * + * @param g [in] Pointer to GPU driver struct. + * @param tsg [in] Pointer to TSG struct. + * @param vm [in] Pointer to vm struct. + * + * This function retrieves the mappings struct for TSG corresponding to + * given vm from #tsg->gr_ctx_mappings_list. + * + * @return mappings struct in case of success, null in case of failure. + */ +struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_get_subctx_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg, + struct vm_gk20a *vm); + +/** + * @brief Allocate GR ctx buffers mappings for a TSG corresponding to VM. + * + * @param g [in] Pointer to GPU driver struct. + * @param tsg [in] Pointer to TSG struct. + * @param vm [in] Pointer to vm struct. + * + * This function allocates the mappings struct for TSG corresponding to + * given vm and inserts in #tsg->gr_ctx_mappings_list. + * + * @return mappings struct in case of success, null in case of failure. + */ +struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_create_subctx_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg, + struct vm_gk20a *vm); + +/** + * @brief Link GR subctx to mappings struct. + * + * @param mappings [in] Pointer to GR context buffers mappings struct. + * @param subctx [in] Pointer to GR subcontext struct. + * + * This function checks and inserts the subctx in #mappings->subctx_list. + */ +void nvgpu_gr_ctx_mappings_add_gr_subctx(struct nvgpu_gr_ctx_mappings *mappings, + struct nvgpu_gr_subctx *subctx); + +/** + * @brief Free GR context buffers mappings struct for subcontexts. + * + * @param subctx [in] Pointer to GR subcontext struct. + * @param mappings [in] Pointer to GR context buffers mappings struct. + * @param unmap [in] Indicates if the GR context buffers are to be + * unmapped. true in case of native nvgpu config, + * false in case of vgpu config. For vgpu case, + * this path is used to handle the VM references + * per subcontext. + * + * This function checks if the #mappings->subctx_list is empty and if empty, + * unmaps the buffers and deletes the mappings. 
+ */ +void nvgpu_gr_ctx_mappings_free_subctx_mappings(struct nvgpu_tsg_subctx *subctx, + struct nvgpu_gr_ctx_mappings *mappings, bool unmap); + +/** + * @brief Map GR context buffer to all subcontext VMs. + * + * @param tsg [in] Pointer to tsg struct. + * @param index [in] Index of the buffer to be mapped. + * + * This function maps the GR context buffer at #index to all VMs listed + * in #tsg->gr_ctx_mappings_list. + * + * @return 0 in case of success, < 0 in case of failure. + */ +int nvgpu_gr_ctx_mappings_map_buffer_all_subctx( + struct nvgpu_tsg *tsg, u32 index); + #endif /* NVGPU_GR_CTX_MAPPINGS_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/gr_utils.h b/drivers/gpu/nvgpu/include/nvgpu/gr/gr_utils.h index 6096fedd2..e85fe09ca 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/gr_utils.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/gr_utils.h @@ -130,9 +130,7 @@ struct nvgpu_gr_hwpm_map *nvgpu_gr_get_hwpm_map_ptr(struct gk20a *g); void nvgpu_gr_reset_falcon_ptr(struct gk20a *g); void nvgpu_gr_reset_golden_image_ptr(struct gk20a *g); #endif -#ifdef CONFIG_NVGPU_FECS_TRACE struct nvgpu_gr_global_ctx_buffer_desc *nvgpu_gr_get_global_ctx_buffer_ptr( struct gk20a *g); -#endif #endif /* NVGPU_GR_UTILS_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/obj_ctx.h b/drivers/gpu/nvgpu/include/nvgpu/gr/obj_ctx.h index ee5d147b5..d5edab064 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/obj_ctx.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/obj_ctx.h @@ -34,7 +34,7 @@ struct gk20a; struct nvgpu_gr_ctx; struct nvgpu_gr_ctx_mappings; -struct nvgpu_gr_subctx; +struct nvgpu_tsg_subctx; struct nvgpu_gr_config; struct nvgpu_gr_ctx_desc; struct vm_gk20a; @@ -70,7 +70,7 @@ void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g, * @param g [in] Pointer to GPU driver struct. * @param inst_block [in] Pointer to channel instance block. * @param gr_ctx [in] Pointer to graphics context buffer. - * @param subctx [in] Pointer to graphics subcontext buffer. + * @param subctx [in] Pointer to TSG subcontext struct. * @param mappings [in] Pointer to mappings of the GR context buffers. * * If graphics subcontexts are supported, subcontext buffer GPU virtual @@ -82,9 +82,23 @@ void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g, * instance block. */ void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block, - struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings); +/** + * brief Check if the VEID is sync when subcontexts are enabled. + * + * @param g [in] Pointer to GPU driver struct. + * @param subctx [in] Pointer to TSG subcontext struct. + * + * @retval true if subcontexts are enabled, MIG is disabled and \a subctx + * corresponds to VEID0. + * @retval true if subcontexts are disabled and MIG is disabled. + * @retval false otherwise. + */ +bool nvgpu_gr_obj_ctx_is_gfx_engine(struct gk20a *g, + struct nvgpu_tsg_subctx *subctx); + /** * brief Initialize preemption mode in context struct. * @@ -120,7 +134,7 @@ int nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(struct gk20a *g, * @param g [in] Pointer to GPU driver struct. * @param config [in] Pointer to GR configuration struct. * @param gr_ctx [in] Pointer to graphics context. - * @param subctx [in] Pointer to graphics subcontext buffer. + * @param subctx [in] Pointer to TSG subcontext struct. * @param mappings [in] Pointer to mappings of GR context buffers. 
* * This function will read preemption modes stored in #nvgpu_gr_ctx @@ -134,7 +148,7 @@ int nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(struct gk20a *g, */ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g, struct nvgpu_gr_config *config, - struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx, + struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings); /** @@ -144,6 +158,7 @@ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g, * @param global_ctx_buffer [in] Pointer to global context descriptor struct. * @param config [in] Pointer to GR configuration struct. * @param gr_ctx [in] Pointer to graphics context. + * @param subctx [in] Pointer to TSG subcontext struct. * @param mappings [in] Pointer to mappings of GR context buffers. * @param patch [in] Boolean flag to use patch context buffer. * @@ -156,7 +171,8 @@ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g, void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, - struct nvgpu_gr_ctx_mappings *mappings, bool patch); + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings, + bool patch); /** * @brief Allocate and setup object context s/w image for VEID0 GPU channel. @@ -191,6 +207,7 @@ int nvgpu_gr_obj_ctx_init_golden_context_image(struct gk20a *g); * @param global_ctx_buffer [in] Pointer to global context descriptor struct. * @param config [in] Pointer to GR configuration struct. * @param gr_ctx [in] Pointer to graphics context. + * @param subctx [in] Pointer to TSG subcontext struct. * @param inst_block [in] Pointer to channel instance block. * * This function allocates golden context image. @@ -222,6 +239,7 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g, struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings, struct nvgpu_mem *inst_block); @@ -234,7 +252,7 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g, * @param gr_ctx_desc [in] Pointer to GR context descriptor struct. * @param config [in] Pointer to GR configuration struct. * @param gr_ctx [in] Pointer to graphics context. - * @param subctx [in] Pointer to graphics subcontext buffer. + * @param subctx [in] Pointer to TSG subcontext struct. * @param mappings [in] Pointer to mappings of the GR context buffers. * @param inst_block [in] Pointer to channel instance block. * @param class_num [in] GR engine class. @@ -274,7 +292,7 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g, struct nvgpu_gr_ctx_desc *gr_ctx_desc, struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, - struct nvgpu_gr_subctx *subctx, + struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings, struct nvgpu_mem *inst_block, u32 class_num, u32 flags, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h b/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h index 4b6ed33d0..593ed6d89 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h @@ -106,6 +106,20 @@ void nvgpu_gr_setup_free_gr_ctx(struct gk20a *g, */ void nvgpu_gr_setup_free_subctx(struct nvgpu_channel *c); +/** + * @brief Validate preemption mode in GR engine context image in case + * Application optionally wants to change default preemption mode. 
+ * + * @param graphics_preempt_mode [in] Requested graphics preemption mode. + * @param compute_preempt_mode [in] Requested compute preemption mode. + * @param gr_ctx [in] Pointer to GR engine context image. + * + * @return true in case of success, false in case of failure. + */ +bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode, + u32 *compute_preempt_mode, + struct nvgpu_gr_ctx *gr_ctx); + /** * @brief Setup preemption mode in GR engine context image in case * Application optionally wants to change default preemption mode. diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/subctx.h b/drivers/gpu/nvgpu/include/nvgpu/gr/subctx.h index 8739a165b..df2b98d26 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/subctx.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/subctx.h @@ -32,41 +32,62 @@ */ struct gk20a; struct vm_gk20a; +struct nvgpu_gr_ctx; struct nvgpu_gr_subctx; struct nvgpu_mem; struct nvgpu_gr_ctx_mappings; /** - * @brief Allocate graphics subcontext buffer. - * - * @param g [in] Pointer to GPU driver struct. - * @param vm [in] Pointer to virtual memory. - * - * This function allocates memory for #nvgpu_gr_subctx structure - * and subcontext header stored in #nvgpu_gr_subctx structure. - * - * Subcontext header memory will be mapped to given virtual - * memory. - * - * @return pointer to #nvgpu_gr_subctx struct in case of success, - * NULL in case of failure. - */ -struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g, - struct vm_gk20a *vm); - -/** - * @brief Free graphics subcontext buffer. + * @brief Allocate and map graphics subcontext context header buffer. * * @param g [in] Pointer to GPU driver struct. * @param subctx [in] Pointer to graphics subcontext struct. * @param vm [in] Pointer to virtual memory. * + * This function allocates memory for subcontext header stored in + * #nvgpu_gr_subctx structure. + * + * Subcontext header memory will be mapped to given virtual + * memory. + * + * @return 0 in case of success, < 0 in case of failure. + */ +int nvgpu_gr_subctx_setup_header(struct gk20a *g, + struct nvgpu_gr_subctx *subctx, + struct vm_gk20a *vm); + +/** + * @brief Allocate graphics subcontext buffer. + * + * @param g [in] Pointer to GPU driver struct. + * + * This function allocates memory for #nvgpu_gr_subctx structure. + * + * @return pointer to #nvgpu_gr_subctx struct in case of success, + * NULL in case of failure. + */ +struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g); + +/** + * @brief Free graphics subcontext buffer. + * + * @param g [in] Pointer to GPU driver struct. + * @param subctx [in] Pointer to TSG subcontext struct. + * @param vm [in] Pointer to virtual memory. + * @param unmap [in] Indicates if GR context buffers and subctx + * buffer are to be unmapped. + * true in case of native nvgpu config and + * false in case of vgpu config. For vgpu case, + * this path is used to handle the VM references + * per subcontext. + * * This function will free memory allocated for subcontext header and * #nvgpu_gr_subctx structure. */ void nvgpu_gr_subctx_free(struct gk20a *g, - struct nvgpu_gr_subctx *subctx, - struct vm_gk20a *vm); + struct nvgpu_tsg_subctx *subctx, + struct vm_gk20a *vm, + bool unmap); /** * @brief Initialize graphics subcontext buffer header. @@ -101,6 +122,19 @@ void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g, */ struct nvgpu_mem *nvgpu_gr_subctx_get_ctx_header(struct nvgpu_gr_subctx *subctx); +/** + * @brief Get pointer of GR context buffers mappings struct for a subcontext. 
+ * + * @param subctx [in] Pointer to graphics subcontext struct. + * + * This function returns #nvgpu_gr_ctx_mappings pointer of GR context buffers + * mappings stored in #nvgpu_gr_subctx. + * + * @return pointer to subcontext GR context buffers mappings struct. + */ +struct nvgpu_gr_ctx_mappings *nvgpu_gr_subctx_get_mappings( + struct nvgpu_gr_subctx *subctx); + #ifdef CONFIG_NVGPU_GRAPHICS void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx); @@ -108,10 +142,14 @@ void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx void nvgpu_gr_subctx_set_preemption_buffer_va(struct gk20a *g, struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings); + +void nvgpu_gr_subctx_clear_preemption_buffer_va(struct gk20a *g, + struct nvgpu_gr_subctx *subctx); #endif #ifdef CONFIG_NVGPU_DEBUGGER void nvgpu_gr_subctx_set_hwpm_ptr(struct gk20a *g, - struct nvgpu_gr_subctx *subctx, u64 pm_ctx_gpu_va); + struct nvgpu_gr_subctx *subctx, + bool set_pm_ctx_gpu_va); #endif #endif /* NVGPU_GR_SUBCTX_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/zcull.h b/drivers/gpu/nvgpu/include/nvgpu/gr/zcull.h index b3e21c7eb..c84db9129 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/zcull.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/zcull.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -28,8 +28,8 @@ struct gk20a; struct nvgpu_gr_config; struct nvgpu_gr_ctx; -struct nvgpu_gr_subctx; struct nvgpu_gr_zcull; +struct nvgpu_tsg_subctx; struct nvgpu_gr_zcull_info { u32 width_align_pixels; @@ -54,7 +54,7 @@ int nvgpu_gr_zcull_init_hw(struct gk20a *g, struct nvgpu_gr_zcull *gr_zcull, struct nvgpu_gr_config *gr_config); -int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx, +int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx); #endif /* NVGPU_GR_ZCULL_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/tsg.h index 36f907cd7..9b6280df1 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h +++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h @@ -78,8 +78,6 @@ struct nvgpu_tsg { /** Pointer to GPU driver struct. */ struct gk20a *g; - /** Points to TSG's virtual memory */ - struct vm_gk20a *vm; /** * Starting with Volta, when a Channel/TSG is set up, a recovery buffer * region must be allocated in BAR2, to allow engine to save methods if @@ -98,6 +96,12 @@ struct nvgpu_tsg { */ struct nvgpu_gr_ctx *gr_ctx; + /** + * List of gr_ctx buffers maps (#nvgpu_gr_ctx_mappings) for gr ctx + * for this TSG. Accessed by holding #ctx_init_lock from TSG. + */ + struct nvgpu_list_node gr_ctx_mappings_list; + /** * Mutex to prevent concurrent context initialization for channels * in same TSG. All channels in one TSG share the context buffer, @@ -113,6 +117,12 @@ struct nvgpu_tsg { */ struct nvgpu_ref refcount; + /** + * List of subcontexts (#nvgpu_tsg_subctx) bound to this TSG. + * Accessed by holding #ch_list_lock from TSG. + */ + struct nvgpu_list_node subctx_list; + /** List of channels bound to this TSG. 
*/ struct nvgpu_list_node ch_list; #ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL @@ -128,7 +138,7 @@ struct nvgpu_tsg { #endif /** * Read write type of semaphore lock used for accessing/modifying - * #ch_list. + * #ch_list, #subctx_list and #ch_list in #nvgpu_tsg_subctx. */ struct nvgpu_rwsem ch_list_lock; @@ -272,8 +282,6 @@ struct nvgpu_tsg *nvgpu_tsg_open(struct gk20a *g, pid_t pid); * - Call non-NULL HAL to release tsg. This HAL is non-NULL for vgpu only. * - Call nvgpu_free_gr_ctx_struct to free #nvgpu_tsg.gr_ctx. * - Set #nvgpu_tsg.gr_ctx to NULL. - * - If #nvgpu_tsg.vm is non-NULL, do #nvgpu_vm_put for this vm and set - * it to NULL (Unhook TSG from VM). * - If #nvgpu_tsg.sm_error_states is non-NULL, free allocated memory and set * it to NULL. */ @@ -286,7 +294,7 @@ void nvgpu_tsg_release_common(struct gk20a *g, struct nvgpu_tsg *tsg); * * - Get pointer to the #nvgpu_tsg using #ref. * - Call HAL to free #nvgpu_tsg.gr_ctx if this memory pointer is non-NULL - * and valid and also #nvgpu_tsg.vm is non-NULL. + * and valid. * - Unhook all events created on the TSG being released. * -- Acquire #nvgpu_tsg.event_id_list_lock. * -- While #nvgpu_tsg.event_id_list is non-empty, @@ -363,6 +371,7 @@ void nvgpu_tsg_disable(struct nvgpu_tsg *tsg); * - If channel had ASYNC subctx id, then set runqueue selector to 1. * - Set runlist id of TSG to channel's runlist_id if runlist_id of TSG * is set to #NVGPU_INVALID_TSG_ID. + * - Bind channel to TSG subcontext calling #nvgpu_tsg_subctx_bind_channel. * - Call HAL to bind channel to TSG. * - Add channel to TSG's list of channels. See #nvgpu_tsg.ch_list * - Set #nvgpu_channel.tsgid to #nvgpu_tsg.tsgid. @@ -445,6 +454,7 @@ struct nvgpu_tsg *nvgpu_tsg_check_and_get_from_id(struct gk20a *g, u32 tsgid); * - If NEXT bit is set and force is set to false, caller will * have to retry unbind. * - Remove channel from its runlist. + * - Remove channel from subctx by calling #nvgpu_tsg_subctx_unbind_channel. * - Remove channel from TSG's channel list. * - Set tsgid of the channel to #NVGPU_INVALID_TSG_ID. * - Disable channel so that it is not picked up by h/w scheduler. @@ -456,6 +466,7 @@ struct nvgpu_tsg *nvgpu_tsg_check_and_get_from_id(struct gk20a *g, u32 tsgid); * - Call #nvgpu_channel_update_runlist to remove the channel from the runlist. * - Acquire #nvgpu_tsg.ch_list_lock of the tsg and delete channel from * #nvgpu_tsg.ch_list. + * - Remove channel from subctx by calling #nvgpu_tsg_subctx_unbind_channel. * - Remove channel from TSG's channel list. * - Set #nvgpu_channel.tsgid to #NVGPU_INVALID_TSG_ID * - Release #nvgpu_tsg.ch_list_lock of the tsg. diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h b/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h new file mode 100644 index 000000000..73cfd444e --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#ifndef NVGPU_TSG_SUBCTX_H +#define NVGPU_TSG_SUBCTX_H + +#include + +struct gk20a; +struct nvgpu_tsg; +struct nvgpu_tsg_subctx; +struct nvgpu_channel; +/** + * @brief Bind a channel to the TSG subcontext. + * + * @param tsg [in] Pointer to TSG struct. + * @param ch [in] Pointer to Channel struct. + * + * - Loop through the #subctx_list in #tsg to check if the subctx + * exists for the provided channel. + * - If it exists, validate the channel VM with subctx VM. + * - If validated, add the channel to the subctx #ch_list and exit. + * - Else allocate and initialize new subctx structure. + * - Add the channel to the subctx #ch_list and add subctx to the + * TSG #subctx_list. + * + * @return 0 for successful bind or if subctx support is disabled, + * < 0 for failure. + * @retval -EINVAL if channel VM doesn't match with subctx VM for provided + * subctx_id. + * @retval -ENOMEM if subctx allocation fails. + */ +int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch); + +/** + * @brief Unbind a channel from the TSG subcontext. + * + * @param tsg [in] Pointer to TSG struct. + * @param ch [in] Pointer to Channel struct. + * + * - Validate that #subctx is allocated for the channel #ch. + * - Remove the channel from the subctx #ch_list. + * - If the subctx #ch_list is empty + * - Invoke g->ops.gr.setup.free_subctx to free the GR subcontext + * struct (and GR subcontext mappings struct). + * - Remove the subctx from the TSG #subctx_list. + * - Free the subctx memory. If this was the only active channel + * in the TSG this function will delete the objects in the + * sequence: mappings -> gr_subctx -> tsg_subctx + */ +void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch); + +/** + * @brief Allocate GR subcontext for a TSG subcontext. + * + * @param g [in] Pointer to gk20a struct. + * @param ch [in] Pointer to Channel struct. + * + * - Check if TSG subctx is allocated for the channel. + * - If not allocated, return error. + * - If allocated, and if GR subcontext is not allocated call + * #nvgpu_gr_subctx_alloc. + * + * @return 0 for successful allocation, < 0 for failure. + */ +int nvgpu_tsg_subctx_alloc_gr_subctx(struct gk20a *g, struct nvgpu_channel *ch); + +/** + * @brief Allocate and map GR subcontext header for a TSG subcontext. + * + * @param g [in] Pointer to gk20a struct. + * @param ch [in] Pointer to Channel struct. + * + * - Check if TSG and GR subctx is allocated for the channel. + * - If not allocated, return error. + * - If allocated, setup subcontext header by calling + * #nvgpu_gr_subctx_setup_header. + * + * @return 0 for successful allocation, < 0 for failure. + */ +int nvgpu_tsg_subctx_setup_subctx_header(struct gk20a *g, + struct nvgpu_channel *ch); + +/** + * @brief Get GR subcontext for a TSG subcontext. + * + * @param tsg_subctx [in] Pointer to TSG Subcontext struct. + * + * - Return #gr_subctx from #nvgpu_tsg_subctx. 
+ */ +struct nvgpu_gr_subctx *nvgpu_tsg_subctx_get_gr_subctx( + struct nvgpu_tsg_subctx *tsg_subctx); + +/** + * @brief Get id of a TSG subcontext. + * + * @param tsg_subctx [in] Pointer to TSG Subcontext struct. + * + * - Return #subctx_id from #nvgpu_tsg_subctx. + */ +u32 nvgpu_tsg_subctx_get_id(struct nvgpu_tsg_subctx *tsg_subctx); + +/** + * @brief Allocate or get the mappings struct for the TSG subcontext. + * + * @param g [in] Pointer to GPU driver struct. + * @param tsg [in] Pointer to TSG struct. + * @param ch [in] Pointer to Channel struct. + * + * This function allocates the mappings struct for subcontext corresponding + * to given Channel's VM if not available already else returns the same. + * It adds the gr_subctx corresponding to the channel the mapping object's + * subctx_list. + * + * @return mappings struct in case of success, null in case of failure. + */ +struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_alloc_or_get_mappings( + struct gk20a *g, + struct nvgpu_tsg *tsg, + struct nvgpu_channel *ch); + +#ifdef CONFIG_NVGPU_GFXP +/** + * @brief Program preemption buffer virtual addresses for all subcontexts. + * + * @param tsg_subctx [in] Pointer to TSG subcontext struct. + * + * - Checks if VEID0 mappings are available. + * - If available, program the preemption buffer virtual addresses + * (VEID0 VA and VA in subcontext VM) for all GR subcontexts' + * headers. + */ +void nvgpu_tsg_subctxs_set_preemption_buffer_va( + struct nvgpu_tsg_subctx *tsg_subctx); + +/** + * @brief Clear preemption buffer virtual addresses for all subcontexts. + * + * @param tsg_subctx [in] Pointer to TSG subcontext struct. + * + * - Program the preemption buffer virtual addresses + * (VEID0 VA and VA in subcontext VM) for all GR subcontexts' + * headers to 0. + */ +void nvgpu_tsg_subctxs_clear_preemption_buffer_va( + struct nvgpu_tsg_subctx *tsg_subctx); +#endif /* CONFIG_NVGPU_GFXP */ + +#ifdef CONFIG_NVGPU_DEBUGGER +/** + * @brief Program PM buffer virtual addresses for all subcontexts. + * + * @param tsg [in] Pointer to TSG struct. + * @param set_pm_ctx_gpu_va [in] Indicates if PM ctx buffer GPU VA + * is to be programmed. + * + * - Program the PM buffer virtual address for all GR subcontexts' headers. 
+ */ +void nvgpu_tsg_subctxs_set_pm_buffer_va(struct nvgpu_tsg *tsg, + bool set_pm_ctx_gpu_va); +#endif /* CONFIG_NVGPU_DEBUGGER */ + +#endif /* NVGPU_TSG_SUBCTX_H */ diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export index 48e6653ae..d3e9512c4 100644 --- a/libs/igpu/libnvgpu-drv-igpu_safe.export +++ b/libs/igpu/libnvgpu-drv-igpu_safe.export @@ -709,6 +709,10 @@ nvgpu_tsg_store_sm_error_state nvgpu_tsg_get_sm_error_state nvgpu_tsg_abort nvgpu_tsg_bind_channel +nvgpu_tsg_subctx_bind_channel +nvgpu_tsg_subctx_unbind_channel +nvgpu_tsg_subctx_alloc_gr_subctx +nvgpu_tsg_subctx_setup_subctx_header nvgpu_tsg_check_and_get_from_id nvgpu_tsg_cleanup_sw nvgpu_tsg_default_timeslice_us diff --git a/userspace/units/fifo/tsg/nvgpu-tsg.c b/userspace/units/fifo/tsg/nvgpu-tsg.c index 618f15e83..e61e683b9 100644 --- a/userspace/units/fifo/tsg/nvgpu-tsg.c +++ b/userspace/units/fifo/tsg/nvgpu-tsg.c @@ -654,7 +654,6 @@ int test_tsg_release(struct unit_module *m, struct nvgpu_fifo *f = &g->fifo; struct gpu_ops gops = g->ops; struct nvgpu_tsg *tsg = NULL; - struct vm_gk20a vm; u32 branches = 0U; int ret = UNIT_FAIL; u32 free_gr_ctx_mask = @@ -706,12 +705,6 @@ int test_tsg_release(struct unit_module *m, if (branches & F_TSG_RELEASE_MEM) { ret = nvgpu_gr_ctx_alloc_ctx_buffers(g, gr_ctx_desc, tsg->gr_ctx); unit_assert(ret == UNIT_SUCCESS, goto done); - tsg->vm = &vm; - /* prevent nvgpu_vm_remove */ - nvgpu_ref_init(&vm.ref); - nvgpu_ref_get(&vm.ref); - } else { - tsg->vm = NULL; } if ((branches & free_gr_ctx_mask) == free_gr_ctx_mask) { @@ -755,7 +748,6 @@ int test_tsg_release(struct unit_module *m, unit_assert(!f->tsg[tsg->tsgid].in_use, goto done); unit_assert(tsg->gr_ctx == NULL, goto done); - unit_assert(tsg->vm == NULL, goto done); unit_assert(tsg->sm_error_states == NULL, goto done); } ret = UNIT_SUCCESS; diff --git a/userspace/units/gr/ctx/Makefile.tmk b/userspace/units/gr/ctx/Makefile.tmk index af39b0e43..06ee19f07 100644 --- a/userspace/units/gr/ctx/Makefile.tmk +++ b/userspace/units/gr/ctx/Makefile.tmk @@ -28,7 +28,8 @@ NVGPU_UNIT_NAME = nvgpu-gr-ctx NVGPU_UNIT_SRCS = nvgpu-gr-ctx.c NVGPU_UNIT_INTERFACE_DIRS := \ - $(NV_COMPONENT_DIR)/.. + $(NV_COMPONENT_DIR)/.. 
\ + $(NV_COMPONENT_DIR)/../../fifo include $(NV_COMPONENT_DIR)/../../Makefile.units.common.tmk diff --git a/userspace/units/gr/ctx/nvgpu-gr-ctx.c b/userspace/units/gr/ctx/nvgpu-gr-ctx.c index 05b20b8e5..44902fdd2 100644 --- a/userspace/units/gr/ctx/nvgpu-gr-ctx.c +++ b/userspace/units/gr/ctx/nvgpu-gr-ctx.c @@ -42,6 +42,8 @@ #include "../nvgpu-gr.h" #include "nvgpu-gr-ctx.h" +#include "../../fifo/nvgpu-fifo-common.h" + #define DUMMY_SIZE 0xF0U static u64 nvgpu_gmmu_map_locked_stub(struct vm_gk20a *vm, @@ -92,14 +94,24 @@ int test_gr_ctx_error_injection(struct unit_module *m, u64 low_hole = SZ_4K * 16UL; struct nvgpu_channel *channel = (struct nvgpu_channel *) malloc(sizeof(struct nvgpu_channel)); - struct nvgpu_tsg *tsg = (struct nvgpu_tsg *) - malloc(sizeof(struct nvgpu_tsg)); + struct nvgpu_tsg *tsg; u32 i; - if (channel == NULL || tsg == NULL) { + if (channel == NULL) { unit_return_fail(m, "failed to allocate channel/tsg"); } + err = test_fifo_init_support(m, g, NULL); + if (err != 0) { + unit_return_fail(m, "failed to init fifo support\n"); + return err; + } + + tsg = nvgpu_tsg_open(g, 0); + if (!tsg) { + unit_return_fail(m, "failed to allocate tsg"); + } + desc = nvgpu_gr_ctx_desc_alloc(g); if (!desc) { unit_return_fail(m, "failed to allocate memory"); @@ -147,7 +159,7 @@ int test_gr_ctx_error_injection(struct unit_module *m, tsg->gr_ctx = gr_ctx; - mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, vm); + mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, channel); if (mappings == NULL) { unit_return_fail(m, "failed to allocate gr_ctx mappings"); } @@ -179,7 +191,7 @@ int test_gr_ctx_error_injection(struct unit_module *m, /* Inject kmem alloc failures to trigger mapping failures */ for (i = 0; i < 2; i++) { nvgpu_posix_enable_fault_injection(kmem_fi, true, 2 * i); - err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, + err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL, global_desc, mappings, false); if (err == 0) { unit_return_fail(m, "unexpected success"); @@ -188,8 +200,8 @@ int test_gr_ctx_error_injection(struct unit_module *m, } /* global ctx_desc size is not set. 
*/ - err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, global_desc, - mappings, false); + err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL, + global_desc, mappings, false); if (err == 0) { unit_return_fail(m, "unexpected success"); } @@ -211,8 +223,8 @@ int test_gr_ctx_error_injection(struct unit_module *m, /* Fail global ctx buffer mappings */ for (i = 0; i < 4; i++) { nvgpu_posix_enable_fault_injection(kmem_fi, true, 4 + (2 * i)); - err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, global_desc, - mappings, false); + err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL, + global_desc, mappings, false); if (err == 0) { unit_return_fail(m, "unexpected success"); } @@ -221,8 +233,8 @@ int test_gr_ctx_error_injection(struct unit_module *m, /* Successful mapping */ - err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, global_desc, - mappings, false); + err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL, + global_desc, mappings, false); if (err != 0) { unit_return_fail(m, "failed to map global buffers"); } @@ -253,6 +265,12 @@ int test_gr_ctx_error_injection(struct unit_module *m, nvgpu_gr_ctx_desc_free(g, desc); nvgpu_vm_put(g->mm.bar1.vm); + err = test_fifo_remove_support(m, g, NULL); + if (err != 0) { + unit_return_fail(m, "failed to remove fifo support\n"); + return err; + } + return UNIT_SUCCESS; } diff --git a/userspace/units/gr/intr/nvgpu-gr-intr.c b/userspace/units/gr/intr/nvgpu-gr-intr.c index b928cfb51..4a96c2a93 100644 --- a/userspace/units/gr/intr/nvgpu-gr-intr.c +++ b/userspace/units/gr/intr/nvgpu-gr-intr.c @@ -37,7 +37,9 @@ #include #include #include +#include #include +#include #include @@ -264,12 +266,45 @@ static int gr_test_intr_cache_current_ctx(struct gk20a *g, return g->ops.gr.intr.stall_isr(g); } +static u64 nvgpu_gmmu_map_locked_stub(struct vm_gk20a *vm, + u64 vaddr, + struct nvgpu_sgt *sgt, + u64 buffer_offset, + u64 size, + u32 pgsz_idx, + u8 kind_v, + u32 ctag_offset, + u32 flags, + enum gk20a_mem_rw_flag rw_flag, + bool clear_ctags, + bool sparse, + bool priv, + struct vm_gk20a_mapping_batch *batch, + enum nvgpu_aperture aperture) +{ + return 1; +} + +static void nvgpu_gmmu_unmap_locked_stub(struct vm_gk20a *vm, + u64 vaddr, + u64 size, + u32 pgsz_idx, + bool va_allocated, + enum gk20a_mem_rw_flag rw_flag, + bool sparse, + struct vm_gk20a_mapping_batch *batch) +{ + return; +} + static int gr_test_intr_allocate_ch_tsg(struct unit_module *m, struct gk20a *g) { u32 tsgid = getpid(); + struct nvgpu_gr_ctx_mappings *mappings = NULL; struct nvgpu_channel *ch = NULL; struct nvgpu_tsg *tsg = NULL; + struct vm_gk20a *vm = NULL; bool sema_init, notify_init; int err; @@ -295,12 +330,46 @@ static int gr_test_intr_allocate_ch_tsg(struct unit_module *m, goto ch_cleanup; } + /* Setup VM */ + vm = nvgpu_vm_init(g, SZ_4K, SZ_4K << 10, + nvgpu_safe_sub_u64(1ULL << 37, SZ_4K << 10), + (1ULL << 32), 0ULL, + false, false, false, "dummy"); + if (!vm) { + unit_err(m, "failed to allocate VM"); + goto ch_cleanup; + } + + ch->g = g; + ch->vm = vm; + err = nvgpu_tsg_bind_channel(tsg, ch); if (err != 0) { unit_err(m, "failed tsg channel bind\n"); goto ch_cleanup; } + g->ops.mm.gmmu.map = nvgpu_gmmu_map_locked_stub; + g->ops.mm.gmmu.unmap = nvgpu_gmmu_unmap_locked_stub; + + err = nvgpu_tsg_subctx_alloc_gr_subctx(g, ch); + if (err != 0) { + unit_err(m, "failed to alloc gr subctx"); + goto ch_cleanup; + } + + err = nvgpu_tsg_subctx_setup_subctx_header(g, ch); + if (err != 0) { + unit_err(m, "failed to setup subctx header"); + goto 
+        }
+
+        mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, ch);
+        if (mappings == NULL) {
+                unit_err(m, "failed to allocate gr_ctx mappings");
+                goto ch_cleanup;
+        }
+
         err = gr_test_intr_block_ptr_as_current_ctx(m, g, ch, tsg, tsgid);
         if (err != 0) {
                 unit_err(m, "isr failed with block_ptr as current_ctx\n");
diff --git a/userspace/units/gr/obj_ctx/nvgpu-gr-obj-ctx.c b/userspace/units/gr/obj_ctx/nvgpu-gr-obj-ctx.c
index e8a5e5a62..d155b2db6 100644
--- a/userspace/units/gr/obj_ctx/nvgpu-gr-obj-ctx.c
+++ b/userspace/units/gr/obj_ctx/nvgpu-gr-obj-ctx.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -119,7 +120,7 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
         struct nvgpu_gr_global_ctx_buffer_desc *global_desc;
         struct nvgpu_gr_ctx *gr_ctx = NULL;
         struct nvgpu_gr_ctx_mappings *mappings = NULL;
-        struct nvgpu_gr_subctx *subctx = NULL;
+        struct nvgpu_tsg_subctx *subctx = NULL;
         struct nvgpu_mem inst_block;
         struct nvgpu_gr_config *config = nvgpu_gr_get_config_ptr(g);
         struct nvgpu_posix_fault_inj *kmem_fi =
@@ -132,6 +133,8 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
                         struct nvgpu_gr_config *config);
         struct nvgpu_tsg *tsg = (struct nvgpu_tsg *)
                         malloc(sizeof(struct nvgpu_tsg));
+        struct nvgpu_channel *channel = (struct nvgpu_channel *)
+                        malloc(sizeof(struct nvgpu_channel));
 
         /* Inject allocation failures and initialize obj_ctx, should fail */
         nvgpu_posix_enable_fault_injection(kmem_fi, true, 0);
@@ -196,16 +199,31 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
                 unit_return_fail(m, "failed to allocate global buffers");
         }
 
-        subctx = nvgpu_gr_subctx_alloc(g, vm);
-        if (!subctx) {
-                unit_return_fail(m, "failed to allocate subcontext");
+        channel->g = g;
+        channel->vm = vm;
+
+        err = nvgpu_tsg_subctx_bind_channel(tsg, channel);
+        if (err != 0) {
+                unit_return_fail(m, "tsg subctx bind failed");
         }
 
-        mappings = nvgpu_gr_ctx_mappings_create(g, tsg, vm);
-        if (mappings == NULL) {
-                unit_return_fail(m, "failed to allocate gr_ctx mappings");
+        err = nvgpu_tsg_subctx_alloc_gr_subctx(g, channel);
+        if (err != 0) {
+                unit_return_fail(m, "failed to allocate gr_subctx");
         }
 
+        err = nvgpu_tsg_subctx_setup_subctx_header(g, channel);
+        if (err != 0) {
+                unit_return_fail(m, "failed to setup subctx header");
+        }
+
+        mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, channel);
+        if (mappings == NULL) {
+                unit_return_fail(m, "failed to allocate or get mappings");
+        }
+
+        subctx = channel->subctx;
+
         /* Fail gr_ctx allocation */
         nvgpu_posix_enable_fault_injection(kmem_fi, true, 0);
         err = nvgpu_gr_obj_ctx_alloc(g, golden_image, global_desc, desc,
@@ -396,7 +414,7 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
         }
 
         /* Cleanup */
-        nvgpu_gr_subctx_free(g, subctx, vm);
+        nvgpu_tsg_subctx_unbind_channel(tsg, channel);
         nvgpu_gr_ctx_free(g, gr_ctx, global_desc);
         nvgpu_free_gr_ctx_struct(g, gr_ctx);
         nvgpu_gr_ctx_desc_free(g, desc);
diff --git a/userspace/units/gr/setup/nvgpu-gr-setup.c b/userspace/units/gr/setup/nvgpu-gr-setup.c
index cee8cfdde..8e18a0235 100644
--- a/userspace/units/gr/setup/nvgpu-gr-setup.c
+++ b/userspace/units/gr/setup/nvgpu-gr-setup.c
@@ -209,12 +209,6 @@ static int gr_test_setup_allocate_ch_tsg(struct unit_module *m,
                 goto ch_cleanup;
         }
 
-        err = nvgpu_tsg_bind_channel(tsg, ch);
-        if (err != 0) {
-                unit_err(m, "failed tsg channel bind\n");
-                goto ch_cleanup;
-        }
-
         err = gk20a_as_alloc_share(g, 0U,
                                    NVGPU_AS_ALLOC_UNIFIED_VA,
                                    U64(SZ_4K) << U64(10),
@@ -230,6 +224,12 @@ static int gr_test_setup_allocate_ch_tsg(struct unit_module *m,
                 goto tsg_unbind;
         }
 
+        err = nvgpu_tsg_bind_channel(tsg, ch);
+        if (err != 0) {
+                unit_err(m, "failed tsg channel bind\n");
+                goto ch_cleanup;
+        }
+
         gr_setup_ch = ch;
         gr_setup_tsg = tsg;
 
@@ -574,7 +574,7 @@ static int gr_setup_alloc_no_tsg_subcontext(struct unit_module *m, struct gk20a
 
 static void gr_setup_fake_free_obj_ctx(struct unit_module *m, struct gk20a *g)
 {
-        struct nvgpu_gr_subctx *gr_subctx = gr_setup_ch->subctx;
+        struct nvgpu_tsg_subctx *gr_subctx = gr_setup_ch->subctx;
 
         /* pass NULL variable*/
         gr_setup_ch->subctx = NULL;