mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: multiple address spaces support for subcontexts
This patch introduces following relationships among various nvgpu objects to support multiple address spaces with subcontexts. IOCTLs setting the relationships are shown in the braces. nvgpu_tsg 1<---->n nvgpu_tsg_subctx (TSG_BIND_CHANNEL_EX) nvgpu_tsg 1<---->n nvgpu_gr_ctx_mappings (ALLOC_OBJ_CTX) nvgpu_tsg_subctx 1<---->1 nvgpu_gr_subctx (ALLOC_OBJ_CTX) nvgpu_tsg_subctx 1<---->n nvgpu_channel (TSG_BIND_CHANNEL_EX) nvgpu_gr_ctx_mappings 1<---->n nvgpu_gr_subctx (ALLOC_OBJ_CTX) nvgpu_gr_ctx_mappings 1<---->1 vm_gk20a (ALLOC_OBJ_CTX) On unbinding the channel, objects are deleted according to dependencies. Without subcontexts, gr_ctx buffers mappings are maintained in the struct nvgpu_gr_ctx. For subcontexts, they are maintained in the struct nvgpu_gr_subctx. Preemption buffer with index NVGPU_GR_CTX_PREEMPT_CTXSW and PM buffer with index NVGPU_GR_CTX_PM_CTX are to be mapped in all subcontexts when they are programmed from respective ioctls. Global GR context buffers are to be programmed only for VEID0. Based on the channel object class the state is patched in the patch buffer in every ALLOC_OBJ_CTX call unlike setting it for only first channel like before. PM and preemptions buffers programming is protected under TSG ctx_init_lock. tsg->vm is now removed. VM reference for gr_ctx buffers mappings is managed through gr_ctx or gr_subctx mappings object. For vGPU, gr_subctx and mappings objects are created to reference VMs for the gr_ctx lifetime. The functions nvgpu_tsg_subctx_alloc_gr_subctx and nvgpu_tsg_- subctx_setup_subctx_header sets up the subcontext struct header for native driver. The function nvgpu_tsg_subctx_alloc_gr_subctx is called from vgpu to manage the gr ctx mapping references. free_subctx is now done when unbinding channel considering references to the subcontext by other channels. It will unmap the buffers in native driver case. It will just release the VM reference in vgpu case. Note that TEGRA_VGPU_CMD_FREE_CTX_HEADER ioctl is not called by vgpu any longer as it would be taken care by native driver. Bug 3677982 Change-Id: Ia439b251ff452a49f8514498832e24d04db86d2f Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2718760 Reviewed-by: Scott Long <scottl@nvidia.com> Reviewed-by: Ankur Kishore <ankkishore@nvidia.com> GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
9e13b61d4e
commit
f55fd5dc8c
@@ -446,6 +446,12 @@ fifo:
|
||||
include/nvgpu/gops/tsg.h,
|
||||
include/nvgpu/tsg.h ]
|
||||
deps: [ ]
|
||||
tsg_subctx:
|
||||
safe: yes
|
||||
sources: [ common/fifo/tsg_subctx.c,
|
||||
common/fifo/tsg_subctx_priv.h,
|
||||
include/nvgpu/tsg_subctx.h ]
|
||||
deps: [ ]
|
||||
submit:
|
||||
safe: yes
|
||||
sources: [ common/fifo/submit.c,
|
||||
|
||||
@@ -647,6 +647,7 @@ nvgpu-y += \
|
||||
common/fifo/job.o \
|
||||
common/fifo/priv_cmdbuf.o \
|
||||
common/fifo/tsg.o \
|
||||
common/fifo/tsg_subctx.o \
|
||||
common/fifo/runlist.o \
|
||||
common/fifo/engine_status.o \
|
||||
common/fifo/engines.o \
|
||||
|
||||
@@ -155,6 +155,7 @@ srcs += common/device.c \
|
||||
common/fifo/fifo.c \
|
||||
common/fifo/pbdma.c \
|
||||
common/fifo/tsg.c \
|
||||
common/fifo/tsg_subctx.c \
|
||||
common/fifo/runlist.c \
|
||||
common/fifo/engine_status.c \
|
||||
common/fifo/engines.c \
|
||||
|
||||
@@ -983,11 +983,6 @@ static void channel_free(struct nvgpu_channel *ch, bool force)
|
||||
g->ops.gr.fecs_trace.unbind_channel(g, &ch->inst_block);
|
||||
#endif
|
||||
|
||||
if (g->ops.gr.setup.free_subctx != NULL) {
|
||||
g->ops.gr.setup.free_subctx(ch);
|
||||
ch->subctx = NULL;
|
||||
}
|
||||
|
||||
g->ops.gr.intr.flush_channel_tlb(g);
|
||||
|
||||
if (ch->usermode_submit_enabled) {
|
||||
@@ -1803,6 +1798,7 @@ int nvgpu_channel_init_support(struct gk20a *g, u32 chid)
|
||||
nvgpu_mutex_init(&c->dbg_s_lock);
|
||||
#endif
|
||||
nvgpu_init_list_node(&c->ch_entry);
|
||||
nvgpu_init_list_node(&c->subctx_entry);
|
||||
nvgpu_list_add(&c->free_chs, &g->fifo.free_chs);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <nvgpu/channel.h>
|
||||
#include <nvgpu/tsg.h>
|
||||
#include <nvgpu/atomic.h>
|
||||
#include <nvgpu/tsg_subctx.h>
|
||||
#include <nvgpu/rc.h>
|
||||
#include <nvgpu/gk20a.h>
|
||||
#include <nvgpu/error_notifier.h>
|
||||
@@ -142,6 +143,13 @@ int nvgpu_tsg_bind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
|
||||
}
|
||||
|
||||
nvgpu_rwsem_down_write(&tsg->ch_list_lock);
|
||||
err = nvgpu_tsg_subctx_bind_channel(tsg, ch);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "Subcontext %u bind failed", ch->subctx_id);
|
||||
nvgpu_rwsem_up_write(&tsg->ch_list_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
nvgpu_list_add_tail(&ch->ch_entry, &tsg->ch_list);
|
||||
tsg->ch_count = nvgpu_safe_add_u32(tsg->ch_count, 1U);
|
||||
ch->tsgid = tsg->tsgid;
|
||||
@@ -284,8 +292,15 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Remove channel from TSG and re-enable rest of the channels */
|
||||
/**
|
||||
* Remove channel from TSG and re-enable rest of the channels.
|
||||
* Since channel removal can lead to subctx removal and/or
|
||||
* VM mappings removal, acquire ctx_init_lock.
|
||||
*/
|
||||
nvgpu_mutex_acquire(&tsg->ctx_init_lock);
|
||||
|
||||
nvgpu_rwsem_down_write(&tsg->ch_list_lock);
|
||||
nvgpu_tsg_subctx_unbind_channel(tsg, ch);
|
||||
nvgpu_list_del(&ch->ch_entry);
|
||||
tsg->ch_count = nvgpu_safe_sub_u32(tsg->ch_count, 1U);
|
||||
ch->tsgid = NVGPU_INVALID_TSG_ID;
|
||||
@@ -296,6 +311,8 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
|
||||
g->ops.channel.disable(ch);
|
||||
nvgpu_rwsem_up_write(&tsg->ch_list_lock);
|
||||
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
|
||||
/*
|
||||
* Don't re-enable all channels if TSG has timed out already
|
||||
*
|
||||
@@ -396,12 +413,17 @@ fail_common:
|
||||
}
|
||||
#endif
|
||||
|
||||
nvgpu_mutex_acquire(&tsg->ctx_init_lock);
|
||||
|
||||
nvgpu_rwsem_down_write(&tsg->ch_list_lock);
|
||||
nvgpu_tsg_subctx_unbind_channel(tsg, ch);
|
||||
nvgpu_list_del(&ch->ch_entry);
|
||||
ch->tsgid = NVGPU_INVALID_TSG_ID;
|
||||
tsg->ch_count = nvgpu_safe_sub_u32(tsg->ch_count, 1U);
|
||||
nvgpu_rwsem_up_write(&tsg->ch_list_lock);
|
||||
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
|
||||
nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
|
||||
|
||||
return err;
|
||||
@@ -512,6 +534,8 @@ static void nvgpu_tsg_init_support(struct gk20a *g, u32 tsgid)
|
||||
tsg->abortable = true;
|
||||
|
||||
nvgpu_init_list_node(&tsg->ch_list);
|
||||
nvgpu_init_list_node(&tsg->subctx_list);
|
||||
nvgpu_init_list_node(&tsg->gr_ctx_mappings_list);
|
||||
nvgpu_rwsem_init(&tsg->ch_list_lock);
|
||||
nvgpu_mutex_init(&tsg->ctx_init_lock);
|
||||
|
||||
@@ -869,7 +893,6 @@ int nvgpu_tsg_open_common(struct gk20a *g, struct nvgpu_tsg *tsg, pid_t pid)
|
||||
tsg->ch_count = 0U;
|
||||
nvgpu_ref_init(&tsg->refcount);
|
||||
|
||||
tsg->vm = NULL;
|
||||
tsg->interleave_level = NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW;
|
||||
tsg->timeslice_us = g->ops.tsg.default_timeslice_us(g);
|
||||
tsg->runlist = NULL;
|
||||
@@ -963,11 +986,6 @@ void nvgpu_tsg_release_common(struct gk20a *g, struct nvgpu_tsg *tsg)
|
||||
tsg->rl_domain = NULL;
|
||||
}
|
||||
|
||||
if (tsg->vm != NULL) {
|
||||
nvgpu_vm_put(tsg->vm);
|
||||
tsg->vm = NULL;
|
||||
}
|
||||
|
||||
if(tsg->sm_error_states != NULL) {
|
||||
nvgpu_kfree(g, tsg->sm_error_states);
|
||||
tsg->sm_error_states = NULL;
|
||||
|
||||
338
drivers/gpu/nvgpu/common/fifo/tsg_subctx.c
Normal file
338
drivers/gpu/nvgpu/common/fifo/tsg_subctx.c
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <nvgpu/tsg.h>
|
||||
#include <nvgpu/tsg_subctx.h>
|
||||
#include <nvgpu/channel.h>
|
||||
#include <nvgpu/enabled.h>
|
||||
#include <nvgpu/kmem.h>
|
||||
#include <nvgpu/list.h>
|
||||
#include <nvgpu/gk20a.h>
|
||||
#include <nvgpu/log.h>
|
||||
#include <nvgpu/gr/subctx.h>
|
||||
#include <nvgpu/gr/ctx_mappings.h>
|
||||
|
||||
#include "tsg_subctx_priv.h"
|
||||
|
||||
static inline struct nvgpu_tsg_subctx *
|
||||
nvgpu_tsg_subctx_from_tsg_entry(struct nvgpu_list_node *node)
|
||||
{
|
||||
return (struct nvgpu_tsg_subctx *)
|
||||
((uintptr_t)node - offsetof(struct nvgpu_tsg_subctx, tsg_entry));
|
||||
};
|
||||
|
||||
static struct nvgpu_tsg_subctx *nvgpu_tsg_subctx_from_id(struct nvgpu_tsg *tsg,
|
||||
u32 subctx_id)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx = NULL;
|
||||
|
||||
nvgpu_list_for_each_entry(subctx, &tsg->subctx_list,
|
||||
nvgpu_tsg_subctx, tsg_entry) {
|
||||
if (subctx->subctx_id == subctx_id) {
|
||||
return subctx;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg,
|
||||
struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx = NULL;
|
||||
struct gk20a *g = tsg->g;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
subctx = nvgpu_tsg_subctx_from_id(tsg, ch->subctx_id);
|
||||
if (subctx != NULL) {
|
||||
if (subctx->vm != ch->vm) {
|
||||
nvgpu_err(g, "subctx vm mismatch");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
goto add_ch_subctx;
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "Allocating subctx %u", ch->subctx_id);
|
||||
|
||||
subctx = nvgpu_kzalloc(g, sizeof(struct nvgpu_tsg_subctx));
|
||||
if (subctx == NULL) {
|
||||
nvgpu_err(g, "Failed to allocate subctx");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
subctx->subctx_id = ch->subctx_id;
|
||||
subctx->tsg = tsg;
|
||||
subctx->vm = ch->vm;
|
||||
nvgpu_init_list_node(&subctx->ch_list);
|
||||
nvgpu_init_list_node(&subctx->tsg_entry);
|
||||
|
||||
nvgpu_list_add_tail(&subctx->tsg_entry, &tsg->subctx_list);
|
||||
|
||||
add_ch_subctx:
|
||||
ch->subctx = subctx;
|
||||
nvgpu_list_add_tail(&ch->subctx_entry, &subctx->ch_list);
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg,
|
||||
struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx;
|
||||
struct gk20a *g = tsg->g;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
return;
|
||||
}
|
||||
|
||||
subctx = ch->subctx;
|
||||
nvgpu_assert(subctx != NULL);
|
||||
|
||||
nvgpu_list_del(&ch->subctx_entry);
|
||||
|
||||
if (nvgpu_list_empty(&subctx->ch_list)) {
|
||||
if (g->ops.gr.setup.free_subctx != NULL) {
|
||||
g->ops.gr.setup.free_subctx(ch);
|
||||
subctx->gr_subctx = NULL;
|
||||
}
|
||||
|
||||
nvgpu_list_del(&subctx->tsg_entry);
|
||||
nvgpu_kfree(tsg->g, subctx);
|
||||
}
|
||||
|
||||
ch->subctx = NULL;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
|
||||
int nvgpu_tsg_subctx_alloc_gr_subctx(struct gk20a *g, struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
subctx = ch->subctx;
|
||||
if (subctx == NULL) {
|
||||
nvgpu_err(g, "channel not bound to TSG subctx");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (subctx->gr_subctx == NULL) {
|
||||
subctx->gr_subctx = nvgpu_gr_subctx_alloc(g);
|
||||
if (subctx->gr_subctx == NULL) {
|
||||
nvgpu_err(g, "gr_subctx alloc failed");
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nvgpu_tsg_subctx_setup_subctx_header(struct gk20a *g,
|
||||
struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx;
|
||||
int err;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
subctx = ch->subctx;
|
||||
if ((subctx == NULL) || (subctx->gr_subctx == NULL)) {
|
||||
nvgpu_err(g, "channel not bound to TSG/GR subctx");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = nvgpu_gr_subctx_setup_header(g, subctx->gr_subctx, subctx->vm);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "gr_subctx header setup failed %d", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct nvgpu_gr_subctx *nvgpu_tsg_subctx_get_gr_subctx(
|
||||
struct nvgpu_tsg_subctx *subctx)
|
||||
{
|
||||
return subctx->gr_subctx;
|
||||
}
|
||||
|
||||
u32 nvgpu_tsg_subctx_get_id(struct nvgpu_tsg_subctx *subctx)
|
||||
{
|
||||
return subctx->subctx_id;
|
||||
}
|
||||
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_alloc_or_get_mappings(
|
||||
struct gk20a *g,
|
||||
struct nvgpu_tsg *tsg,
|
||||
struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
struct nvgpu_gr_subctx *gr_subctx = NULL;
|
||||
struct vm_gk20a *vm = ch->vm;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
nvgpu_assert(ch->subctx != NULL);
|
||||
nvgpu_assert(ch->subctx->vm == vm);
|
||||
|
||||
mappings = nvgpu_gr_ctx_mappings_get_subctx_mappings(g, tsg, vm);
|
||||
if (mappings != NULL) {
|
||||
goto add_gr_subctx;
|
||||
}
|
||||
|
||||
mappings = nvgpu_gr_ctx_mappings_create_subctx_mappings(g, tsg, vm);
|
||||
if (mappings == NULL) {
|
||||
nvgpu_err(g, "failed to allocate gr_ctx mappings");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
add_gr_subctx:
|
||||
gr_subctx = nvgpu_tsg_subctx_get_gr_subctx(ch->subctx);
|
||||
nvgpu_assert(gr_subctx != NULL);
|
||||
|
||||
nvgpu_gr_ctx_mappings_add_gr_subctx(mappings, gr_subctx);
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
return mappings;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
static struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_get_veid0_mappings(
|
||||
struct gk20a *g,
|
||||
struct nvgpu_tsg *tsg)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
struct nvgpu_tsg_subctx *subctx = NULL;
|
||||
|
||||
subctx = nvgpu_tsg_subctx_from_id(tsg, CHANNEL_INFO_VEID0);
|
||||
if (subctx == NULL) {
|
||||
nvgpu_log(g, gpu_dbg_gr, "VEID0 subctx not available");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
mappings = nvgpu_gr_subctx_get_mappings(subctx->gr_subctx);
|
||||
if (mappings == NULL) {
|
||||
nvgpu_log(g, gpu_dbg_gr, "VEID0 mappings not available");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return mappings;
|
||||
}
|
||||
|
||||
void nvgpu_tsg_subctxs_set_preemption_buffer_va(
|
||||
struct nvgpu_tsg_subctx *tsg_subctx)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *veid0_mappings;
|
||||
struct nvgpu_tsg_subctx *subctx = NULL;
|
||||
struct nvgpu_tsg *tsg = tsg_subctx->tsg;
|
||||
struct gk20a *g = tsg->g;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
nvgpu_rwsem_down_read(&tsg->ch_list_lock);
|
||||
|
||||
veid0_mappings = nvgpu_tsg_subctx_get_veid0_mappings(g, tsg);
|
||||
if (veid0_mappings == NULL) {
|
||||
nvgpu_rwsem_up_read(&tsg->ch_list_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
nvgpu_list_for_each_entry(subctx, &tsg->subctx_list,
|
||||
nvgpu_tsg_subctx, tsg_entry) {
|
||||
if (subctx->gr_subctx != NULL) {
|
||||
nvgpu_gr_subctx_set_preemption_buffer_va(g,
|
||||
subctx->gr_subctx, veid0_mappings);
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_rwsem_up_read(&tsg->ch_list_lock);
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
|
||||
void nvgpu_tsg_subctxs_clear_preemption_buffer_va(
|
||||
struct nvgpu_tsg_subctx *tsg_subctx)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx = NULL;
|
||||
struct nvgpu_tsg *tsg = tsg_subctx->tsg;
|
||||
struct gk20a *g = tsg->g;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
nvgpu_list_for_each_entry(subctx, &tsg->subctx_list,
|
||||
nvgpu_tsg_subctx, tsg_entry) {
|
||||
if (subctx->gr_subctx != NULL) {
|
||||
nvgpu_gr_subctx_clear_preemption_buffer_va(g,
|
||||
subctx->gr_subctx);
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
#endif /* CONFIG_NVGPU_GFXP */
|
||||
|
||||
#ifdef CONFIG_NVGPU_DEBUGGER
|
||||
void nvgpu_tsg_subctxs_set_pm_buffer_va(struct nvgpu_tsg *tsg,
|
||||
bool set_pm_ctx_gpu_va)
|
||||
{
|
||||
struct nvgpu_tsg_subctx *subctx = NULL;
|
||||
struct gk20a *g = tsg->g;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
nvgpu_rwsem_down_read(&tsg->ch_list_lock);
|
||||
nvgpu_list_for_each_entry(subctx, &tsg->subctx_list,
|
||||
nvgpu_tsg_subctx, tsg_entry) {
|
||||
if (subctx->gr_subctx != NULL) {
|
||||
nvgpu_gr_subctx_set_hwpm_ptr(g, subctx->gr_subctx,
|
||||
set_pm_ctx_gpu_va);
|
||||
}
|
||||
}
|
||||
nvgpu_rwsem_up_read(&tsg->ch_list_lock);
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
#endif /* CONFIG_NVGPU_DEBUGGER */
|
||||
60
drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h
Normal file
60
drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef NVGPU_COMMON_FIFO_TSG_SUBCTX_PRIV_H
|
||||
#define NVGPU_COMMON_FIFO_TSG_SUBCTX_PRIV_H
|
||||
|
||||
#include <nvgpu/types.h>
|
||||
#include <nvgpu/list.h>
|
||||
|
||||
struct nvgpu_tsg;
|
||||
struct vm_gk20a;
|
||||
struct nvgpu_gr_subctx;
|
||||
|
||||
struct nvgpu_tsg_subctx {
|
||||
|
||||
/** Subcontext Id (aka. veid). */
|
||||
u32 subctx_id;
|
||||
|
||||
/** TSG to which this subcontext belongs. */
|
||||
struct nvgpu_tsg *tsg;
|
||||
|
||||
/** Subcontext's address space. */
|
||||
struct vm_gk20a *vm;
|
||||
|
||||
/** Subcontext's GR ctx header and GR ctx buffers mappings. */
|
||||
struct nvgpu_gr_subctx *gr_subctx;
|
||||
|
||||
/**
|
||||
* Subcontext's entry in TSG's (#nvgpu_tsg) subcontexts list
|
||||
* #subctx_list.
|
||||
*/
|
||||
struct nvgpu_list_node tsg_entry;
|
||||
|
||||
/**
|
||||
* List of channels (#nvgpu_channel) bound to this TSG subcontext.
|
||||
* Accessed by holding #ch_list_lock from TSG.
|
||||
*/
|
||||
struct nvgpu_list_node ch_list;
|
||||
};
|
||||
|
||||
#endif /* NVGPU_COMMON_FIFO_TSG_SUBCTX_PRIV_H */
|
||||
@@ -30,6 +30,7 @@
|
||||
#include <nvgpu/gmmu.h>
|
||||
#include <nvgpu/dma.h>
|
||||
#include <nvgpu/string.h>
|
||||
#include <nvgpu/tsg_subctx.h>
|
||||
|
||||
#include <nvgpu/power_features/pg.h>
|
||||
#include "common/gr/ctx_priv.h"
|
||||
@@ -116,9 +117,7 @@ int nvgpu_gr_ctx_alloc_ctx_buffers(struct gk20a *g,
|
||||
}
|
||||
#endif
|
||||
|
||||
if (desc->size[i] != 0U) {
|
||||
nvgpu_assert(!nvgpu_mem_is_valid(&ctx->mem[i]));
|
||||
|
||||
if (desc->size[i] != 0U && !nvgpu_mem_is_valid(&ctx->mem[i])) {
|
||||
err = nvgpu_dma_alloc_sys(g, desc->size[i],
|
||||
&ctx->mem[i]);
|
||||
if (err != 0) {
|
||||
@@ -126,10 +125,14 @@ int nvgpu_gr_ctx_alloc_ctx_buffers(struct gk20a *g,
|
||||
nvgpu_gr_ctx_free_ctx_buffers(g, ctx);
|
||||
return err;
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "ctx buffer %u allocated", i);
|
||||
}
|
||||
}
|
||||
|
||||
if (!nvgpu_gr_ctx_get_ctx_initialized(ctx)) {
|
||||
ctx->ctx_id_valid = false;
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
@@ -206,6 +209,8 @@ int nvgpu_gr_ctx_alloc_ctx_preemption_buffers(struct gk20a *g,
|
||||
nvgpu_gr_ctx_free_ctx_preemption_buffers(g, ctx);
|
||||
return err;
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "ctx preemption buffer %u allocated", i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -219,32 +224,54 @@ void nvgpu_gr_ctx_free(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *gr_ctx,
|
||||
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer)
|
||||
{
|
||||
struct nvgpu_tsg *tsg;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if ((gr_ctx != NULL) && (gr_ctx->mappings != NULL)) {
|
||||
if (gr_ctx != NULL) {
|
||||
tsg = nvgpu_tsg_get_from_id(g, gr_ctx->tsgid);
|
||||
|
||||
nvgpu_mutex_acquire(&tsg->ctx_init_lock);
|
||||
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
nvgpu_assert(nvgpu_list_empty(&tsg->ch_list));
|
||||
nvgpu_assert(nvgpu_list_empty(&tsg->subctx_list));
|
||||
nvgpu_assert(nvgpu_list_empty(&tsg->gr_ctx_mappings_list));
|
||||
} else {
|
||||
if (gr_ctx->mappings != NULL) {
|
||||
nvgpu_gr_ctx_unmap_buffers(g,
|
||||
gr_ctx, global_ctx_buffer, gr_ctx->mappings);
|
||||
gr_ctx, NULL, global_ctx_buffer,
|
||||
gr_ctx->mappings);
|
||||
|
||||
nvgpu_gr_ctx_free_mappings(g, gr_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_gr_ctx_set_patch_ctx_data_count(gr_ctx, 0);
|
||||
|
||||
nvgpu_gr_ctx_free_ctx_buffers(g, gr_ctx);
|
||||
|
||||
(void) memset(gr_ctx, 0, sizeof(*gr_ctx));
|
||||
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_alloc_or_get_mappings(struct gk20a *g,
|
||||
struct nvgpu_tsg *tsg, struct vm_gk20a *vm)
|
||||
struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx;
|
||||
struct vm_gk20a *vm = ch->vm;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
return nvgpu_tsg_subctx_alloc_or_get_mappings(g, tsg, ch);
|
||||
}
|
||||
|
||||
mappings = gr_ctx->mappings;
|
||||
if (mappings != NULL) {
|
||||
return mappings;
|
||||
@@ -278,11 +305,16 @@ void nvgpu_gr_ctx_free_mappings(struct gk20a *g,
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg)
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg,
|
||||
struct nvgpu_channel *ch)
|
||||
{
|
||||
struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx;
|
||||
struct gk20a *g = tsg->g;
|
||||
|
||||
return gr_ctx->mappings;
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
return nvgpu_gr_ctx_mappings_get_subctx_mappings(g, tsg, ch->vm);
|
||||
}
|
||||
|
||||
return tsg->gr_ctx->mappings;
|
||||
}
|
||||
|
||||
void nvgpu_gr_ctx_set_patch_ctx_data_count(struct nvgpu_gr_ctx *gr_ctx,
|
||||
@@ -639,9 +671,9 @@ int nvgpu_gr_ctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx,
|
||||
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
void nvgpu_gr_ctx_set_preemption_buffer_va(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *gr_ctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings)
|
||||
struct nvgpu_gr_ctx *gr_ctx)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = gr_ctx->mappings;
|
||||
struct nvgpu_mem *mem = &gr_ctx->mem[NVGPU_GR_CTX_CTX];
|
||||
u64 preempt_ctxsw_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings,
|
||||
NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
@@ -744,12 +776,6 @@ int nvgpu_gr_ctx_alloc_map_pm_ctx(struct gk20a *g,
|
||||
return 0;
|
||||
}
|
||||
|
||||
mappings = nvgpu_gr_ctx_get_mappings(tsg);
|
||||
if (mappings == NULL) {
|
||||
nvgpu_err(g, "gr_ctx mappings struct not allocated");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
nvgpu_gr_ctx_set_size(gr_ctx_desc,
|
||||
NVGPU_GR_CTX_PM_CTX,
|
||||
nvgpu_gr_hwpm_map_get_size(hwpm_map));
|
||||
@@ -761,8 +787,25 @@ int nvgpu_gr_ctx_alloc_map_pm_ctx(struct gk20a *g,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Commit NVGPU_GR_CTX_PM_CTX gpu va for all subcontexts
|
||||
* when subcontexts are enabled.
|
||||
*/
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
ret = nvgpu_gr_ctx_mappings_map_buffer_all_subctx(tsg,
|
||||
NVGPU_GR_CTX_PM_CTX);
|
||||
} else {
|
||||
mappings = nvgpu_gr_ctx_get_mappings(tsg, NULL);
|
||||
if (mappings == NULL) {
|
||||
nvgpu_err(g, "gr_ctx mappings struct not allocated");
|
||||
nvgpu_gr_ctx_free_pm_ctx(g, gr_ctx);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
ret = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, gr_ctx,
|
||||
NVGPU_GR_CTX_PM_CTX, mappings);
|
||||
}
|
||||
|
||||
if (ret != 0) {
|
||||
nvgpu_err(g, "gr_ctx pm_ctx buffer map failed %d", ret);
|
||||
nvgpu_gr_ctx_free_pm_ctx(g, gr_ctx);
|
||||
@@ -839,13 +882,13 @@ int nvgpu_gr_ctx_set_smpc_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx,
|
||||
|
||||
int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *gr_ctx,
|
||||
u32 mode, u64 *pm_ctx_gpu_va, bool *skip_update)
|
||||
u32 mode, bool *set_pm_ctx_gpu_va, bool *skip_update)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = gr_ctx->mappings;
|
||||
struct nvgpu_mem *mem = &gr_ctx->mem[NVGPU_GR_CTX_CTX];
|
||||
struct pm_ctx_desc *pm_ctx = &gr_ctx->pm_ctx;
|
||||
int ret = 0;
|
||||
|
||||
*set_pm_ctx_gpu_va = false;
|
||||
*skip_update = false;
|
||||
|
||||
if (!nvgpu_mem_is_valid(mem)) {
|
||||
@@ -868,8 +911,7 @@ int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g,
|
||||
return 0;
|
||||
}
|
||||
pm_ctx->pm_mode = g->ops.gr.ctxsw_prog.hw_get_pm_mode_ctxsw();
|
||||
*pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings,
|
||||
NVGPU_GR_CTX_PM_CTX);
|
||||
*set_pm_ctx_gpu_va = true;
|
||||
break;
|
||||
case NVGPU_GR_CTX_HWPM_CTXSW_MODE_NO_CTXSW:
|
||||
if (pm_ctx->pm_mode ==
|
||||
@@ -879,7 +921,7 @@ int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g,
|
||||
}
|
||||
pm_ctx->pm_mode =
|
||||
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw();
|
||||
*pm_ctx_gpu_va = 0;
|
||||
*set_pm_ctx_gpu_va = false;
|
||||
break;
|
||||
case NVGPU_GR_CTX_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
|
||||
if (pm_ctx->pm_mode ==
|
||||
@@ -889,8 +931,7 @@ int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g,
|
||||
}
|
||||
pm_ctx->pm_mode =
|
||||
g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw();
|
||||
*pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings,
|
||||
NVGPU_GR_CTX_PM_CTX);
|
||||
*set_pm_ctx_gpu_va = true;
|
||||
break;
|
||||
default:
|
||||
nvgpu_err(g, "invalid hwpm context switch mode");
|
||||
@@ -909,9 +950,16 @@ void nvgpu_gr_ctx_set_hwpm_pm_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx)
|
||||
}
|
||||
|
||||
void nvgpu_gr_ctx_set_hwpm_ptr(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx,
|
||||
u64 pm_ctx_gpu_va)
|
||||
bool set_pm_ctx_gpu_va)
|
||||
{
|
||||
struct nvgpu_mem *mem = &gr_ctx->mem[NVGPU_GR_CTX_CTX];
|
||||
u64 pm_ctx_gpu_va = 0ULL;
|
||||
|
||||
if (set_pm_ctx_gpu_va) {
|
||||
pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(
|
||||
gr_ctx->mappings,
|
||||
NVGPU_GR_CTX_PM_CTX);
|
||||
}
|
||||
|
||||
g->ops.gr.ctxsw_prog.set_pm_ptr(g, mem, pm_ctx_gpu_va);
|
||||
}
|
||||
@@ -921,3 +969,47 @@ void nvgpu_gr_ctx_set_pm_ctx_mapped(struct nvgpu_gr_ctx *ctx, bool mapped)
|
||||
ctx->pm_ctx.mapped = mapped;
|
||||
}
|
||||
#endif /* CONFIG_NVGPU_DEBUGGER */
|
||||
|
||||
bool nvgpu_gr_obj_ctx_global_ctx_buffers_patched(struct nvgpu_gr_ctx *gr_ctx)
|
||||
{
|
||||
return gr_ctx->global_ctx_buffers_patched;
|
||||
}
|
||||
|
||||
void nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched(
|
||||
struct nvgpu_gr_ctx *gr_ctx, bool patched)
|
||||
{
|
||||
gr_ctx->global_ctx_buffers_patched = patched;
|
||||
}
|
||||
|
||||
bool nvgpu_gr_obj_ctx_preempt_buffers_patched(struct nvgpu_gr_ctx *gr_ctx)
|
||||
{
|
||||
return gr_ctx->preempt_buffers_patched;
|
||||
}
|
||||
|
||||
void nvgpu_gr_obj_ctx_set_preempt_buffers_patched(
|
||||
struct nvgpu_gr_ctx *gr_ctx, bool patched)
|
||||
{
|
||||
gr_ctx->preempt_buffers_patched = patched;
|
||||
}
|
||||
|
||||
bool nvgpu_gr_obj_ctx_default_compute_regs_patched(struct nvgpu_gr_ctx *gr_ctx)
|
||||
{
|
||||
return gr_ctx->default_compute_regs_patched;
|
||||
}
|
||||
|
||||
void nvgpu_gr_obj_ctx_set_default_compute_regs_patched(
|
||||
struct nvgpu_gr_ctx *gr_ctx, bool patched)
|
||||
{
|
||||
gr_ctx->default_compute_regs_patched = patched;
|
||||
}
|
||||
|
||||
bool nvgpu_gr_obj_ctx_default_gfx_regs_patched(struct nvgpu_gr_ctx *gr_ctx)
|
||||
{
|
||||
return gr_ctx->default_gfx_regs_patched;
|
||||
}
|
||||
|
||||
void nvgpu_gr_obj_ctx_set_default_gfx_regs_patched(
|
||||
struct nvgpu_gr_ctx *gr_ctx, bool patched)
|
||||
{
|
||||
gr_ctx->default_gfx_regs_patched = patched;
|
||||
}
|
||||
|
||||
@@ -22,17 +22,38 @@
|
||||
|
||||
#include <nvgpu/gk20a.h>
|
||||
#include <nvgpu/static_analysis.h>
|
||||
#include <nvgpu/tsg_subctx.h>
|
||||
#include <nvgpu/gr/subctx.h>
|
||||
#include <nvgpu/gr/global_ctx.h>
|
||||
#include <nvgpu/gr/ctx.h>
|
||||
#include <nvgpu/gr/obj_ctx.h>
|
||||
#include <nvgpu/gr/ctx_mappings.h>
|
||||
#include <nvgpu/vm.h>
|
||||
#include <nvgpu/io.h>
|
||||
#include <nvgpu/gmmu.h>
|
||||
#include <nvgpu/dma.h>
|
||||
#include <nvgpu/string.h>
|
||||
#include <nvgpu/list.h>
|
||||
#include <nvgpu/gr/gr_utils.h>
|
||||
#include <nvgpu/fifo.h>
|
||||
|
||||
#include <nvgpu/power_features/pg.h>
|
||||
#include "common/gr/ctx_mappings_priv.h"
|
||||
#include "common/gr/subctx_priv.h"
|
||||
|
||||
static inline struct nvgpu_gr_ctx_mappings *
|
||||
nvgpu_gr_ctx_mappings_from_tsg_entry(struct nvgpu_list_node *node)
|
||||
{
|
||||
return (struct nvgpu_gr_ctx_mappings *)
|
||||
((uintptr_t)node - offsetof(struct nvgpu_gr_ctx_mappings, tsg_entry));
|
||||
};
|
||||
|
||||
static inline struct nvgpu_gr_subctx *
|
||||
nvgpu_gr_subctx_from_gr_ctx_mappings_entry(struct nvgpu_list_node *node)
|
||||
{
|
||||
return (struct nvgpu_gr_subctx *)
|
||||
((uintptr_t)node - offsetof(struct nvgpu_gr_subctx, gr_ctx_mappings_entry));
|
||||
};
|
||||
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_create(struct gk20a *g,
|
||||
struct nvgpu_tsg *tsg, struct vm_gk20a *vm)
|
||||
@@ -83,11 +104,14 @@ int nvgpu_gr_ctx_mappings_map_ctx_buffer(struct gk20a *g,
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (mappings->ctx_buffer_va[index] != 0ULL) {
|
||||
nvgpu_log_info(g, "buffer %u already mapped", index);
|
||||
return 0;
|
||||
}
|
||||
|
||||
mem = nvgpu_gr_ctx_get_ctx_mem(ctx, index);
|
||||
mapping_flags = nvgpu_gr_ctx_get_ctx_mapping_flags(ctx, index);
|
||||
|
||||
nvgpu_assert(mappings->ctx_buffer_va[index] == 0ULL);
|
||||
|
||||
if (nvgpu_mem_is_valid(mem)) {
|
||||
gpu_va = nvgpu_gmmu_map(vm,
|
||||
mem,
|
||||
@@ -138,28 +162,149 @@ static void nvgpu_gr_ctx_mappings_unmap_ctx_buffer(struct nvgpu_gr_ctx *ctx,
|
||||
}
|
||||
}
|
||||
|
||||
static void nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(
|
||||
struct nvgpu_tsg *tsg, u32 index)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx;
|
||||
|
||||
nvgpu_assert(index < NVGPU_GR_CTX_COUNT);
|
||||
|
||||
nvgpu_list_for_each_entry(mappings, &tsg->gr_ctx_mappings_list,
|
||||
nvgpu_gr_ctx_mappings, tsg_entry) {
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffer(gr_ctx,
|
||||
index, mappings);
|
||||
}
|
||||
}
|
||||
|
||||
int nvgpu_gr_ctx_mappings_map_buffer_all_subctx(
|
||||
struct nvgpu_tsg *tsg, u32 index)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx;
|
||||
struct gk20a *g = tsg->g;
|
||||
int err;
|
||||
|
||||
nvgpu_assert(index < NVGPU_GR_CTX_COUNT);
|
||||
|
||||
nvgpu_list_for_each_entry(mappings, &tsg->gr_ctx_mappings_list,
|
||||
nvgpu_gr_ctx_mappings, tsg_entry) {
|
||||
err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, gr_ctx,
|
||||
index, mappings);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "gr_ctx buffer %u map failed %d", index, err);
|
||||
nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg, index);
|
||||
return err;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvgpu_gr_ctx_mappings_unmap_ctx_buffers(struct nvgpu_gr_ctx *ctx,
|
||||
struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings)
|
||||
{
|
||||
u32 buffers_count = NVGPU_GR_CTX_COUNT;
|
||||
u32 i;
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
struct nvgpu_tsg *tsg = mappings->tsg;
|
||||
struct gk20a *g = tsg->g;
|
||||
bool is_sync_veid;
|
||||
bool gfxp_active;
|
||||
#endif
|
||||
|
||||
for (i = 0; i < NVGPU_GR_CTX_COUNT; i++) {
|
||||
(void) subctx;
|
||||
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) ==
|
||||
CHANNEL_INFO_VEID0;
|
||||
gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) ==
|
||||
NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP);
|
||||
|
||||
if (is_sync_veid && gfxp_active) {
|
||||
nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg,
|
||||
NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
nvgpu_tsg_subctxs_clear_preemption_buffer_va(subctx);
|
||||
nvgpu_gr_ctx_init_graphics_preemption_mode(ctx,
|
||||
NVGPU_PREEMPTION_MODE_GRAPHICS_WFI);
|
||||
}
|
||||
|
||||
if (!is_sync_veid) {
|
||||
if (gfxp_active) {
|
||||
nvgpu_gr_subctx_clear_preemption_buffer_va(g,
|
||||
nvgpu_tsg_subctx_get_gr_subctx(subctx));
|
||||
buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW + 1U;
|
||||
} else {
|
||||
buffers_count = NVGPU_GR_CTX_PATCH_CTX + 1U;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < buffers_count; i++) {
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffer(ctx, i, mappings);
|
||||
}
|
||||
}
|
||||
|
||||
static int nvgpu_gr_ctx_mappings_map_ctx_buffers(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *ctx,
|
||||
struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings)
|
||||
{
|
||||
u32 buffers_count = NVGPU_GR_CTX_COUNT;
|
||||
int err = 0;
|
||||
u32 i;
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
struct nvgpu_tsg *tsg = mappings->tsg;
|
||||
bool is_sync_veid;
|
||||
bool gfxp_active;
|
||||
#endif
|
||||
|
||||
for (i = 0; i < NVGPU_GR_CTX_COUNT; i++) {
|
||||
(void) subctx;
|
||||
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) ==
|
||||
CHANNEL_INFO_VEID0;
|
||||
gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) ==
|
||||
NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP);
|
||||
|
||||
if (is_sync_veid && gfxp_active) {
|
||||
err = nvgpu_gr_ctx_mappings_map_buffer_all_subctx(tsg,
|
||||
NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "preempt buffer mapping failed %d",
|
||||
err);
|
||||
nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(
|
||||
tsg, NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Only NVGPU_GR_CTX_PREEMPT_CTXSW is to be mapped for
|
||||
* all VEIDs.
|
||||
* Don't map other preemption buffers for ASYNC VEIDs.
|
||||
*/
|
||||
if (!is_sync_veid) {
|
||||
if (gfxp_active) {
|
||||
buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW + 1U;
|
||||
} else {
|
||||
buffers_count = NVGPU_GR_CTX_PATCH_CTX + 1U;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < buffers_count; i++) {
|
||||
err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, ctx, i, mappings);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "gr_ctx buffer %u map failed %d", i, err);
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffers(ctx, mappings);
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffers(ctx,
|
||||
subctx, mappings);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@@ -170,36 +315,97 @@ static int nvgpu_gr_ctx_mappings_map_ctx_buffers(struct gk20a *g,
|
||||
#ifdef CONFIG_NVGPU_GFXP
|
||||
static void nvgpu_gr_ctx_mappings_unmap_ctx_preemption_buffers(
|
||||
struct nvgpu_gr_ctx *ctx,
|
||||
struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings)
|
||||
{
|
||||
u32 buffers_count = NVGPU_GR_CTX_GFXP_RTVCB_CTXSW;
|
||||
struct nvgpu_tsg *tsg = mappings->tsg;
|
||||
struct gk20a *g = tsg->g;
|
||||
bool is_sync_veid;
|
||||
bool gfxp_active;
|
||||
u32 i;
|
||||
|
||||
for (i = NVGPU_GR_CTX_PREEMPT_CTXSW;
|
||||
i <= NVGPU_GR_CTX_GFXP_RTVCB_CTXSW; i++) {
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) ==
|
||||
CHANNEL_INFO_VEID0;
|
||||
gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) ==
|
||||
NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP);
|
||||
|
||||
if (is_sync_veid && gfxp_active) {
|
||||
nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg,
|
||||
NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
|
||||
nvgpu_rwsem_down_read(&tsg->ch_list_lock);
|
||||
nvgpu_tsg_subctxs_clear_preemption_buffer_va(subctx);
|
||||
nvgpu_rwsem_up_read(&tsg->ch_list_lock);
|
||||
|
||||
nvgpu_gr_ctx_init_graphics_preemption_mode(ctx,
|
||||
NVGPU_PREEMPTION_MODE_GRAPHICS_WFI);
|
||||
}
|
||||
|
||||
if (!is_sync_veid) {
|
||||
if (gfxp_active) {
|
||||
nvgpu_gr_subctx_clear_preemption_buffer_va(g,
|
||||
nvgpu_tsg_subctx_get_gr_subctx(subctx));
|
||||
buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = NVGPU_GR_CTX_PREEMPT_CTXSW; i <= buffers_count; i++) {
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffer(ctx, i, mappings);
|
||||
}
|
||||
}
|
||||
|
||||
int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *ctx,
|
||||
struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings)
|
||||
{
|
||||
u32 buffers_count = NVGPU_GR_CTX_GFXP_RTVCB_CTXSW;
|
||||
struct nvgpu_tsg *tsg = mappings->tsg;
|
||||
bool is_sync_veid;
|
||||
bool gfxp_active;
|
||||
int err = 0;
|
||||
u32 i;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) ==
|
||||
CHANNEL_INFO_VEID0;
|
||||
gfxp_active = (nvgpu_gr_ctx_get_graphics_preemption_mode(ctx) ==
|
||||
NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP);
|
||||
|
||||
for (i = NVGPU_GR_CTX_PREEMPT_CTXSW;
|
||||
i <= NVGPU_GR_CTX_GFXP_RTVCB_CTXSW; i++) {
|
||||
if (mappings->ctx_buffer_va[i] == 0ULL) {
|
||||
if (is_sync_veid && gfxp_active) {
|
||||
err = nvgpu_gr_ctx_mappings_map_buffer_all_subctx(tsg,
|
||||
NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "preempt buffer mapping failed %d", err);
|
||||
nvgpu_gr_ctx_mappings_unmap_buffer_all_subctx(tsg,
|
||||
NVGPU_GR_CTX_PREEMPT_CTXSW);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_sync_veid) {
|
||||
if (gfxp_active) {
|
||||
buffers_count = NVGPU_GR_CTX_PREEMPT_CTXSW;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = NVGPU_GR_CTX_PREEMPT_CTXSW; i <= buffers_count; i++) {
|
||||
err = nvgpu_gr_ctx_mappings_map_ctx_buffer(g, ctx, i, mappings);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "gr_ctx buffer %u map failed %d", i, err);
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_preemption_buffers(ctx, mappings);
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_preemption_buffers(ctx,
|
||||
subctx, mappings);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
@@ -273,7 +479,9 @@ static void nvgpu_gr_ctx_mappings_unmap_global_ctx_buffers(
|
||||
|
||||
static int nvgpu_gr_ctx_mappings_map_global_ctx_buffers(struct gk20a *g,
|
||||
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
|
||||
struct nvgpu_gr_ctx_mappings *mappings, bool vpr)
|
||||
struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings,
|
||||
bool vpr)
|
||||
{
|
||||
int err;
|
||||
|
||||
@@ -282,7 +490,7 @@ static int nvgpu_gr_ctx_mappings_map_global_ctx_buffers(struct gk20a *g,
|
||||
* Allocate BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB
|
||||
* if 2D/3D/I2M classes(graphics) are supported.
|
||||
*/
|
||||
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
|
||||
if (nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) {
|
||||
/* Circular Buffer */
|
||||
err = nvgpu_gr_ctx_mappings_map_global_ctx_buffer(
|
||||
global_ctx_buffer,
|
||||
@@ -388,7 +596,7 @@ fail:
|
||||
}
|
||||
|
||||
int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *gr_ctx,
|
||||
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
|
||||
struct nvgpu_gr_ctx_mappings *mappings,
|
||||
bool vpr)
|
||||
@@ -403,17 +611,17 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = nvgpu_gr_ctx_mappings_map_ctx_buffers(g, gr_ctx, mappings);
|
||||
err = nvgpu_gr_ctx_mappings_map_ctx_buffers(g, gr_ctx, subctx, mappings);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "fail to map ctx buffers");
|
||||
return err;
|
||||
}
|
||||
|
||||
err = nvgpu_gr_ctx_mappings_map_global_ctx_buffers(g,
|
||||
global_ctx_buffer, mappings, vpr);
|
||||
global_ctx_buffer, subctx, mappings, vpr);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "fail to map global ctx buffer");
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, mappings);
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, subctx, mappings);
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -424,6 +632,7 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g,
|
||||
|
||||
void nvgpu_gr_ctx_unmap_buffers(struct gk20a *g,
|
||||
struct nvgpu_gr_ctx *gr_ctx,
|
||||
struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
|
||||
struct nvgpu_gr_ctx_mappings *mappings)
|
||||
{
|
||||
@@ -432,7 +641,7 @@ void nvgpu_gr_ctx_unmap_buffers(struct gk20a *g,
|
||||
nvgpu_gr_ctx_mappings_unmap_global_ctx_buffers(global_ctx_buffer,
|
||||
mappings);
|
||||
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, mappings);
|
||||
nvgpu_gr_ctx_mappings_unmap_ctx_buffers(gr_ctx, subctx, mappings);
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
@@ -450,3 +659,118 @@ u64 nvgpu_gr_ctx_mappings_get_ctx_va(struct nvgpu_gr_ctx_mappings *mappings,
|
||||
nvgpu_assert(index < NVGPU_GR_CTX_COUNT);
|
||||
return mappings->ctx_buffer_va[index];
|
||||
}
|
||||
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_get_subctx_mappings(
|
||||
struct gk20a *g,
|
||||
struct nvgpu_tsg *tsg,
|
||||
struct vm_gk20a *vm)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
nvgpu_list_for_each_entry(mappings, &tsg->gr_ctx_mappings_list,
|
||||
nvgpu_gr_ctx_mappings, tsg_entry) {
|
||||
if (mappings->vm == vm) {
|
||||
return mappings;
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_create_subctx_mappings(
|
||||
struct gk20a *g,
|
||||
struct nvgpu_tsg *tsg,
|
||||
struct vm_gk20a *vm)
|
||||
{
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
mappings = (struct nvgpu_gr_ctx_mappings *)
|
||||
nvgpu_kzalloc(g, sizeof(struct nvgpu_gr_ctx_mappings));
|
||||
if (mappings == NULL) {
|
||||
nvgpu_err(g, "failed to alloc mappings");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
nvgpu_vm_get(vm);
|
||||
mappings->tsg = tsg;
|
||||
mappings->vm = vm;
|
||||
|
||||
nvgpu_init_list_node(&mappings->tsg_entry);
|
||||
nvgpu_init_list_node(&mappings->subctx_list);
|
||||
|
||||
/* add mappings to the list in the tsg */
|
||||
nvgpu_list_add_tail(&mappings->tsg_entry,
|
||||
&tsg->gr_ctx_mappings_list);
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
|
||||
return mappings;
|
||||
}
|
||||
|
||||
void nvgpu_gr_ctx_mappings_add_gr_subctx(struct nvgpu_gr_ctx_mappings *mappings,
|
||||
struct nvgpu_gr_subctx *subctx)
|
||||
{
|
||||
struct nvgpu_gr_subctx *subctx_iter = NULL;
|
||||
struct nvgpu_tsg *tsg = mappings->tsg;
|
||||
struct gk20a *g = tsg->g;
|
||||
bool found = false;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
nvgpu_list_for_each_entry(subctx_iter, &mappings->subctx_list,
|
||||
nvgpu_gr_subctx, gr_ctx_mappings_entry) {
|
||||
if (subctx_iter == subctx) {
|
||||
found = true;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (!found) {
|
||||
subctx->mappings = mappings;
|
||||
nvgpu_list_add_tail(&subctx->gr_ctx_mappings_entry,
|
||||
&mappings->subctx_list);
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
|
||||
void nvgpu_gr_ctx_mappings_free_subctx_mappings(struct nvgpu_tsg_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings, bool unmap)
|
||||
{
|
||||
struct nvgpu_tsg *tsg = mappings->tsg;
|
||||
struct nvgpu_gr_ctx *gr_ctx = tsg->gr_ctx;
|
||||
struct gk20a *g = tsg->g;
|
||||
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer =
|
||||
nvgpu_gr_get_global_ctx_buffer_ptr(g);
|
||||
bool is_sync_veid;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
if (nvgpu_list_empty(&mappings->subctx_list)) {
|
||||
if (unmap) {
|
||||
nvgpu_gr_ctx_unmap_buffers(g,
|
||||
gr_ctx, subctx, global_ctx_buffer, mappings);
|
||||
}
|
||||
|
||||
/* remove mappings from the list in the tsg */
|
||||
nvgpu_list_del(&mappings->tsg_entry);
|
||||
|
||||
nvgpu_gr_ctx_mappings_free(g, mappings);
|
||||
}
|
||||
|
||||
is_sync_veid = nvgpu_tsg_subctx_get_id(subctx) == CHANNEL_INFO_VEID0;
|
||||
|
||||
if (is_sync_veid) {
|
||||
nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched(gr_ctx, false);
|
||||
nvgpu_gr_obj_ctx_set_preempt_buffers_patched(gr_ctx, false);
|
||||
}
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, "done");
|
||||
}
|
||||
|
||||
@@ -53,5 +53,16 @@ struct nvgpu_gr_ctx_mappings {
|
||||
* corresponding to GPU virtual addresses above.
|
||||
*/
|
||||
u32 global_ctx_buffer_index[NVGPU_GR_GLOBAL_CTX_VA_COUNT];
|
||||
|
||||
/**
|
||||
* GR ctx mappings' entry in TSG's (#nvgpu_tsg) mappings list
|
||||
* #gr_ctx_mappings_list.
|
||||
*/
|
||||
struct nvgpu_list_node tsg_entry;
|
||||
|
||||
/**
|
||||
* List of GR subcontexts (#nvgpu_gr_subctx) using this mapping.
|
||||
*/
|
||||
struct nvgpu_list_node subctx_list;
|
||||
};
|
||||
#endif /* NVGPU_GR_CTX_MAPPINGS_PRIV_H */
|
||||
|
||||
@@ -160,6 +160,11 @@ struct nvgpu_gr_ctx {
|
||||
*/
|
||||
u32 sm_diversity_config;
|
||||
#endif
|
||||
|
||||
bool global_ctx_buffers_patched;
|
||||
bool preempt_buffers_patched;
|
||||
bool default_compute_regs_patched;
|
||||
bool default_gfx_regs_patched;
|
||||
};
|
||||
|
||||
#endif /* NVGPU_GR_CTX_PRIV_H */
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <nvgpu/gr/gr_instances.h>
|
||||
#include <nvgpu/channel.h>
|
||||
#include <nvgpu/preempt.h>
|
||||
#include <nvgpu/tsg_subctx.h>
|
||||
|
||||
#include "gr_priv.h"
|
||||
|
||||
@@ -140,22 +141,6 @@ static int nvgpu_gr_setup_validate_channel_and_class(struct gk20a *g,
|
||||
return err;
|
||||
}
|
||||
|
||||
static int nvgpu_gr_setup_alloc_subctx(struct gk20a *g, struct nvgpu_channel *c)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
if (c->subctx == NULL) {
|
||||
c->subctx = nvgpu_gr_subctx_alloc(g, c->vm);
|
||||
if (c->subctx == NULL) {
|
||||
err = -ENOMEM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num,
|
||||
u32 flags)
|
||||
{
|
||||
@@ -165,6 +150,9 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num,
|
||||
int err = 0;
|
||||
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
|
||||
struct nvgpu_gr_ctx_mappings *mappings = NULL;
|
||||
#ifdef CONFIG_NVGPU_FECS_TRACE
|
||||
struct nvgpu_gr_subctx *gr_subctx = NULL;
|
||||
#endif
|
||||
|
||||
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr,
|
||||
"GR%u: allocate object context for channel %u",
|
||||
@@ -195,28 +183,31 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = nvgpu_gr_setup_alloc_subctx(g, c);
|
||||
nvgpu_mutex_acquire(&tsg->ctx_init_lock);
|
||||
|
||||
err = nvgpu_tsg_subctx_alloc_gr_subctx(g, c);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "failed to allocate gr subctx buffer");
|
||||
nvgpu_err(g, "failed to alloc gr subctx");
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
goto out;
|
||||
}
|
||||
|
||||
nvgpu_mutex_acquire(&tsg->ctx_init_lock);
|
||||
err = nvgpu_tsg_subctx_setup_subctx_header(g, c);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "failed to setup subctx header");
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
goto out;
|
||||
}
|
||||
|
||||
gr_ctx = tsg->gr_ctx;
|
||||
|
||||
mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, c->vm);
|
||||
mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, c);
|
||||
if (mappings == NULL) {
|
||||
nvgpu_err(g, "fail to allocate/get ctx mappings struct");
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!nvgpu_mem_is_valid(nvgpu_gr_ctx_get_ctx_mem(gr_ctx,
|
||||
NVGPU_GR_CTX_CTX))) {
|
||||
tsg->vm = c->vm;
|
||||
nvgpu_vm_get(tsg->vm);
|
||||
|
||||
err = nvgpu_gr_obj_ctx_alloc(g, gr->golden_image,
|
||||
gr->global_ctx_buffer, gr->gr_ctx_desc,
|
||||
gr->config, gr_ctx, c->subctx,
|
||||
@@ -225,24 +216,20 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num,
|
||||
if (err != 0) {
|
||||
nvgpu_err(g,
|
||||
"failed to allocate gr ctx buffer");
|
||||
nvgpu_gr_ctx_free_mappings(g, gr_ctx);
|
||||
nvgpu_mutex_release(&tsg->ctx_init_lock);
|
||||
nvgpu_vm_put(tsg->vm);
|
||||
tsg->vm = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
nvgpu_gr_ctx_set_tsgid(gr_ctx, tsg->tsgid);
|
||||
} else {
|
||||
/* commit gr ctx buffer */
|
||||
nvgpu_gr_obj_ctx_commit_inst(g, &c->inst_block, gr_ctx,
|
||||
c->subctx, mappings);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVGPU_FECS_TRACE
|
||||
if (g->ops.gr.fecs_trace.bind_channel && !c->vpr) {
|
||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
|
||||
gr_subctx = nvgpu_tsg_subctx_get_gr_subctx(c->subctx);
|
||||
}
|
||||
|
||||
err = g->ops.gr.fecs_trace.bind_channel(g, &c->inst_block,
|
||||
c->subctx, gr_ctx, mappings, tsg->tgid, 0);
|
||||
gr_subctx, gr_ctx, mappings, tsg->tgid, 0);
|
||||
if (err != 0) {
|
||||
nvgpu_warn(g,
|
||||
"fail to bind channel for ctxsw trace");
|
||||
@@ -274,11 +261,6 @@ int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num,
|
||||
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
|
||||
return 0;
|
||||
out:
|
||||
if (c->subctx != NULL) {
|
||||
nvgpu_gr_subctx_free(g, c->subctx, c->vm);
|
||||
c->subctx = NULL;
|
||||
}
|
||||
|
||||
/* 1. gr_ctx, patch_ctx and global ctx buffer mapping
|
||||
can be reused so no need to release them.
|
||||
2. golden image init and load is a one time thing so if
|
||||
@@ -320,13 +302,12 @@ void nvgpu_gr_setup_free_subctx(struct nvgpu_channel *c)
|
||||
return;
|
||||
}
|
||||
|
||||
if (c->subctx != NULL) {
|
||||
nvgpu_gr_subctx_free(c->g, c->subctx, c->vm);
|
||||
c->subctx = NULL;
|
||||
}
|
||||
nvgpu_gr_subctx_free(c->g, c->subctx, c->vm, true);
|
||||
|
||||
nvgpu_log_fn(c->g, "done");
|
||||
}
|
||||
|
||||
static bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode,
|
||||
bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode,
|
||||
u32 *compute_preempt_mode,
struct nvgpu_gr_ctx *gr_ctx)
{
@@ -383,9 +364,19 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch,

gr_ctx = tsg->gr_ctx;

nvgpu_mutex_acquire(&tsg->ctx_init_lock);

g->ops.tsg.disable(tsg);

err = nvgpu_preempt_channel(g, ch);
if (err != 0) {
nvgpu_err(g, "failed to preempt channel/TSG");
goto enable_ch;
}

if (nvgpu_gr_setup_validate_preemption_mode(&graphics_preempt_mode,
&compute_preempt_mode, gr_ctx) == false) {
return 0;
goto enable_ch;
}

nvgpu_log(g, gpu_dbg_gr | gpu_dbg_sched, "chid=%d tsgid=%d pid=%d "
@@ -398,13 +389,14 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch,
graphics_preempt_mode, compute_preempt_mode);
if (err != 0) {
nvgpu_err(g, "set_ctxsw_preemption_mode failed");
return err;
goto enable_ch;
}

mappings = nvgpu_gr_ctx_get_mappings(tsg);
mappings = nvgpu_gr_ctx_get_mappings(tsg, ch);
if (mappings == NULL) {
nvgpu_err(g, "failed to get gr_ctx mappings");
return -EINVAL;
err = -EINVAL;
goto enable_ch;
}

#ifdef CONFIG_NVGPU_GFXP
@@ -412,29 +404,21 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch,
gr->gr_ctx_desc, gr_ctx);
if (err != 0) {
nvgpu_err(g, "fail to allocate ctx preemption buffers");
return err;
goto enable_ch;
}

err = nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(g,
gr_ctx, mappings);
gr_ctx, ch->subctx, mappings);
if (err != 0) {
nvgpu_err(g, "fail to map ctx preemption buffers");
return err;
}
#endif

g->ops.tsg.disable(tsg);

err = nvgpu_preempt_channel(g, ch);
if (err != 0) {
nvgpu_err(g, "failed to preempt channel/TSG");
goto enable_ch;
}
#endif

nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(g, gr->config, gr_ctx,
ch->subctx, mappings);

if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (nvgpu_gr_obj_ctx_is_gfx_engine(g, ch->subctx)) {
nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true);
g->ops.gr.init.commit_global_cb_manager(g, gr->config, gr_ctx,
true);
@@ -443,9 +427,12 @@ int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch,

g->ops.tsg.enable(tsg);

nvgpu_mutex_release(&tsg->ctx_init_lock);

return err;

enable_ch:
g->ops.tsg.enable(tsg);
nvgpu_mutex_release(&tsg->ctx_init_lock);
return err;
}

@@ -85,14 +85,12 @@ struct nvgpu_gr_zbc *nvgpu_gr_get_zbc_ptr(struct gk20a *g)
}
#endif

#ifdef CONFIG_NVGPU_FECS_TRACE
struct nvgpu_gr_global_ctx_buffer_desc *nvgpu_gr_get_global_ctx_buffer_ptr(
struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->global_ctx_buffer;
}
#endif

#ifdef CONFIG_NVGPU_CILP
u32 nvgpu_gr_get_cilp_preempt_pending_chid(struct gk20a *g)

@@ -31,10 +31,13 @@
#endif
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/ctx_mappings.h>
#include <nvgpu/tsg_subctx.h>
#include <nvgpu/gr/setup.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/global_ctx.h>
#include <nvgpu/gr/obj_ctx.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/netlist.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/gr/fs_state.h>
@@ -54,16 +57,46 @@ void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g,
g->ops.ramin.set_gr_ptr(g, inst_block, gpu_va);
}

#ifdef CONFIG_NVGPU_DEBUGGER
static void nvgpu_gr_obj_ctx_set_pm_ctx_gpu_va(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *tsg_subctx)
{
struct nvgpu_gr_subctx *subctx;
bool set_pm_ctx_gpu_va;

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");

set_pm_ctx_gpu_va = nvgpu_gr_ctx_get_pm_ctx_pm_mode(gr_ctx) !=
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw();

if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
subctx = nvgpu_tsg_subctx_get_gr_subctx(tsg_subctx);
nvgpu_gr_subctx_set_hwpm_ptr(g, subctx,
set_pm_ctx_gpu_va);
} else {
nvgpu_gr_ctx_set_hwpm_ptr(g, gr_ctx, set_pm_ctx_gpu_va);
}

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
#endif

void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *tsg_subctx,
struct nvgpu_gr_ctx_mappings *mappings)
{
struct nvgpu_gr_subctx *subctx;
struct nvgpu_mem *ctxheader;
u64 gpu_va;

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");

#ifdef CONFIG_NVGPU_DEBUGGER
nvgpu_gr_obj_ctx_set_pm_ctx_gpu_va(g, gr_ctx, tsg_subctx);
#endif

if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
subctx = nvgpu_tsg_subctx_get_gr_subctx(tsg_subctx);
nvgpu_gr_subctx_load_ctx_header(g, subctx, gr_ctx, mappings);

ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx);
@@ -78,7 +111,50 @@ void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block,
}

#if defined(CONFIG_NVGPU_GFXP) || defined(CONFIG_NVGPU_CILP)
static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g,
static void nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_ctx_desc *gr_ctx_desc,
u32 class_num, u32 flags,
u32 *graphics_preempt_mode, u32 *compute_preempt_mode)
{
u32 default_graphics_preempt_mode = 0U;
u32 default_compute_preempt_mode = 0U;

g->ops.gr.init.get_default_preemption_modes(
&default_graphics_preempt_mode,
&default_compute_preempt_mode);

#ifdef CONFIG_NVGPU_GFXP
if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) != 0U) {
*graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP;
}

if (g->ops.gpu_class.is_valid_gfx(class_num) &&
nvgpu_gr_ctx_desc_force_preemption_gfxp(gr_ctx_desc)) {
*graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP;
}
#endif

#ifdef CONFIG_NVGPU_CILP
if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) != 0U) {
*compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP;
}

if (g->ops.gpu_class.is_valid_compute(class_num) &&
nvgpu_gr_ctx_desc_force_preemption_cilp(gr_ctx_desc)) {
*compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP;
}
#endif

if (*compute_preempt_mode == 0U) {
*compute_preempt_mode = default_compute_preempt_mode;
}

if (*graphics_preempt_mode == 0U) {
*graphics_preempt_mode = default_graphics_preempt_mode;
}
}

static int nvgpu_gr_obj_ctx_init_ctxsw_preemption(struct gk20a *g,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_ctx *gr_ctx,
u32 class_num, u32 flags)
@@ -86,8 +162,6 @@ static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g,
int err;
u32 graphics_preempt_mode = 0U;
u32 compute_preempt_mode = 0U;
u32 default_graphics_preempt_mode = 0U;
u32 default_compute_preempt_mode = 0U;

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");

@@ -97,38 +171,26 @@ static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g,
return 0;
}

g->ops.gr.init.get_default_preemption_modes(
&default_graphics_preempt_mode,
&default_compute_preempt_mode);

if (nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) {
#ifdef CONFIG_NVGPU_GFXP
if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) != 0U) {
graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP;
}

if (g->ops.gpu_class.is_valid_gfx(class_num) &&
nvgpu_gr_ctx_desc_force_preemption_gfxp(gr_ctx_desc)) {
graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP;
}
#endif

#ifdef CONFIG_NVGPU_CILP
if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) != 0U) {
compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP;
}

if (g->ops.gpu_class.is_valid_compute(class_num) &&
nvgpu_gr_ctx_desc_force_preemption_cilp(gr_ctx_desc)) {
compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP;
}
#endif

if (compute_preempt_mode == 0U) {
compute_preempt_mode = default_compute_preempt_mode;
if (nvgpu_gr_setup_validate_preemption_mode(&graphics_preempt_mode,
&compute_preempt_mode, gr_ctx) == false) {
return 0;
}

if (graphics_preempt_mode == 0U) {
graphics_preempt_mode = default_graphics_preempt_mode;
} else {
nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(g, gr_ctx_desc,
class_num, flags, &graphics_preempt_mode,
&compute_preempt_mode);
}

err = nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(g, config,
@@ -266,43 +328,15 @@ fail:
return err;
}

void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
#ifdef CONFIG_NVGPU_GFXP
static void nvgpu_gr_obj_ctx_commit_veid0_preemption_buffers(struct gk20a *g,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_ctx_mappings *mappings)
{
#ifdef CONFIG_NVGPU_GFXP
u64 addr;
u32 size;
struct nvgpu_mem *mem;
#endif

(void)config;
(void)subctx;
(void)mappings;

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");

nvgpu_gr_ctx_set_preemption_modes(g, gr_ctx);

#ifdef CONFIG_NVGPU_GFXP
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_PREEMPTION_GFXP)) {
goto done;
}

if (!nvgpu_mem_is_valid(
nvgpu_gr_ctx_get_ctx_mem(gr_ctx,
NVGPU_GR_CTX_PREEMPT_CTXSW))) {
goto done;
}

if (subctx != NULL) {
nvgpu_gr_subctx_set_preemption_buffer_va(g, subctx, mappings);
} else {
nvgpu_gr_ctx_set_preemption_buffer_va(g, gr_ctx, mappings);
}

nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true);

addr = nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_BETACB_CTXSW);
g->ops.gr.init.commit_global_attrib_cb(g, gr_ctx, mappings,
@@ -324,6 +358,75 @@ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
size = (u32)mem->size;

g->ops.gr.init.commit_ctxsw_spill(g, gr_ctx, addr, size, true);
}
#endif

bool nvgpu_gr_obj_ctx_is_gfx_engine(struct gk20a *g, struct nvgpu_tsg_subctx *subctx)
{
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG) &&
nvgpu_tsg_subctx_get_id(subctx) == CHANNEL_INFO_VEID0) {
return true;
}
} else if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
return true;
}

return false;
}

void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings)
{
(void)config;
(void)subctx;
(void)mappings;

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");

nvgpu_gr_ctx_set_preemption_modes(g, gr_ctx);

#ifdef CONFIG_NVGPU_GFXP
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_PREEMPTION_GFXP)) {
goto done;
}

if (!nvgpu_mem_is_valid(
nvgpu_gr_ctx_get_ctx_mem(gr_ctx,
NVGPU_GR_CTX_PREEMPT_CTXSW))) {
goto done;
}

/*
* Commit NVGPU_GR_CTX_PREEMPT_CTXSW gpu va for all subcontexts
* considering VEID0 gpu va when subcontexts are enabled.
*/
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
nvgpu_tsg_subctxs_set_preemption_buffer_va(subctx);
} else {
nvgpu_gr_ctx_set_preemption_buffer_va(g, gr_ctx);
}

if (!nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) {
goto done;
}

if (nvgpu_gr_obj_ctx_preempt_buffers_patched(gr_ctx)) {
goto done;
}

nvgpu_gr_obj_ctx_set_preempt_buffers_patched(gr_ctx, true);

/*
* Commit other preemption buffers only for VEID0 when subcontexts are
* enabled. Commit always when subcontext are disabled.
*/
nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true);

nvgpu_gr_obj_ctx_commit_veid0_preemption_buffers(g, config,
gr_ctx, mappings);

g->ops.gr.init.commit_cbes_reserve(g, gr_ctx, true);
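The hunk above makes two distinct decisions: the NVGPU_GR_CTX_PREEMPT_CTXSW GPU VA is published to every subcontext (using the VEID0 mapping), while the remaining GFXP buffers (spill, betacb, pagepool, cbes reserve) are patched once, only for the gfx-capable VEID0 context and never under MIG. A minimal standalone C model of that gating logic follows, for illustration only; it uses plain bools and invented names rather than the real nvgpu_is_enabled()/nvgpu_tsg_subctx_get_id() helpers and is not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for driver state; not nvgpu APIs. */
struct model_ctx {
	bool subctx_enabled;   /* NVGPU_SUPPORT_TSG_SUBCONTEXTS */
	bool mig_enabled;      /* NVGPU_SUPPORT_MIG */
	unsigned int veid;     /* subcontext id; 0 == VEID0 */
};

/* Mirrors the intent of nvgpu_gr_obj_ctx_is_gfx_engine(): only the
 * VEID0 subcontext (or the sole context when subcontexts are off)
 * gets the graphics-only preemption buffers committed, never under MIG. */
static bool model_is_gfx_engine(const struct model_ctx *c)
{
	if (c->mig_enabled) {
		return false;
	}
	return !c->subctx_enabled || c->veid == 0U;
}

int main(void)
{
	struct model_ctx veid1 = { true, false, 1U };

	/* PREEMPT_CTXSW VA: published to every subcontext. */
	printf("commit preempt VA: yes\n");
	/* Spill/betacb/pagepool patches: VEID0 only. */
	printf("commit veid0 buffers: %s\n",
	       model_is_gfx_engine(&veid1) ? "yes" : "no");
	return 0;
}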
@@ -346,6 +449,7 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings,
bool patch)
{
@@ -363,7 +467,11 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g,
* Skip BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB
* if 2D/3D/I2M classes(graphics) are not supported.
*/
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) {
if (patch && nvgpu_gr_obj_ctx_global_ctx_buffers_patched(gr_ctx)) {
goto commit_sm_id;
}

/* global pagepool buffer */
addr = nvgpu_gr_ctx_mappings_get_global_ctx_va(mappings,
NVGPU_GR_GLOBAL_CTX_PAGEPOOL_VA);
@@ -403,6 +511,16 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g,
g->ops.gr.init.commit_rtv_cb(g, addr, gr_ctx, patch);
}
#endif

if (patch) {
nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched(gr_ctx,
true);
}
}

commit_sm_id:
if (patch && nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) {
goto out;
}

#ifdef CONFIG_NVGPU_SM_DIVERSITY
@@ -427,6 +545,7 @@ void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g,
}
#endif

out:
if (patch) {
nvgpu_gr_ctx_patch_write_end(g, gr_ctx, false);
}
@@ -561,7 +680,7 @@ clean_up:
static int nvgpu_gr_obj_ctx_commit_hw_state(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_ctx_mappings *mappings)
struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings)
{
int err = 0;
struct netlist_av_list *sw_method_init =
@@ -577,7 +696,7 @@ static int nvgpu_gr_obj_ctx_commit_hw_state(struct gk20a *g,
g->ops.gr.init.fe_go_idle_timeout(g, false);

nvgpu_gr_obj_ctx_commit_global_ctx_buffers(g, global_ctx_buffer,
config, gr_ctx, mappings, false);
config, gr_ctx, subctx, mappings, false);

if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
/* override a few ctx state registers */
@@ -722,6 +841,7 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings,
struct nvgpu_mem *inst_block)
{
@@ -745,13 +865,13 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g,
}

err = nvgpu_gr_obj_ctx_commit_hw_state(g, global_ctx_buffer,
config, gr_ctx, mappings);
config, gr_ctx, subctx, mappings);
if (err != 0) {
goto clean_up;
}

#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (nvgpu_gr_obj_ctx_is_gfx_engine(g, subctx)) {
err = nvgpu_gr_ctx_init_zcull(g, gr_ctx);
if (err != 0) {
goto clean_up;
@@ -832,14 +952,14 @@ static int nvgpu_gr_obj_ctx_alloc_buffers(struct gk20a *g,

nvgpu_log(g, gpu_dbg_gr, " ");

if (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) {
nvgpu_gr_obj_ctx_gr_ctx_set_size(g, golden_image, gr_ctx_desc);

nvgpu_gr_obj_ctx_patch_ctx_set_size(g, config, gr_ctx_desc);

nvgpu_gr_ctx_set_patch_ctx_data_count(gr_ctx, 0);
}

#if defined(CONFIG_NVGPU_GFXP) || defined(CONFIG_NVGPU_CILP)
err = nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(g, config,
err = nvgpu_gr_obj_ctx_init_ctxsw_preemption(g, config,
gr_ctx_desc, gr_ctx, class_num, flags);
if (err != 0) {
nvgpu_err(g, "fail to init preemption mode");
@@ -982,13 +1102,54 @@ out:
return err;
}

static int nvgpu_gr_obj_ctx_load_golden_image(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings,
struct nvgpu_mem *inst_block,
bool cde)
{
int err;

/* init golden image */
err = nvgpu_gr_obj_ctx_alloc_golden_ctx_image(g, golden_image,
global_ctx_buffer, config, gr_ctx, subctx,
mappings, inst_block);
if (err != 0) {
nvgpu_err(g, "fail to init golden ctx image");
return err;
}

#ifdef CONFIG_NVGPU_POWER_PG
/* Re-enable ELPG now that golden image has been initialized.
* The PMU PG init code may already have tried to enable elpg, but
* would not have been able to complete this action since the golden
* image hadn't been initialized yet, so do this now.
*/
err = nvgpu_pmu_reenable_elpg(g);
if (err != 0) {
nvgpu_err(g, "fail to re-enable elpg");
return err;
}
#endif

/* load golden image */
nvgpu_gr_ctx_load_golden_ctx_image(g, gr_ctx, mappings,
golden_image->local_golden_image, cde);

return 0;
}

int nvgpu_gr_obj_ctx_alloc(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_subctx *subctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings,
struct nvgpu_mem *inst_block,
u32 class_num, u32 flags,
@@ -1005,9 +1166,11 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g,
goto out;
}

if (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) {
nvgpu_gr_ctx_init_ctx_buffers_mapping_flags(g, gr_ctx);
}

err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx,
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, subctx,
global_ctx_buffer, mappings, vpr);
if (err != 0) {
nvgpu_err(g, "failed to map ctx buffers");
@@ -1015,52 +1178,42 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g,
}

nvgpu_gr_obj_ctx_commit_global_ctx_buffers(g, global_ctx_buffer,
config, gr_ctx, mappings, true);
config, gr_ctx, subctx, mappings, true);

/* commit gr ctx buffer */
nvgpu_gr_obj_ctx_commit_inst(g, inst_block, gr_ctx, subctx, mappings);

/* init golden image */
err = nvgpu_gr_obj_ctx_alloc_golden_ctx_image(g, golden_image,
global_ctx_buffer, config, gr_ctx, mappings, inst_block);
if (!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx)) {
err = nvgpu_gr_obj_ctx_load_golden_image(g, golden_image,
global_ctx_buffer, config, gr_ctx, subctx,
mappings, inst_block, cde);
if (err != 0) {
nvgpu_err(g, "fail to init golden ctx image");
nvgpu_err(g, "fail to load golden ctx image");
goto out;
}

#ifdef CONFIG_NVGPU_POWER_PG
/* Re-enable ELPG now that golden image has been initialized.
* The PMU PG init code may already have tried to enable elpg, but
* would not have been able to complete this action since the golden
* image hadn't been initialized yet, so do this now.
*/

err = nvgpu_pmu_reenable_elpg(g);
if (err != 0) {
nvgpu_err(g, "fail to re-enable elpg");
goto out;
}
#endif

/* load golden image */
nvgpu_gr_ctx_load_golden_ctx_image(g, gr_ctx, mappings,
golden_image->local_golden_image, cde);

nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(g, config, gr_ctx,
subctx, mappings);

#ifndef CONFIG_NVGPU_NON_FUSA
if (g->ops.gpu_class.is_valid_compute(class_num) &&
g->ops.gr.init.set_default_compute_regs != NULL) {
(g->ops.gr.init.set_default_compute_regs != NULL) &&
(!nvgpu_gr_obj_ctx_default_compute_regs_patched(gr_ctx))) {
g->ops.gr.init.set_default_compute_regs(g, gr_ctx);
nvgpu_gr_obj_ctx_set_default_compute_regs_patched(gr_ctx, true);
}

if (g->ops.ltc.set_default_l2_max_ways_evict_last != NULL) {
if ((g->ops.ltc.set_default_l2_max_ways_evict_last != NULL) &&
(!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx))) {
g->ops.ltc.set_default_l2_max_ways_evict_last(g, gr_ctx);
}
#endif

#ifdef CONFIG_NVGPU_NON_FUSA
if (g->ops.gr.init.enable_mme_config_ptimer != NULL) {
if ((g->ops.gr.init.enable_mme_config_ptimer != NULL) &&
(!nvgpu_gr_ctx_get_ctx_initialized(gr_ctx))) {

err = nvgpu_pg_elpg_protected_call(g,
g->ops.gr.init.enable_mme_config_ptimer(g, gr_ctx));

@@ -1076,8 +1229,10 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g,
* required for graphics contexts.
*/
if (g->ops.gpu_class.is_valid_gfx(class_num) &&
g->ops.gr.init.set_default_gfx_regs != NULL) {
(g->ops.gr.init.set_default_gfx_regs != NULL) &&
(!nvgpu_gr_obj_ctx_default_gfx_regs_patched(gr_ctx))) {
g->ops.gr.init.set_default_gfx_regs(g, gr_ctx, &golden_image->gfx_regs);
nvgpu_gr_obj_ctx_set_default_gfx_regs_patched(gr_ctx, true);
}

nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");

@@ -21,6 +21,7 @@
*/

#include <nvgpu/gk20a.h>
#include <nvgpu/tsg_subctx.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/ctx_mappings.h>
@@ -30,17 +31,16 @@

#include "common/gr/subctx_priv.h"

struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g,
int nvgpu_gr_subctx_setup_header(struct gk20a *g,
struct nvgpu_gr_subctx *subctx,
struct vm_gk20a *vm)
{
struct nvgpu_gr_subctx *subctx;
int err = 0;

nvgpu_log_fn(g, " ");
nvgpu_log(g, gpu_dbg_gr, " ");

subctx = nvgpu_kzalloc(g, sizeof(*subctx));
if (subctx == NULL) {
return NULL;
if (subctx->ctx_header.gpu_va != 0ULL) {
return 0;
}

err = nvgpu_dma_alloc_sys(g,
@@ -48,7 +48,7 @@ struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g,
&subctx->ctx_header);
if (err != 0) {
nvgpu_err(g, "failed to allocate sub ctx header");
goto err_free_subctx;
return err;
}

subctx->ctx_header.gpu_va = nvgpu_gmmu_map(vm,
@@ -58,26 +58,65 @@ struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g,
subctx->ctx_header.aperture);
if (subctx->ctx_header.gpu_va == 0ULL) {
nvgpu_err(g, "failed to map ctx header");
err = -ENOMEM;
goto err_free_ctx_header;
}

return subctx;
nvgpu_log(g, gpu_dbg_gr, "done");

return 0;

err_free_ctx_header:
nvgpu_dma_free(g, &subctx->ctx_header);
err_free_subctx:
nvgpu_kfree(g, subctx);
return err;
}

struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g)
{
struct nvgpu_gr_subctx *subctx;

nvgpu_log(g, gpu_dbg_gr, " ");

subctx = nvgpu_kzalloc(g, sizeof(*subctx));
if (subctx == NULL) {
return NULL;
}

nvgpu_init_list_node(&subctx->gr_ctx_mappings_entry);

nvgpu_log(g, gpu_dbg_gr, "done");

return subctx;
}

void nvgpu_gr_subctx_free(struct gk20a *g,
struct nvgpu_gr_subctx *subctx,
struct vm_gk20a *vm)
struct nvgpu_tsg_subctx *subctx,
struct vm_gk20a *vm,
bool unmap)
{
nvgpu_log_fn(g, " ");
struct nvgpu_gr_subctx *gr_subctx =
nvgpu_tsg_subctx_get_gr_subctx(subctx);

nvgpu_dma_unmap_free(vm, &subctx->ctx_header);
nvgpu_kfree(g, subctx);
nvgpu_log(g, gpu_dbg_gr, " ");

if (gr_subctx == NULL) {
return;
}

if (gr_subctx->mappings != NULL) {
nvgpu_list_del(&gr_subctx->gr_ctx_mappings_entry);
nvgpu_gr_ctx_mappings_free_subctx_mappings(subctx,
gr_subctx->mappings, unmap);
gr_subctx->mappings = NULL;
}

if (unmap) {
nvgpu_dma_unmap_free(vm, &gr_subctx->ctx_header);
}

nvgpu_kfree(g, gr_subctx);

nvgpu_log(g, gpu_dbg_gr, "done");
}

void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g,
@@ -100,11 +139,6 @@ void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g,
g->ops.gr.ctxsw_prog.set_patch_addr(g, ctxheader,
nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_PATCH_CTX));

#ifdef CONFIG_NVGPU_DEBUGGER
g->ops.gr.ctxsw_prog.set_pm_ptr(g, ctxheader,
nvgpu_gr_ctx_mappings_get_ctx_va(mappings, NVGPU_GR_CTX_PM_CTX));
#endif

#ifdef CONFIG_NVGPU_GRAPHICS
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, ctxheader,
nvgpu_gr_ctx_get_zcull_ctx_va(gr_ctx));
@@ -120,6 +154,16 @@ struct nvgpu_mem *nvgpu_gr_subctx_get_ctx_header(struct nvgpu_gr_subctx *subctx)
return &subctx->ctx_header;
}

struct nvgpu_gr_ctx_mappings *nvgpu_gr_subctx_get_mappings(
struct nvgpu_gr_subctx *subctx)
{
if (subctx == NULL) {
return NULL;
}

return subctx->mappings;
}

#ifdef CONFIG_NVGPU_GRAPHICS
void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx)
@@ -134,25 +178,59 @@ void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx

#ifdef CONFIG_NVGPU_GFXP
void nvgpu_gr_subctx_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings)
struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx_mappings *veid0_mappings)
{
u64 preempt_ctxsw_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(mappings,
u64 preempt_ctxsw_veid0_gpu_va;
u64 preempt_ctxsw_gpu_va;
struct nvgpu_mem *ctxheader;

ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx);

preempt_ctxsw_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(
subctx->mappings,
NVGPU_GR_CTX_PREEMPT_CTXSW);

g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, &subctx->ctx_header,
preempt_ctxsw_veid0_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(
veid0_mappings,
NVGPU_GR_CTX_PREEMPT_CTXSW);

g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, ctxheader,
preempt_ctxsw_gpu_va);

if (g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0 != NULL) {
g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0(g,
&subctx->ctx_header, preempt_ctxsw_gpu_va);
ctxheader, preempt_ctxsw_veid0_gpu_va);
}
}

void nvgpu_gr_subctx_clear_preemption_buffer_va(struct gk20a *g,
struct nvgpu_gr_subctx *subctx)
{
struct nvgpu_mem *ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx);

g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, ctxheader, 0ULL);

if (g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0 != NULL) {
g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0(g,
ctxheader, 0ULL);
}
}
#endif /* CONFIG_NVGPU_GFXP */

#ifdef CONFIG_NVGPU_DEBUGGER
void nvgpu_gr_subctx_set_hwpm_ptr(struct gk20a *g,
struct nvgpu_gr_subctx *subctx, u64 pm_ctx_gpu_va)
struct nvgpu_gr_subctx *subctx,
bool set_pm_ctx_gpu_va)
{
u64 pm_ctx_gpu_va = 0ULL;

if (set_pm_ctx_gpu_va) {
pm_ctx_gpu_va = nvgpu_gr_ctx_mappings_get_ctx_va(
subctx->mappings,
NVGPU_GR_CTX_PM_CTX);
}

g->ops.gr.ctxsw_prog.set_pm_ptr(g, &subctx->ctx_header,
pm_ctx_gpu_va);
}
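The reworked nvgpu_gr_subctx_set_hwpm_ptr() above now takes a boolean instead of a raw GPU VA: each subcontext resolves the PM buffer address from its own mappings, or programs 0 when PM context switching is disabled. The following small C sketch models that selection with invented names; it is an illustration under that assumption, not driver code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-subcontext mapping record; not the nvgpu type. */
struct model_mappings {
	uint64_t pm_ctx_gpu_va;
};

/* Resolve the PM pointer the way the patched helper does: use the
 * subcontext's own mapping when enabled, otherwise write 0 so the
 * ctxsw header disables PM save/restore. */
static uint64_t model_resolve_pm_ptr(const struct model_mappings *m,
				     int set_pm_ctx_gpu_va)
{
	return set_pm_ctx_gpu_va ? m->pm_ctx_gpu_va : 0ULL;
}

int main(void)
{
	struct model_mappings m = { 0x100000ULL };

	printf("enabled:  0x%llx\n",
	       (unsigned long long)model_resolve_pm_ptr(&m, 1));
	printf("disabled: 0x%llx\n",
	       (unsigned long long)model_resolve_pm_ptr(&m, 0));
	return 0;
}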
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -27,14 +27,23 @@ struct nvgpu_mem;

/**
* GR subcontext data structure.
*
* One subcontext is allocated per GPU channel.
*/
struct nvgpu_gr_subctx {
/**
* Memory to hold subcontext header image.
*/
struct nvgpu_mem ctx_header;

/**
* GPU mappings of the GR ctx buffers for this subcontext.
*/
struct nvgpu_gr_ctx_mappings *mappings;

/**
* GR subcontext's entry in gr ctx mappings' (#nvgpu_gr_ctx_mappings)
* subcontexts list #subctx_list.
*/
struct nvgpu_list_node gr_ctx_mappings_entry;
};

#endif /* NVGPU_GR_SUBCTX_PRIV_H */

@@ -23,6 +23,7 @@
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/tsg_subctx.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/zcull.h>
@@ -159,15 +160,17 @@ int nvgpu_gr_zcull_init_hw(struct gk20a *g,
return 0;
}

int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx,
int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx)
{
struct nvgpu_gr_subctx *gr_subctx;
int ret = 0;

if (subctx != NULL) {
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
gr_subctx = nvgpu_tsg_subctx_get_gr_subctx(subctx);
ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, false);
if (ret == 0) {
nvgpu_gr_subctx_zcull_setup(g, subctx, gr_ctx);
nvgpu_gr_subctx_zcull_setup(g, gr_subctx, gr_ctx);
}
} else {
ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, true);

@@ -50,6 +50,8 @@
#include <nvgpu/cyclestats_snapshot.h>
#include <nvgpu/power_features/pg.h>

#include <nvgpu/tsg_subctx.h>

#include "gr_vgpu.h"
#include "ctx_vgpu.h"
#include "subctx_vgpu.h"
@@ -173,6 +175,7 @@ int vgpu_gr_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags)
struct nvgpu_tsg *tsg = NULL;
struct tegra_vgpu_cmd_msg msg = {};
struct tegra_vgpu_alloc_obj_ctx_params *p = &msg.params.alloc_obj_ctx;
struct nvgpu_gr_ctx_mappings *mappings = NULL;
int err = 0;

nvgpu_log_fn(g, " ");
@@ -211,11 +214,27 @@ int vgpu_gr_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags)
gr_ctx = tsg->gr_ctx;

nvgpu_mutex_acquire(&tsg->ctx_init_lock);
if (tsg->vm == NULL) {
tsg->vm = c->vm;
nvgpu_vm_get(tsg->vm);
gr_ctx->tsgid = tsg->tsgid;

/*
* gr_subctx and mappings are allocated/setup here just to track the
* VM references. When a new mapping is created VM reference is taken.
* It will be dropped when the last channel in the subcontext is
* released.
*/
err = nvgpu_tsg_subctx_alloc_gr_subctx(g, c);
if (err != 0) {
nvgpu_err(g, "failed to alloc gr subctx");
nvgpu_mutex_release(&tsg->ctx_init_lock);
return err;
}

mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, c);
if (mappings == NULL) {
nvgpu_err(g, "fail to allocate/get ctx mappings struct");
nvgpu_mutex_release(&tsg->ctx_init_lock);
return -ENOMEM;
}

nvgpu_mutex_release(&tsg->ctx_init_lock);

msg.cmd = TEGRA_VGPU_CMD_ALLOC_OBJ_CTX;
@@ -234,6 +253,7 @@ int vgpu_gr_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num, u32 flags)
err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
err = err ? err : msg.ret;
if (err == 0) {
gr_ctx->tsgid = tsg->tsgid;
nvgpu_gr_ctx_mark_ctx_initialized(gr_ctx);
} else {
nvgpu_err(g, "alloc obj ctx failed err %d", err);
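The vgpu comment in the hunk above explains why gr_subctx and mappings objects exist on the virtual driver even though nothing is mapped there: they pin the channel's VM for the lifetime of the gr_ctx mappings. A toy refcount model of that get-on-first-mapping / put-on-last-channel behaviour, with invented names and no nvgpu APIs, is sketched below for illustration.

#include <assert.h>

/* Toy VM with a plain reference counter; not struct vm_gk20a. */
struct model_vm {
	int refs;
};

/* Taken when a mappings object is created for this VM. */
static void model_vm_get(struct model_vm *vm) { vm->refs++; }

/* Dropped when the last channel in the subcontext goes away and the
 * mappings object is freed. */
static void model_vm_put(struct model_vm *vm) { vm->refs--; }

int main(void)
{
	struct model_vm vm = { 1 };  /* reference held by the channel */

	model_vm_get(&vm);           /* mappings created (ALLOC_OBJ_CTX) */
	model_vm_put(&vm);           /* last channel unbound */
	assert(vm.refs == 1);
	return 0;
}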
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -20,35 +20,22 @@
* DEALINGS IN THE SOFTWARE.
*/

#include <nvgpu/vgpu/vgpu.h>
#include <nvgpu/vgpu/tegra_vgpu.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/tsg_subctx.h>
#include <nvgpu/gr/subctx.h>

#include "common/gr/subctx_priv.h"
#include <nvgpu/channel.h>
#include <nvgpu/log.h>

#include "subctx_vgpu.h"
#include "common/vgpu/ivc/comm_vgpu.h"

void vgpu_gr_setup_free_subctx(struct nvgpu_channel *c)
{
struct tegra_vgpu_cmd_msg msg = {};
struct tegra_vgpu_free_ctx_header_params *p =
&msg.params.free_ctx_header;
struct gk20a *g = c->g;
int err;
nvgpu_log(c->g, gpu_dbg_gr, " ");

msg.cmd = TEGRA_VGPU_CMD_FREE_CTX_HEADER;
msg.handle = vgpu_get_handle(g);
p->ch_handle = c->virt_ctx;
err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
err = err ? err : msg.ret;
if (unlikely(err != 0)) {
nvgpu_err(g, "free ctx_header failed err %d", err);
if (!nvgpu_is_enabled(c->g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
return;
}

if (c->subctx != NULL) {
nvgpu_kfree(g, c->subctx);
c->subctx = NULL;
}
nvgpu_gr_subctx_free(c->g, c->subctx, c->vm, false);

nvgpu_log(c->g, gpu_dbg_gr, "done");
}

@@ -46,6 +46,7 @@
#include <nvgpu/gr/hwpm_map.h>
#include <nvgpu/preempt.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/tsg_subctx.h>

#include "gr_gk20a.h"
#include "gr_pri_gk20a.h"
@@ -82,15 +83,16 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
struct nvgpu_tsg *tsg,
u32 mode)
{
struct nvgpu_channel *ch;
bool set_pm_ctx_gpu_va = false;
struct nvgpu_gr_ctx *gr_ctx;
bool skip_update = false;
u64 pm_ctx_gpu_va = 0ULL;
int ret;
struct nvgpu_gr *gr = nvgpu_gr_get_instance_ptr(g, gr_instance_id);

nvgpu_log_fn(g, " ");

nvgpu_mutex_acquire(&tsg->ctx_init_lock);

gr_ctx = tsg->gr_ctx;

if (mode != NVGPU_GR_CTX_HWPM_CTXSW_MODE_NO_CTXSW) {
@@ -99,6 +101,7 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
if (ret != 0) {
nvgpu_err(g,
"failed to allocate and map pm ctxt buffer");
nvgpu_mutex_release(&tsg->ctx_init_lock);
return ret;
}

@@ -109,11 +112,14 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
}

ret = nvgpu_gr_ctx_prepare_hwpm_mode(g, gr_ctx, mode,
&pm_ctx_gpu_va, &skip_update);
&set_pm_ctx_gpu_va, &skip_update);
if (ret != 0) {
nvgpu_mutex_release(&tsg->ctx_init_lock);
return ret;
}

if (skip_update) {
nvgpu_mutex_release(&tsg->ctx_init_lock);
return 0;
}

@@ -128,20 +134,16 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
nvgpu_gr_ctx_set_hwpm_pm_mode(g, gr_ctx);

if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
nvgpu_rwsem_down_read(&tsg->ch_list_lock);
nvgpu_list_for_each_entry(ch, &tsg->ch_list,
nvgpu_channel, ch_entry) {
nvgpu_gr_subctx_set_hwpm_ptr(g, ch->subctx,
pm_ctx_gpu_va);
}
nvgpu_rwsem_up_read(&tsg->ch_list_lock);
nvgpu_tsg_subctxs_set_pm_buffer_va(tsg, set_pm_ctx_gpu_va);
} else {
nvgpu_gr_ctx_set_hwpm_ptr(g, gr_ctx, pm_ctx_gpu_va);
nvgpu_gr_ctx_set_hwpm_ptr(g, gr_ctx, set_pm_ctx_gpu_va);
}

out:
g->ops.tsg.enable(tsg);

nvgpu_mutex_release(&tsg->ctx_init_lock);

return ret;
}


@@ -43,7 +43,6 @@ struct nvgpu_fence_type;
struct nvgpu_swprofiler;
struct nvgpu_channel_sync;
struct nvgpu_gpfifo_userdata;
struct nvgpu_gr_subctx;
struct nvgpu_gr_ctx;
struct nvgpu_debug_context;
struct priv_cmd_queue;
@@ -363,6 +362,12 @@ struct nvgpu_channel {
/** Channel's entry in TSG's channel list. */
struct nvgpu_list_node ch_entry;

/**
* Channel's entry in TSG Subcontext's (#nvgpu_tsg_subctx) channels list
* #ch_list.
*/
struct nvgpu_list_node subctx_entry;

#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
struct nvgpu_channel_joblist joblist;
struct gpfifo_desc gpfifo;
@@ -440,8 +445,8 @@ struct nvgpu_channel {
u64 virt_ctx;
#endif

/** Channel's graphics subcontext. */
struct nvgpu_gr_subctx *subctx;
/** Channel's subcontext. */
struct nvgpu_tsg_subctx *subctx;

/** Lock to access unserviceable state. */
struct nvgpu_spinlock unserviceable_lock;

@@ -42,6 +42,7 @@ struct gk20a;
struct vm_gk20a;
struct nvgpu_tsg;
struct nvgpu_gr_ctx;
struct nvgpu_channel;
struct nvgpu_gr_ctx_mappings;
struct nvgpu_gr_global_ctx_buffer_desc;
struct nvgpu_gr_global_ctx_local_golden_image;
@@ -470,30 +471,33 @@ void nvgpu_gr_ctx_init_ctx_buffers_mapping_flags(struct gk20a *g,
struct nvgpu_gr_ctx *ctx);

/**
* @brief Allocate or get GR ctx buffers mappings for a TSG.
* @brief Allocate or get GR ctx buffers mappings for a TSG/Subcontext.
*
* @param g [in] Pointer to GPU driver struct.
* @param tsg [in] Pointer to TSG struct.
* @param vm [in] Pointer to vm struct.
* @param ch [in] Pointer to Channel struct.
*
* This function allocates the mappings struct for TSG corresponding to
* given vm if not available already else returns the same.
* This function allocates the mappings struct for TSG/subcontext corresponding
* to given Channel's VM if not available already else returns the same.
*
* @return mappings struct in case of success, null in case of failure.
*/
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_alloc_or_get_mappings(struct gk20a *g,
struct nvgpu_tsg *tsg, struct vm_gk20a *vm);
struct nvgpu_tsg *tsg, struct nvgpu_channel *ch);

/**
* @brief Get GR ctx buffers mappings for a TSG.
* @brief Get GR ctx buffers mappings for a TSG or Subcontext corresponding to
* a channel.
*
* @param tsg [in] Pointer to TSG struct.
* @param ch [in] Pointer to Channel struct.
*
* This function returns the mappings struct for TSG.
*
* @return mappings struct.
*/
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg);
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_get_mappings(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch);

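The updated nvgpu_gr_ctx_alloc_or_get_mappings() documentation above describes a lookup-or-create keyed by the channel's VM: reuse the mappings object if one already exists for that VM, otherwise allocate and link a new one. A toy C sketch of that pattern follows; every name in it is invented for illustration and it is not the driver implementation.

#include <stddef.h>
#include <stdlib.h>

/* Toy lookup-or-create keyed by VM, mirroring the intent of the
 * doc comment above; all names invented. */
struct model_mappings {
	void *vm;
	struct model_mappings *next;
};

static struct model_mappings *model_alloc_or_get(struct model_mappings **head,
						 void *vm)
{
	struct model_mappings *m;

	for (m = *head; m != NULL; m = m->next) {
		if (m->vm == vm) {
			return m;        /* same VM: reuse existing mappings */
		}
	}
	m = calloc(1, sizeof(*m));
	if (m == NULL) {
		return NULL;
	}
	m->vm = vm;
	m->next = *head;
	*head = m;
	return m;
}

int main(void)
{
	struct model_mappings *head = NULL;
	int vm_a;

	/* Two channels sharing one VM resolve to the same mappings object. */
	return (model_alloc_or_get(&head, &vm_a) ==
		model_alloc_or_get(&head, &vm_a)) ? 0 : 1;
}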
/**
* @brief Free the gr ctx mapping struct.
@@ -564,8 +568,7 @@ bool nvgpu_gr_ctx_desc_force_preemption_cilp(

#ifdef CONFIG_NVGPU_GFXP
void nvgpu_gr_ctx_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_ctx_mappings *mappings);
struct nvgpu_gr_ctx *gr_ctx);

bool nvgpu_gr_ctx_desc_force_preemption_gfxp(
struct nvgpu_gr_ctx_desc *gr_ctx_desc);
@@ -608,10 +611,10 @@ int nvgpu_gr_ctx_set_smpc_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx,

int nvgpu_gr_ctx_prepare_hwpm_mode(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx,
u32 mode, u64 *pm_ctx_gpu_va, bool *skip_update);
u32 mode, bool *set_pm_ctx_gpu_va, bool *skip_update);
void nvgpu_gr_ctx_set_hwpm_pm_mode(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx);
void nvgpu_gr_ctx_set_hwpm_ptr(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx,
u64 pm_ctx_gpu_va);
bool set_pm_ctx_gpu_va);
void nvgpu_gr_ctx_set_pm_ctx_mapped(struct nvgpu_gr_ctx *ctx, bool mapped);

#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
@@ -625,4 +628,17 @@ bool nvgpu_gr_ctx_desc_dump_ctxsw_stats_on_channel_close(
struct nvgpu_gr_ctx_desc *gr_ctx_desc);
#endif

bool nvgpu_gr_obj_ctx_global_ctx_buffers_patched(struct nvgpu_gr_ctx *gr_ctx);
void nvgpu_gr_obj_ctx_set_global_ctx_buffers_patched(
struct nvgpu_gr_ctx *gr_ctx, bool patched);
bool nvgpu_gr_obj_ctx_preempt_buffers_patched(struct nvgpu_gr_ctx *gr_ctx);
void nvgpu_gr_obj_ctx_set_preempt_buffers_patched(
struct nvgpu_gr_ctx *gr_ctx, bool patched);
bool nvgpu_gr_obj_ctx_default_compute_regs_patched(struct nvgpu_gr_ctx *gr_ctx);
void nvgpu_gr_obj_ctx_set_default_compute_regs_patched(
struct nvgpu_gr_ctx *gr_ctx, bool patched);
bool nvgpu_gr_obj_ctx_default_gfx_regs_patched(struct nvgpu_gr_ctx *gr_ctx);
void nvgpu_gr_obj_ctx_set_default_gfx_regs_patched(
struct nvgpu_gr_ctx *gr_ctx, bool patched);

#endif /* NVGPU_GR_CTX_H */

@@ -27,6 +27,7 @@ struct gk20a;
struct nvgpu_tsg;
struct vm_gk20a;
struct nvgpu_gr_ctx;
struct nvgpu_gr_subctx;
struct nvgpu_gr_ctx_mappings;
struct nvgpu_gr_global_ctx_buffer_desc;

@@ -78,15 +79,18 @@ int nvgpu_gr_ctx_mappings_map_ctx_buffer(struct gk20a *g,
*
* @param g [in] Pointer to GPU driver struct.
* @param ctx [in] Pointer to GR context struct.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param mappings [in] Pointer to GR context buffer mappings struct.
*
* This function will map the GR context preemption buffers in #mappings->vm
* and stores the mapped address.
* and stores the mapped address. For subcontext case NVGPU_GR_CTX_PREEMPT_CTXSW
* buffer is mapped to all subcontexts.
*
* @return 0 in case of success, < 0 in case of failure.
*/
int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g,
struct nvgpu_gr_ctx *ctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings);

/**
@@ -94,6 +98,7 @@ int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g,
*
* @param g [in] Pointer to GPU driver struct.
* @param gr_ctx [in] Pointer to GR context struct.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param global_ctx_buffer [in] Pointer global context buffer desc.
* @param mappings [in] Pointer to GR context buffer
* mappings struct.
@@ -106,7 +111,7 @@ int nvgpu_gr_ctx_mappings_map_ctx_preemption_buffers(struct gk20a *g,
* @return 0 in case of success, < 0 in case of failure.
*/
int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_ctx_mappings *mappings,
bool vpr);
@@ -116,6 +121,7 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g,
*
* @param g [in] Pointer to GPU driver struct.
* @param gr_ctx [in] Pointer to GR context struct.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param global_ctx_buffer [in] Pointer global context buffer desc.
* @param mappings [in] Pointer to GR context buffer
* mappings struct.
@@ -124,6 +130,7 @@ int nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(struct gk20a *g,
*/
void nvgpu_gr_ctx_unmap_buffers(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_ctx_mappings *mappings);

@@ -157,4 +164,80 @@ u64 nvgpu_gr_ctx_mappings_get_global_ctx_va(struct nvgpu_gr_ctx_mappings *mappin
u64 nvgpu_gr_ctx_mappings_get_ctx_va(struct nvgpu_gr_ctx_mappings *mappings,
u32 index);

/**
* @brief Get GR ctx buffers mappings for a TSG corresponding to VM.
*
* @param g [in] Pointer to GPU driver struct.
* @param tsg [in] Pointer to TSG struct.
* @param vm [in] Pointer to vm struct.
*
* This function retrieves the mappings struct for TSG corresponding to
* given vm from #tsg->gr_ctx_mappings_list.
*
* @return mappings struct in case of success, null in case of failure.
*/
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_get_subctx_mappings(
struct gk20a *g,
struct nvgpu_tsg *tsg,
struct vm_gk20a *vm);

/**
* @brief Allocate GR ctx buffers mappings for a TSG corresponding to VM.
*
* @param g [in] Pointer to GPU driver struct.
* @param tsg [in] Pointer to TSG struct.
* @param vm [in] Pointer to vm struct.
*
* This function allocates the mappings struct for TSG corresponding to
* given vm and inserts in #tsg->gr_ctx_mappings_list.
*
* @return mappings struct in case of success, null in case of failure.
*/
struct nvgpu_gr_ctx_mappings *nvgpu_gr_ctx_mappings_create_subctx_mappings(
struct gk20a *g,
struct nvgpu_tsg *tsg,
struct vm_gk20a *vm);

/**
* @brief Link GR subctx to mappings struct.
*
* @param mappings [in] Pointer to GR context buffers mappings struct.
* @param subctx [in] Pointer to GR subcontext struct.
*
* This function checks and inserts the subctx in #mappings->subctx_list.
*/
void nvgpu_gr_ctx_mappings_add_gr_subctx(struct nvgpu_gr_ctx_mappings *mappings,
struct nvgpu_gr_subctx *subctx);

/**
* @brief Free GR context buffers mappings struct for subcontexts.
*
* @param subctx [in] Pointer to GR subcontext struct.
* @param mappings [in] Pointer to GR context buffers mappings struct.
* @param unmap [in] Indicates if the GR context buffers are to be
* unmapped. true in case of native nvgpu config,
* false in case of vgpu config. For vgpu case,
* this path is used to handle the VM references
* per subcontext.
*
* This function checks if the #mappings->subctx_list is empty and if empty,
* unmaps the buffers and deletes the mappings.
*/
void nvgpu_gr_ctx_mappings_free_subctx_mappings(struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings, bool unmap);

/**
* @brief Map GR context buffer to all subcontext VMs.
*
* @param tsg [in] Pointer to tsg struct.
* @param index [in] Index of the buffer to be mapped.
*
* This function maps the GR context buffer at #index to all VMs listed
* in #tsg->gr_ctx_mappings_list.
*
* @return 0 in case of success, < 0 in case of failure.
*/
int nvgpu_gr_ctx_mappings_map_buffer_all_subctx(
struct nvgpu_tsg *tsg, u32 index);

#endif /* NVGPU_GR_CTX_MAPPINGS_H */

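The nvgpu_gr_ctx_mappings_map_buffer_all_subctx() documentation above describes mapping one GR ctx buffer into every per-VM mappings entry held by the TSG, which is how the PREEMPT_CTXSW and PM buffers reach all subcontexts. A small illustrative C model of that iteration follows, using an invented array in place of tsg->gr_ctx_mappings_list; it is a sketch of the idea only.

#include <stdio.h>

/* Toy per-VM mappings entry; the real driver keeps these on
 * tsg->gr_ctx_mappings_list and maps into each VM. */
struct model_mapping {
	int vm_id;
	unsigned long long va[4];   /* per-buffer GPU VA, 0 == unmapped */
};

/* Map buffer 'index' into every mappings entry, skipping entries where
 * it is already mapped. */
static int model_map_all(struct model_mapping *list, int n, int index)
{
	int i;

	for (i = 0; i < n; i++) {
		if (list[i].va[index] == 0ULL) {
			list[i].va[index] =
				0x1000ULL * (unsigned long long)(i + 1);
		}
	}
	return 0;
}

int main(void)
{
	struct model_mapping list[2] = { { 0, {0} }, { 1, {0} } };

	model_map_all(list, 2, 1);
	printf("vm1 buffer1 va: 0x%llx\n", list[1].va[1]);
	return 0;
}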
@@ -130,9 +130,7 @@ struct nvgpu_gr_hwpm_map *nvgpu_gr_get_hwpm_map_ptr(struct gk20a *g);
void nvgpu_gr_reset_falcon_ptr(struct gk20a *g);
void nvgpu_gr_reset_golden_image_ptr(struct gk20a *g);
#endif
#ifdef CONFIG_NVGPU_FECS_TRACE
struct nvgpu_gr_global_ctx_buffer_desc *nvgpu_gr_get_global_ctx_buffer_ptr(
struct gk20a *g);
#endif

#endif /* NVGPU_GR_UTILS_H */

@@ -34,7 +34,7 @@
struct gk20a;
struct nvgpu_gr_ctx;
struct nvgpu_gr_ctx_mappings;
struct nvgpu_gr_subctx;
struct nvgpu_tsg_subctx;
struct nvgpu_gr_config;
struct nvgpu_gr_ctx_desc;
struct vm_gk20a;
@@ -70,7 +70,7 @@ void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g,
* @param g [in] Pointer to GPU driver struct.
* @param inst_block [in] Pointer to channel instance block.
* @param gr_ctx [in] Pointer to graphics context buffer.
* @param subctx [in] Pointer to graphics subcontext buffer.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param mappings [in] Pointer to mappings of the GR context buffers.
*
* If graphics subcontexts are supported, subcontext buffer GPU virtual
@@ -82,9 +82,23 @@ void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g,
* instance block.
*/
void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings);

/**
* brief Check if the VEID is sync when subcontexts are enabled.
*
* @param g [in] Pointer to GPU driver struct.
* @param subctx [in] Pointer to TSG subcontext struct.
*
* @retval true if subcontexts are enabled, MIG is disabled and \a subctx
* corresponds to VEID0.
* @retval true if subcontexts are disabled and MIG is disabled.
* @retval false otherwise.
*/
bool nvgpu_gr_obj_ctx_is_gfx_engine(struct gk20a *g,
struct nvgpu_tsg_subctx *subctx);

/**
* brief Initialize preemption mode in context struct.
*
@@ -120,7 +134,7 @@ int nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(struct gk20a *g,
* @param g [in] Pointer to GPU driver struct.
* @param config [in] Pointer to GR configuration struct.
* @param gr_ctx [in] Pointer to graphics context.
* @param subctx [in] Pointer to graphics subcontext buffer.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param mappings [in] Pointer to mappings of GR context buffers.
*
* This function will read preemption modes stored in #nvgpu_gr_ctx
@@ -134,7 +148,7 @@ int nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(struct gk20a *g,
*/
void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings);

/**
@@ -144,6 +158,7 @@ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
* @param global_ctx_buffer [in] Pointer to global context descriptor struct.
* @param config [in] Pointer to GR configuration struct.
* @param gr_ctx [in] Pointer to graphics context.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param mappings [in] Pointer to mappings of GR context buffers.
* @param patch [in] Boolean flag to use patch context buffer.
*
@@ -156,7 +171,8 @@ void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_ctx_mappings *mappings, bool patch);
struct nvgpu_tsg_subctx *subctx, struct nvgpu_gr_ctx_mappings *mappings,
bool patch);

/**
* @brief Allocate and setup object context s/w image for VEID0 GPU channel.
@@ -191,6 +207,7 @@ int nvgpu_gr_obj_ctx_init_golden_context_image(struct gk20a *g);
* @param global_ctx_buffer [in] Pointer to global context descriptor struct.
* @param config [in] Pointer to GR configuration struct.
* @param gr_ctx [in] Pointer to graphics context.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param inst_block [in] Pointer to channel instance block.
*
* This function allocates golden context image.
@@ -222,6 +239,7 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings,
struct nvgpu_mem *inst_block);

@@ -234,7 +252,7 @@ int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g,
* @param gr_ctx_desc [in] Pointer to GR context descriptor struct.
* @param config [in] Pointer to GR configuration struct.
* @param gr_ctx [in] Pointer to graphics context.
* @param subctx [in] Pointer to graphics subcontext buffer.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param mappings [in] Pointer to mappings of the GR context buffers.
* @param inst_block [in] Pointer to channel instance block.
* @param class_num [in] GR engine class.
@@ -274,7 +292,7 @@ int nvgpu_gr_obj_ctx_alloc(struct gk20a *g,
struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_subctx *subctx,
struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx_mappings *mappings,
struct nvgpu_mem *inst_block,
u32 class_num, u32 flags,

@@ -106,6 +106,20 @@ void nvgpu_gr_setup_free_gr_ctx(struct gk20a *g,
*/
void nvgpu_gr_setup_free_subctx(struct nvgpu_channel *c);

/**
* @brief Validate preemption mode in GR engine context image in case
* Application optionally wants to change default preemption mode.
*
* @param graphics_preempt_mode [in] Requested graphics preemption mode.
* @param compute_preempt_mode [in] Requested compute preemption mode.
* @param gr_ctx [in] Pointer to GR engine context image.
*
* @return true in case of success, false in case of failure.
*/
bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode,
u32 *compute_preempt_mode,
struct nvgpu_gr_ctx *gr_ctx);

/**
* @brief Setup preemption mode in GR engine context image in case
* Application optionally wants to change default preemption mode.

@@ -32,41 +32,62 @@
*/
struct gk20a;
struct vm_gk20a;
struct nvgpu_gr_ctx;
struct nvgpu_gr_subctx;
struct nvgpu_mem;
struct nvgpu_gr_ctx_mappings;

/**
* @brief Allocate graphics subcontext buffer.
*
* @param g [in] Pointer to GPU driver struct.
* @param vm [in] Pointer to virtual memory.
*
* This function allocates memory for #nvgpu_gr_subctx structure
* and subcontext header stored in #nvgpu_gr_subctx structure.
*
* Subcontext header memory will be mapped to given virtual
* memory.
*
* @return pointer to #nvgpu_gr_subctx struct in case of success,
* NULL in case of failure.
*/
struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g,
struct vm_gk20a *vm);

/**
* @brief Free graphics subcontext buffer.
* @brief Allocate and map graphics subcontext context header buffer.
*
* @param g [in] Pointer to GPU driver struct.
* @param subctx [in] Pointer to graphics subcontext struct.
* @param vm [in] Pointer to virtual memory.
*
* This function allocates memory for subcontext header stored in
* #nvgpu_gr_subctx structure.
*
* Subcontext header memory will be mapped to given virtual
* memory.
*
* @return 0 in case of success, < 0 in case of failure.
*/
int nvgpu_gr_subctx_setup_header(struct gk20a *g,
struct nvgpu_gr_subctx *subctx,
struct vm_gk20a *vm);

/**
* @brief Allocate graphics subcontext buffer.
*
* @param g [in] Pointer to GPU driver struct.
*
* This function allocates memory for #nvgpu_gr_subctx structure.
*
* @return pointer to #nvgpu_gr_subctx struct in case of success,
* NULL in case of failure.
*/
struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g);

/**
* @brief Free graphics subcontext buffer.
*
* @param g [in] Pointer to GPU driver struct.
* @param subctx [in] Pointer to TSG subcontext struct.
* @param vm [in] Pointer to virtual memory.
* @param unmap [in] Indicates if GR context buffers and subctx
* buffer are to be unmapped.
* true in case of native nvgpu config and
* false in case of vgpu config. For vgpu case,
* this path is used to handle the VM references
* per subcontext.
*
* This function will free memory allocated for subcontext header and
* #nvgpu_gr_subctx structure.
*/
void nvgpu_gr_subctx_free(struct gk20a *g,
struct nvgpu_gr_subctx *subctx,
struct vm_gk20a *vm);
struct nvgpu_tsg_subctx *subctx,
struct vm_gk20a *vm,
bool unmap);

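The split of allocation into nvgpu_gr_subctx_alloc() and nvgpu_gr_subctx_setup_header() documented above makes header setup idempotent: the native driver maps the subcontext header once, and repeated ALLOC_OBJ_CTX calls become no-ops. A toy C sketch of that lazy, guard-on-gpu_va setup follows; names and the fake allocation are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Toy header object; stands in for the nvgpu_mem ctx_header. */
struct model_subctx {
	uint64_t ctx_header_gpu_va;
};

/* Idempotent setup in the spirit of nvgpu_gr_subctx_setup_header():
 * if the header is already mapped (gpu_va != 0) there is nothing to do,
 * otherwise allocate and "map" it exactly once. */
static int model_setup_header(struct model_subctx *s)
{
	if (s->ctx_header_gpu_va != 0ULL) {
		return 0;
	}
	s->ctx_header_gpu_va = 0x2000ULL;  /* pretend allocation + map */
	return 0;
}

int main(void)
{
	struct model_subctx s = { 0ULL };

	model_setup_header(&s);
	model_setup_header(&s);  /* second call is a no-op */
	printf("header va: 0x%llx\n",
	       (unsigned long long)s.ctx_header_gpu_va);
	return 0;
}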
/**
|
||||
* @brief Initialize graphics subcontext buffer header.
|
||||
@@ -101,6 +122,19 @@ void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g,
|
||||
*/
|
||||
struct nvgpu_mem *nvgpu_gr_subctx_get_ctx_header(struct nvgpu_gr_subctx *subctx);
|
||||
|
||||
/**
|
||||
* @brief Get pointer of GR context buffers mappings struct for a subcontext.
|
||||
*
|
||||
* @param subctx [in] Pointer to graphics subcontext struct.
|
||||
*
|
||||
* This function returns #nvgpu_gr_ctx_mappings pointer of GR context buffers
|
||||
* mappings stored in #nvgpu_gr_subctx.
|
||||
*
|
||||
* @return pointer to subcontext GR context buffers mappings struct.
|
||||
*/
|
||||
struct nvgpu_gr_ctx_mappings *nvgpu_gr_subctx_get_mappings(
|
||||
struct nvgpu_gr_subctx *subctx);
|
||||
|
||||
#ifdef CONFIG_NVGPU_GRAPHICS
|
||||
void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx,
|
||||
struct nvgpu_gr_ctx *gr_ctx);
|
||||
@@ -108,10 +142,14 @@ void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx
|
||||
void nvgpu_gr_subctx_set_preemption_buffer_va(struct gk20a *g,
|
||||
struct nvgpu_gr_subctx *subctx,
|
||||
struct nvgpu_gr_ctx_mappings *mappings);
|
||||
|
||||
void nvgpu_gr_subctx_clear_preemption_buffer_va(struct gk20a *g,
|
||||
struct nvgpu_gr_subctx *subctx);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NVGPU_DEBUGGER
|
||||
void nvgpu_gr_subctx_set_hwpm_ptr(struct gk20a *g,
|
||||
struct nvgpu_gr_subctx *subctx, u64 pm_ctx_gpu_va);
|
||||
struct nvgpu_gr_subctx *subctx,
|
||||
bool set_pm_ctx_gpu_va);
|
||||
#endif
|
||||
#endif /* NVGPU_GR_SUBCTX_H */
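The header above now splits subcontext creation into a plain allocation step and a separate header setup step, and frees it through the owning TSG subcontext with an explicit unmap decision. The following is a minimal sketch of the native-driver path using only the prototypes declared above; the helper name and the include line are assumptions for illustration, and in the driver this sequence is normally driven through nvgpu_tsg_subctx_alloc_gr_subctx() and nvgpu_tsg_subctx_setup_subctx_header() rather than called directly.

#include <nvgpu/gr/subctx.h>

/* Illustrative helper, not part of the patch. */
static struct nvgpu_gr_subctx *example_setup_gr_subctx(struct gk20a *g,
		struct vm_gk20a *vm)
{
	struct nvgpu_gr_subctx *subctx;

	/* Step 1: allocate the nvgpu_gr_subctx struct only. */
	subctx = nvgpu_gr_subctx_alloc(g);
	if (subctx == NULL) {
		return NULL;
	}

	/* Step 2: allocate the subctx header and map it into this VM. */
	if (nvgpu_gr_subctx_setup_header(g, subctx, vm) != 0) {
		/*
		 * Cleanup runs through nvgpu_gr_subctx_free() on the owning
		 * nvgpu_tsg_subctx, with unmap = true on the native driver and
		 * unmap = false on vgpu; omitted here.
		 */
		return NULL;
	}

	return subctx;
}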

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -28,8 +28,8 @@
struct gk20a;
struct nvgpu_gr_config;
struct nvgpu_gr_ctx;
struct nvgpu_gr_subctx;
struct nvgpu_gr_zcull;
struct nvgpu_tsg_subctx;

struct nvgpu_gr_zcull_info {
u32 width_align_pixels;
@@ -54,7 +54,7 @@ int nvgpu_gr_zcull_init_hw(struct gk20a *g,
struct nvgpu_gr_zcull *gr_zcull,
struct nvgpu_gr_config *gr_config);

int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx,
int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_tsg_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx);

#endif /* NVGPU_GR_ZCULL_H */

@@ -78,8 +78,6 @@ struct nvgpu_tsg {
/** Pointer to GPU driver struct. */
struct gk20a *g;

/** Points to TSG's virtual memory */
struct vm_gk20a *vm;
/**
* Starting with Volta, when a Channel/TSG is set up, a recovery buffer
* region must be allocated in BAR2, to allow engine to save methods if
@@ -98,6 +96,12 @@ struct nvgpu_tsg {
*/
struct nvgpu_gr_ctx *gr_ctx;

/**
* List of gr_ctx buffers maps (#nvgpu_gr_ctx_mappings) for gr ctx
* for this TSG. Accessed by holding #ctx_init_lock from TSG.
*/
struct nvgpu_list_node gr_ctx_mappings_list;

/**
* Mutex to prevent concurrent context initialization for channels
* in same TSG. All channels in one TSG share the context buffer,
@@ -113,6 +117,12 @@ struct nvgpu_tsg {
*/
struct nvgpu_ref refcount;

/**
* List of subcontexts (#nvgpu_tsg_subctx) bound to this TSG.
* Accessed by holding #ch_list_lock from TSG.
*/
struct nvgpu_list_node subctx_list;

/** List of channels bound to this TSG. */
struct nvgpu_list_node ch_list;
#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
@@ -128,7 +138,7 @@ struct nvgpu_tsg {
#endif
/**
* Read write type of semaphore lock used for accessing/modifying
* #ch_list.
* #ch_list, #subctx_list and #ch_list in #nvgpu_tsg_subctx.
*/
struct nvgpu_rwsem ch_list_lock;
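The per-subcontext bookkeeping that #subctx_list and #ch_list_lock refer to lives in the private header common/fifo/tsg_subctx_priv.h, which this listing does not include. A hypothetical shape, consistent with the documentation above and shown only for orientation:

/*
 * Hypothetical layout (illustrative only; the real definition is in
 * common/fifo/tsg_subctx_priv.h, which is not part of this excerpt).
 */
struct nvgpu_tsg_subctx {
	/* Subcontext id (VEID); VEID0 carries the global GR buffers. */
	u32 subctx_id;

	/* VM shared by every channel bound to this subcontext. */
	struct vm_gk20a *vm;

	/* GR subcontext (ctx header mapping) for this VM, if allocated. */
	struct nvgpu_gr_subctx *gr_subctx;

	/* Channels bound to this subcontext; guarded by tsg->ch_list_lock. */
	struct nvgpu_list_node ch_list;

	/* Entry in tsg->subctx_list; guarded by tsg->ch_list_lock. */
	struct nvgpu_list_node tsg_entry;
};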

@@ -272,8 +282,6 @@ struct nvgpu_tsg *nvgpu_tsg_open(struct gk20a *g, pid_t pid);
* - Call non-NULL HAL to release tsg. This HAL is non-NULL for vgpu only.
* - Call nvgpu_free_gr_ctx_struct to free #nvgpu_tsg.gr_ctx.
* - Set #nvgpu_tsg.gr_ctx to NULL.
* - If #nvgpu_tsg.vm is non-NULL, do #nvgpu_vm_put for this vm and set
* it to NULL (Unhook TSG from VM).
* - If #nvgpu_tsg.sm_error_states is non-NULL, free allocated memory and set
* it to NULL.
*/
@@ -286,7 +294,7 @@ void nvgpu_tsg_release_common(struct gk20a *g, struct nvgpu_tsg *tsg);
*
* - Get pointer to the #nvgpu_tsg using #ref.
* - Call HAL to free #nvgpu_tsg.gr_ctx if this memory pointer is non-NULL
* and valid and also #nvgpu_tsg.vm is non-NULL.
* and valid.
* - Unhook all events created on the TSG being released.
* -- Acquire #nvgpu_tsg.event_id_list_lock.
* -- While #nvgpu_tsg.event_id_list is non-empty,
@@ -363,6 +371,7 @@ void nvgpu_tsg_disable(struct nvgpu_tsg *tsg);
* - If channel had ASYNC subctx id, then set runqueue selector to 1.
* - Set runlist id of TSG to channel's runlist_id if runlist_id of TSG
* is set to #NVGPU_INVALID_TSG_ID.
* - Bind channel to TSG subcontext calling #nvgpu_tsg_subctx_bind_channel.
* - Call HAL to bind channel to TSG.
* - Add channel to TSG's list of channels. See #nvgpu_tsg.ch_list
* - Set #nvgpu_channel.tsgid to #nvgpu_tsg.tsgid.
@@ -445,6 +454,7 @@ struct nvgpu_tsg *nvgpu_tsg_check_and_get_from_id(struct gk20a *g, u32 tsgid);
* - If NEXT bit is set and force is set to false, caller will
* have to retry unbind.
* - Remove channel from its runlist.
* - Remove channel from subctx by calling #nvgpu_tsg_subctx_unbind_channel.
* - Remove channel from TSG's channel list.
* - Set tsgid of the channel to #NVGPU_INVALID_TSG_ID.
* - Disable channel so that it is not picked up by h/w scheduler.
@@ -456,6 +466,7 @@ struct nvgpu_tsg *nvgpu_tsg_check_and_get_from_id(struct gk20a *g, u32 tsgid);
* - Call #nvgpu_channel_update_runlist to remove the channel from the runlist.
* - Acquire #nvgpu_tsg.ch_list_lock of the tsg and delete channel from
* #nvgpu_tsg.ch_list.
* - Remove channel from subctx by calling #nvgpu_tsg_subctx_unbind_channel.
* - Remove channel from TSG's channel list.
* - Set #nvgpu_channel.tsgid to #NVGPU_INVALID_TSG_ID
* - Release #nvgpu_tsg.ch_list_lock of the tsg.

183 drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h (new file)
@@ -0,0 +1,183 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_TSG_SUBCTX_H
#define NVGPU_TSG_SUBCTX_H

#include <nvgpu/types.h>

struct gk20a;
struct nvgpu_tsg;
struct nvgpu_tsg_subctx;
struct nvgpu_channel;
/**
* @brief Bind a channel to the TSG subcontext.
*
* @param tsg [in] Pointer to TSG struct.
* @param ch [in] Pointer to Channel struct.
*
* - Loop through the #subctx_list in #tsg to check if the subctx
* exists for the provided channel.
* - If it exists, validate the channel VM against the subctx VM.
* - If validated, add the channel to the subctx #ch_list and exit.
* - Else allocate and initialize a new subctx structure.
* - Add the channel to the subctx #ch_list and add the subctx to the
* TSG #subctx_list.
*
* @return 0 for successful bind or if subctx support is disabled,
* < 0 for failure.
* @retval -EINVAL if the channel VM does not match the subctx VM for the
* provided subctx_id.
* @retval -ENOMEM if subctx allocation fails.
*/
int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch);

/**
* @brief Unbind a channel from the TSG subcontext.
*
* @param tsg [in] Pointer to TSG struct.
* @param ch [in] Pointer to Channel struct.
*
* - Validate that #subctx is allocated for the channel #ch.
* - Remove the channel from the subctx #ch_list.
* - If the subctx #ch_list is empty
* - Invoke g->ops.gr.setup.free_subctx to free the GR subcontext
* struct (and GR subcontext mappings struct).
* - Remove the subctx from the TSG #subctx_list.
* - Free the subctx memory. If this was the only active channel
* in the TSG, this function deletes the objects in the
* sequence: mappings -> gr_subctx -> tsg_subctx.
*/
void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch);

/**
* @brief Allocate GR subcontext for a TSG subcontext.
*
* @param g [in] Pointer to gk20a struct.
* @param ch [in] Pointer to Channel struct.
*
* - Check if TSG subctx is allocated for the channel.
* - If not allocated, return error.
* - If allocated, and if GR subcontext is not allocated, call
* #nvgpu_gr_subctx_alloc.
*
* @return 0 for successful allocation, < 0 for failure.
*/
int nvgpu_tsg_subctx_alloc_gr_subctx(struct gk20a *g, struct nvgpu_channel *ch);

/**
* @brief Allocate and map GR subcontext header for a TSG subcontext.
*
* @param g [in] Pointer to gk20a struct.
* @param ch [in] Pointer to Channel struct.
*
* - Check if the TSG and GR subctx are allocated for the channel.
* - If not allocated, return error.
* - If allocated, set up the subcontext header by calling
* #nvgpu_gr_subctx_setup_header.
*
* @return 0 for successful allocation, < 0 for failure.
*/
int nvgpu_tsg_subctx_setup_subctx_header(struct gk20a *g,
struct nvgpu_channel *ch);

/**
* @brief Get GR subcontext for a TSG subcontext.
*
* @param tsg_subctx [in] Pointer to TSG Subcontext struct.
*
* - Return #gr_subctx from #nvgpu_tsg_subctx.
*/
struct nvgpu_gr_subctx *nvgpu_tsg_subctx_get_gr_subctx(
struct nvgpu_tsg_subctx *tsg_subctx);

/**
* @brief Get id of a TSG subcontext.
*
* @param tsg_subctx [in] Pointer to TSG Subcontext struct.
*
* - Return #subctx_id from #nvgpu_tsg_subctx.
*/
u32 nvgpu_tsg_subctx_get_id(struct nvgpu_tsg_subctx *tsg_subctx);

/**
* @brief Allocate or get the mappings struct for the TSG subcontext.
*
* @param g [in] Pointer to GPU driver struct.
* @param tsg [in] Pointer to TSG struct.
* @param ch [in] Pointer to Channel struct.
*
* This function allocates the mappings struct for the subcontext
* corresponding to the given channel's VM if it is not already available;
* otherwise it returns the existing one. It adds the gr_subctx corresponding
* to the channel to the mapping object's subctx_list.
*
* @return mappings struct in case of success, NULL in case of failure.
*/
struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_alloc_or_get_mappings(
struct gk20a *g,
struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch);

#ifdef CONFIG_NVGPU_GFXP
/**
* @brief Program preemption buffer virtual addresses for all subcontexts.
*
* @param tsg_subctx [in] Pointer to TSG subcontext struct.
*
* - Checks if VEID0 mappings are available.
* - If available, program the preemption buffer virtual addresses
* (VEID0 VA and VA in subcontext VM) for all GR subcontexts'
* headers.
*/
void nvgpu_tsg_subctxs_set_preemption_buffer_va(
struct nvgpu_tsg_subctx *tsg_subctx);

/**
* @brief Clear preemption buffer virtual addresses for all subcontexts.
*
* @param tsg_subctx [in] Pointer to TSG subcontext struct.
*
* - Program the preemption buffer virtual addresses
* (VEID0 VA and VA in subcontext VM) for all GR subcontexts'
* headers to 0.
*/
void nvgpu_tsg_subctxs_clear_preemption_buffer_va(
struct nvgpu_tsg_subctx *tsg_subctx);
#endif /* CONFIG_NVGPU_GFXP */

#ifdef CONFIG_NVGPU_DEBUGGER
/**
* @brief Program PM buffer virtual addresses for all subcontexts.
*
* @param tsg [in] Pointer to TSG struct.
* @param set_pm_ctx_gpu_va [in] Indicates if PM ctx buffer GPU VA
* is to be programmed.
*
* - Program the PM buffer virtual address for all GR subcontexts' headers.
*/
void nvgpu_tsg_subctxs_set_pm_buffer_va(struct nvgpu_tsg *tsg,
bool set_pm_ctx_gpu_va);
#endif /* CONFIG_NVGPU_DEBUGGER */

#endif /* NVGPU_TSG_SUBCTX_H */
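Taken together, these entry points give the per-channel setup flow that the updated unit tests below also follow. A rough usage sketch, assuming the channel's VM has already been set and the TSG was opened with nvgpu_tsg_open(); the helper name is made up for illustration, the usual errno definitions are assumed to be in scope, and error cleanup is reduced to the unbind call:

#include <nvgpu/tsg_subctx.h>

/* Illustrative helper, not part of the patch. */
static int example_bind_and_map(struct gk20a *g, struct nvgpu_tsg *tsg,
		struct nvgpu_channel *ch)
{
	struct nvgpu_gr_ctx_mappings *mappings;
	int err;

	/*
	 * Creates (or reuses) the nvgpu_tsg_subctx matching the channel's
	 * subcontext id and validates that the channel VM matches the
	 * subcontext VM.
	 */
	err = nvgpu_tsg_subctx_bind_channel(tsg, ch);
	if (err != 0) {
		return err;
	}

	/* Allocate the GR subcontext and its mapped header for this VM. */
	err = nvgpu_tsg_subctx_alloc_gr_subctx(g, ch);
	if (err != 0) {
		goto unbind;
	}
	err = nvgpu_tsg_subctx_setup_subctx_header(g, ch);
	if (err != 0) {
		goto unbind;
	}

	/* Per-VM gr_ctx buffer mappings, shared by channels in this subctx. */
	mappings = nvgpu_tsg_subctx_alloc_or_get_mappings(g, tsg, ch);
	if (mappings == NULL) {
		err = -ENOMEM;
		goto unbind;
	}
	return 0;

unbind:
	/* Drops the subctx objects if this was the last bound channel. */
	nvgpu_tsg_subctx_unbind_channel(tsg, ch);
	return err;
}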

@@ -709,6 +709,10 @@ nvgpu_tsg_store_sm_error_state
nvgpu_tsg_get_sm_error_state
nvgpu_tsg_abort
nvgpu_tsg_bind_channel
nvgpu_tsg_subctx_bind_channel
nvgpu_tsg_subctx_unbind_channel
nvgpu_tsg_subctx_alloc_gr_subctx
nvgpu_tsg_subctx_setup_subctx_header
nvgpu_tsg_check_and_get_from_id
nvgpu_tsg_cleanup_sw
nvgpu_tsg_default_timeslice_us

@@ -654,7 +654,6 @@ int test_tsg_release(struct unit_module *m,
struct nvgpu_fifo *f = &g->fifo;
struct gpu_ops gops = g->ops;
struct nvgpu_tsg *tsg = NULL;
struct vm_gk20a vm;
u32 branches = 0U;
int ret = UNIT_FAIL;
u32 free_gr_ctx_mask =
@@ -706,12 +705,6 @@ int test_tsg_release(struct unit_module *m,
if (branches & F_TSG_RELEASE_MEM) {
ret = nvgpu_gr_ctx_alloc_ctx_buffers(g, gr_ctx_desc, tsg->gr_ctx);
unit_assert(ret == UNIT_SUCCESS, goto done);
tsg->vm = &vm;
/* prevent nvgpu_vm_remove */
nvgpu_ref_init(&vm.ref);
nvgpu_ref_get(&vm.ref);
} else {
tsg->vm = NULL;
}

if ((branches & free_gr_ctx_mask) == free_gr_ctx_mask) {
@@ -755,7 +748,6 @@ int test_tsg_release(struct unit_module *m,

unit_assert(!f->tsg[tsg->tsgid].in_use, goto done);
unit_assert(tsg->gr_ctx == NULL, goto done);
unit_assert(tsg->vm == NULL, goto done);
unit_assert(tsg->sm_error_states == NULL, goto done);
}
ret = UNIT_SUCCESS;

@@ -28,7 +28,8 @@ NVGPU_UNIT_NAME = nvgpu-gr-ctx
NVGPU_UNIT_SRCS = nvgpu-gr-ctx.c

NVGPU_UNIT_INTERFACE_DIRS := \
$(NV_COMPONENT_DIR)/..
$(NV_COMPONENT_DIR)/.. \
$(NV_COMPONENT_DIR)/../../fifo

include $(NV_COMPONENT_DIR)/../../Makefile.units.common.tmk

@@ -42,6 +42,8 @@
#include "../nvgpu-gr.h"
#include "nvgpu-gr-ctx.h"

#include "../../fifo/nvgpu-fifo-common.h"

#define DUMMY_SIZE 0xF0U

static u64 nvgpu_gmmu_map_locked_stub(struct vm_gk20a *vm,
@@ -92,14 +94,24 @@ int test_gr_ctx_error_injection(struct unit_module *m,
u64 low_hole = SZ_4K * 16UL;
struct nvgpu_channel *channel = (struct nvgpu_channel *)
malloc(sizeof(struct nvgpu_channel));
struct nvgpu_tsg *tsg = (struct nvgpu_tsg *)
malloc(sizeof(struct nvgpu_tsg));
struct nvgpu_tsg *tsg;
u32 i;

if (channel == NULL || tsg == NULL) {
if (channel == NULL) {
unit_return_fail(m, "failed to allocate channel/tsg");
}

err = test_fifo_init_support(m, g, NULL);
if (err != 0) {
unit_return_fail(m, "failed to init fifo support\n");
return err;
}

tsg = nvgpu_tsg_open(g, 0);
if (!tsg) {
unit_return_fail(m, "failed to allocate tsg");
}

desc = nvgpu_gr_ctx_desc_alloc(g);
if (!desc) {
unit_return_fail(m, "failed to allocate memory");
@@ -147,7 +159,7 @@ int test_gr_ctx_error_injection(struct unit_module *m,

tsg->gr_ctx = gr_ctx;

mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, vm);
mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, channel);
if (mappings == NULL) {
unit_return_fail(m, "failed to allocate gr_ctx mappings");
}
@@ -179,7 +191,7 @@ int test_gr_ctx_error_injection(struct unit_module *m,
/* Inject kmem alloc failures to trigger mapping failures */
for (i = 0; i < 2; i++) {
nvgpu_posix_enable_fault_injection(kmem_fi, true, 2 * i);
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx,
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL,
global_desc, mappings, false);
if (err == 0) {
unit_return_fail(m, "unexpected success");
@@ -188,8 +200,8 @@ int test_gr_ctx_error_injection(struct unit_module *m,
}

/* global ctx_desc size is not set. */
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, global_desc,
mappings, false);
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL,
global_desc, mappings, false);
if (err == 0) {
unit_return_fail(m, "unexpected success");
}
@@ -211,8 +223,8 @@ int test_gr_ctx_error_injection(struct unit_module *m,
/* Fail global ctx buffer mappings */
for (i = 0; i < 4; i++) {
nvgpu_posix_enable_fault_injection(kmem_fi, true, 4 + (2 * i));
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, global_desc,
mappings, false);
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL,
global_desc, mappings, false);
if (err == 0) {
unit_return_fail(m, "unexpected success");
}
@@ -221,8 +233,8 @@ int test_gr_ctx_error_injection(struct unit_module *m,

/* Successful mapping */
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, global_desc,
mappings, false);
err = nvgpu_gr_ctx_mappings_map_gr_ctx_buffers(g, gr_ctx, NULL,
global_desc, mappings, false);
if (err != 0) {
unit_return_fail(m, "failed to map global buffers");
}
@@ -253,6 +265,12 @@ int test_gr_ctx_error_injection(struct unit_module *m,
nvgpu_gr_ctx_desc_free(g, desc);
nvgpu_vm_put(g->mm.bar1.vm);

err = test_fifo_remove_support(m, g, NULL);
if (err != 0) {
unit_return_fail(m, "failed to remove fifo support\n");
return err;
}

return UNIT_SUCCESS;
}

@@ -37,7 +37,9 @@
#include <nvgpu/runlist.h>
#include <nvgpu/tsg.h>
#include <nvgpu/class.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/gr_intr.h>
#include <nvgpu/tsg_subctx.h>

#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>

@@ -264,12 +266,45 @@ static int gr_test_intr_cache_current_ctx(struct gk20a *g,
return g->ops.gr.intr.stall_isr(g);
}

static u64 nvgpu_gmmu_map_locked_stub(struct vm_gk20a *vm,
u64 vaddr,
struct nvgpu_sgt *sgt,
u64 buffer_offset,
u64 size,
u32 pgsz_idx,
u8 kind_v,
u32 ctag_offset,
u32 flags,
enum gk20a_mem_rw_flag rw_flag,
bool clear_ctags,
bool sparse,
bool priv,
struct vm_gk20a_mapping_batch *batch,
enum nvgpu_aperture aperture)
{
return 1;
}

static void nvgpu_gmmu_unmap_locked_stub(struct vm_gk20a *vm,
u64 vaddr,
u64 size,
u32 pgsz_idx,
bool va_allocated,
enum gk20a_mem_rw_flag rw_flag,
bool sparse,
struct vm_gk20a_mapping_batch *batch)
{
return;
}

static int gr_test_intr_allocate_ch_tsg(struct unit_module *m,
struct gk20a *g)
{
u32 tsgid = getpid();
struct nvgpu_gr_ctx_mappings *mappings = NULL;
struct nvgpu_channel *ch = NULL;
struct nvgpu_tsg *tsg = NULL;
struct vm_gk20a *vm = NULL;
bool sema_init, notify_init;
int err;

@@ -295,12 +330,46 @@ static int gr_test_intr_allocate_ch_tsg(struct unit_module *m,
goto ch_cleanup;
}

/* Setup VM */
vm = nvgpu_vm_init(g, SZ_4K, SZ_4K << 10,
nvgpu_safe_sub_u64(1ULL << 37, SZ_4K << 10),
(1ULL << 32), 0ULL,
false, false, false, "dummy");
if (!vm) {
unit_err(m, "failed to allocate VM");
goto ch_cleanup;
}

ch->g = g;
ch->vm = vm;

err = nvgpu_tsg_bind_channel(tsg, ch);
if (err != 0) {
unit_err(m, "failed tsg channel bind\n");
goto ch_cleanup;
}

g->ops.mm.gmmu.map = nvgpu_gmmu_map_locked_stub;
g->ops.mm.gmmu.unmap = nvgpu_gmmu_unmap_locked_stub;

err = nvgpu_tsg_subctx_alloc_gr_subctx(g, ch);
if (err != 0) {
unit_err(m, "failed to alloc gr subctx");
goto ch_cleanup;
}

err = nvgpu_tsg_subctx_setup_subctx_header(g, ch);
if (err != 0) {
unit_err(m, "failed to setup subctx header");
goto ch_cleanup;
}

mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, ch);
if (mappings == NULL) {
unit_err(m, "failed to allocate gr_ctx mappings");
goto ch_cleanup;
}

err = gr_test_intr_block_ptr_as_current_ctx(m, g, ch, tsg, tsgid);
if (err != 0) {
unit_err(m, "isr failed with block_ptr as current_ctx\n");

@@ -37,6 +37,7 @@
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/ctx_mappings.h>
#include <nvgpu/gr/obj_ctx.h>
#include <nvgpu/tsg_subctx.h>

#include <nvgpu/posix/posix-fault-injection.h>
#include <nvgpu/posix/dma.h>
@@ -119,7 +120,7 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
struct nvgpu_gr_global_ctx_buffer_desc *global_desc;
struct nvgpu_gr_ctx *gr_ctx = NULL;
struct nvgpu_gr_ctx_mappings *mappings = NULL;
struct nvgpu_gr_subctx *subctx = NULL;
struct nvgpu_tsg_subctx *subctx = NULL;
struct nvgpu_mem inst_block;
struct nvgpu_gr_config *config = nvgpu_gr_get_config_ptr(g);
struct nvgpu_posix_fault_inj *kmem_fi =
@@ -132,6 +133,8 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
struct nvgpu_gr_config *config);
struct nvgpu_tsg *tsg = (struct nvgpu_tsg *)
malloc(sizeof(struct nvgpu_tsg));
struct nvgpu_channel *channel = (struct nvgpu_channel *)
malloc(sizeof(struct nvgpu_channel));

/* Inject allocation failures and initialize obj_ctx, should fail */
nvgpu_posix_enable_fault_injection(kmem_fi, true, 0);
@@ -196,16 +199,31 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
unit_return_fail(m, "failed to allocate global buffers");
}

subctx = nvgpu_gr_subctx_alloc(g, vm);
if (!subctx) {
unit_return_fail(m, "failed to allocate subcontext");
channel->g = g;
channel->vm = vm;

err = nvgpu_tsg_subctx_bind_channel(tsg, channel);
if (err != 0) {
unit_return_fail(m, "tsg subctx bind failed");
}

mappings = nvgpu_gr_ctx_mappings_create(g, tsg, vm);
if (mappings == NULL) {
unit_return_fail(m, "failed to allocate gr_ctx mappings");
err = nvgpu_tsg_subctx_alloc_gr_subctx(g, channel);
if (err != 0) {
unit_return_fail(m, "failed to allocate gr_subctx");
}

err = nvgpu_tsg_subctx_setup_subctx_header(g, channel);
if (err != 0) {
unit_return_fail(m, "failed to setup subctx header");
}

mappings = nvgpu_gr_ctx_alloc_or_get_mappings(g, tsg, channel);
if (mappings == NULL) {
unit_return_fail(m, "failed to allocate or get mappings");
}

subctx = channel->subctx;

/* Fail gr_ctx allocation */
nvgpu_posix_enable_fault_injection(kmem_fi, true, 0);
err = nvgpu_gr_obj_ctx_alloc(g, golden_image, global_desc, desc,
@@ -396,7 +414,7 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
}

/* Cleanup */
nvgpu_gr_subctx_free(g, subctx, vm);
nvgpu_tsg_subctx_unbind_channel(tsg, channel);
nvgpu_gr_ctx_free(g, gr_ctx, global_desc);
nvgpu_free_gr_ctx_struct(g, gr_ctx);
nvgpu_gr_ctx_desc_free(g, desc);

@@ -209,12 +209,6 @@ static int gr_test_setup_allocate_ch_tsg(struct unit_module *m,
goto ch_cleanup;
}

err = nvgpu_tsg_bind_channel(tsg, ch);
if (err != 0) {
unit_err(m, "failed tsg channel bind\n");
goto ch_cleanup;
}

err = gk20a_as_alloc_share(g,
0U, NVGPU_AS_ALLOC_UNIFIED_VA,
U64(SZ_4K) << U64(10),
@@ -230,6 +224,12 @@ static int gr_test_setup_allocate_ch_tsg(struct unit_module *m,
goto tsg_unbind;
}

err = nvgpu_tsg_bind_channel(tsg, ch);
if (err != 0) {
unit_err(m, "failed tsg channel bind\n");
goto ch_cleanup;
}

gr_setup_ch = ch;
gr_setup_tsg = tsg;

@@ -574,7 +574,7 @@ static int gr_setup_alloc_no_tsg_subcontext(struct unit_module *m, struct gk20a

static void gr_setup_fake_free_obj_ctx(struct unit_module *m, struct gk20a *g)
{
struct nvgpu_gr_subctx *gr_subctx = gr_setup_ch->subctx;
struct nvgpu_tsg_subctx *gr_subctx = gr_setup_ch->subctx;

/* pass NULL variable*/
gr_setup_ch->subctx = NULL;