From 693305c0fd58f93337d52b2784a98bbfc59d2f75 Mon Sep 17 00:00:00 2001 From: Sagar Kamble Date: Sun, 24 Apr 2022 22:37:03 +0530 Subject: [PATCH] gpu: nvgpu: subcontext add/remove support Subcontext PDBs and the valid mask in the instance blocks of the channels in various subcontexts have to be updated when a new subcontext is created or a subcontext is removed. Replayable fault state is cached in the channel structure. Replayable fault state for a subcontext is set based on the first channel's bind parameter. It was earlier programmed in the channel_setup_ramfc function. init_inst_block_core is updated to set up the TSG-level PDB map and mask. Added a new HAL, gv11b_channel_bind, to enable the subcontext on channel bind. Bug 3677982 Change-Id: I58156c5b3ab6309b6a4b8e72b0e798d6a39c1bee Signed-off-by: Sagar Kamble Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2719994 Reviewed-by: Ankur Kishore GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/common/fifo/channel.c | 9 +- drivers/gpu/nvgpu/common/fifo/tsg.c | 35 +++- drivers/gpu/nvgpu/common/fifo/tsg_subctx.c | 20 ++ .../gpu/nvgpu/common/fifo/tsg_subctx_priv.h | 3 + drivers/gpu/nvgpu/common/gr/gr_falcon.c | 6 +- drivers/gpu/nvgpu/common/mm/mm.c | 27 ++- drivers/gpu/nvgpu/common/perf/perfbuf.c | 7 +- .../gpu/nvgpu/hal/fifo/channel_ga10b_fusa.c | 10 + drivers/gpu/nvgpu/hal/fifo/channel_gv11b.h | 3 +- .../gpu/nvgpu/hal/fifo/channel_gv11b_fusa.c | 36 ++++ drivers/gpu/nvgpu/hal/fifo/ramfc_ga10b_fusa.c | 10 - drivers/gpu/nvgpu/hal/fifo/ramfc_gv11b_fusa.c | 10 - drivers/gpu/nvgpu/hal/fifo/ramfc_tu104.c | 10 - drivers/gpu/nvgpu/hal/fifo/ramin_gv11b.h | 12 +- drivers/gpu/nvgpu/hal/fifo/ramin_gv11b_fusa.c | 134 ++++++------ drivers/gpu/nvgpu/hal/fifo/tsg_gv11b.h | 59 +++++- drivers/gpu/nvgpu/hal/fifo/tsg_gv11b_fusa.c | 190 ++++++++++++++++++ drivers/gpu/nvgpu/hal/init/hal_ga100.c | 7 + drivers/gpu/nvgpu/hal/init/hal_ga10b.c | 7 + drivers/gpu/nvgpu/hal/init/hal_gm20b.c | 2 +- drivers/gpu/nvgpu/hal/init/hal_gv11b.c | 9 +- drivers/gpu/nvgpu/hal/init/hal_tu104.c | 9 +- drivers/gpu/nvgpu/hal/mm/mm_gk20a.c | 10 +- drivers/gpu/nvgpu/hal/mm/mm_gk20a.h | 4 +- drivers/gpu/nvgpu/hal/mm/mm_gp10b_fusa.c | 6 +- drivers/gpu/nvgpu/hal/mm/mm_gv11b.h | 2 +- drivers/gpu/nvgpu/hal/mm/mm_gv11b_fusa.c | 39 +++- .../gpu/nvgpu/hal/vgpu/init/vgpu_hal_ga10b.c | 7 + .../gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c | 7 + drivers/gpu/nvgpu/include/nvgpu/channel.h | 8 +- drivers/gpu/nvgpu/include/nvgpu/gops/mm.h | 2 +- drivers/gpu/nvgpu/include/nvgpu/gops/ramin.h | 86 ++++++-- drivers/gpu/nvgpu/include/nvgpu/gops/tsg.h | 7 +- drivers/gpu/nvgpu/include/nvgpu/tsg.h | 4 + drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h | 31 +++ userspace/units/acr/nvgpu-acr.c | 10 +- .../fifo/channel/gv11b/nvgpu-channel-gv11b.c | 10 +- userspace/units/fifo/channel/nvgpu-channel.c | 7 + .../fifo/ramfc/gv11b/nvgpu-ramfc-gv11b.c | 14 +- .../units/fifo/ramin/gv11b/ramin-gv11b-fusa.c | 37 +++- .../units/fifo/ramin/gv11b/ramin-gv11b-fusa.h | 4 +- 41 files changed, 739 insertions(+), 171 deletions(-) diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c index 3051135fd..8e46d70de 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c @@ -1005,7 +1005,6 @@ static void channel_free(struct nvgpu_channel *ch, bool force) nvgpu_cic_rm_wait_for_deferred_interrupts(g); unbind: - g->ops.channel.unbind(ch); g->ops.channel.free_inst(g, ch); nvgpu_channel_wdt_destroy(ch->wdt); @@ -1520,6 +1519,14 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c, }
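Before the diff itself, a minimal standalone sketch (not driver code) of the two per-TSG structures this patch introduces. The 64-VEID count and the macro names are assumptions for illustration; the driver actually sizes both from g->ops.gr.init.get_max_subctx_count():

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_SUBCTX_COUNT 64U  /* assumed VEID count for this example */
    #define WORDS_PER_SUBCTX 4U   /* lo/format word, hi word, two unused words per subcontext */
    #define BITS_PER_LONG (8U * sizeof(unsigned long))

    int main(void)
    {
            /* subctx_pdb_map: four 32-bit words per subcontext (see gv11b_tsg_init_subctx_state below) */
            unsigned map_bytes = MAX_SUBCTX_COUNT * sizeof(uint32_t) * WORDS_PER_SUBCTX;
            /* valid_subctxs: one bit per VEID, rounded up to whole longs (what BITS_TO_LONGS does) */
            unsigned bitmap_longs = (MAX_SUBCTX_COUNT + BITS_PER_LONG - 1U) / BITS_PER_LONG;

            printf("subctx_pdb_map: %u bytes, valid_subctxs: %u long(s)\n",
                   map_bytes, bitmap_longs);  /* 1024 bytes and 1 long on an LP64 build */
            return 0;
    }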
#endif + c->replayable = false; + +#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT + if ((args->flags & NVGPU_SETUP_BIND_FLAGS_REPLAYABLE_FAULTS_ENABLE) != 0U) { + c->replayable = true; + } +#endif + if ((args->flags & NVGPU_SETUP_BIND_FLAGS_USERMODE_SUPPORT) != 0U) { err = nvgpu_channel_setup_usermode(c, args); } else { diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c index 357292631..fc0be9f21 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg.c +++ b/drivers/gpu/nvgpu/common/fifo/tsg.c @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_NVGPU_PROFILER #include #endif @@ -292,12 +293,7 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg, } #endif - /** - * Remove channel from TSG and re-enable rest of the channels. - * Since channel removal can lead to subctx removal and/or - * VM mappings removal, acquire ctx_init_lock. - */ - nvgpu_mutex_acquire(&tsg->ctx_init_lock); + g->ops.channel.unbind(ch); nvgpu_rwsem_down_write(&tsg->ch_list_lock); nvgpu_tsg_subctx_unbind_channel(tsg, ch); @@ -311,8 +307,6 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg, g->ops.channel.disable(ch); nvgpu_rwsem_up_write(&tsg->ch_list_lock); - nvgpu_mutex_release(&tsg->ctx_init_lock); - /* * Don't re-enable all channels if TSG has timed out already * @@ -345,8 +339,16 @@ int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch, nvgpu_log_fn(g, "unbind tsg:%u ch:%u\n", tsg->tsgid, ch->chid); + /** + * Remove channel from TSG and re-enable rest of the channels. + * Since channel removal can lead to subctx removal and/or + * VM mappings removal, acquire ctx_init_lock. + */ + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + err = nvgpu_tsg_unbind_channel_common(tsg, ch); if (!force && err == -EAGAIN) { + nvgpu_mutex_release(&tsg->ctx_init_lock); return err; } @@ -368,6 +370,8 @@ int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch, } } + nvgpu_mutex_release(&tsg->ctx_init_lock); + nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release); return err; @@ -413,7 +417,7 @@ fail_common: } #endif - nvgpu_mutex_acquire(&tsg->ctx_init_lock); + g->ops.channel.unbind(ch); nvgpu_rwsem_down_write(&tsg->ch_list_lock); nvgpu_tsg_subctx_unbind_channel(tsg, ch); @@ -907,6 +911,15 @@ int nvgpu_tsg_open_common(struct gk20a *g, struct nvgpu_tsg *tsg, pid_t pid) goto clean_up; } + if (g->ops.tsg.init_subctx_state != NULL) { + err = g->ops.tsg.init_subctx_state(g, tsg); + if (err != 0) { + nvgpu_err(g, "tsg %d subctx state init failed %d", + tsg->tsgid, err); + goto clean_up; + } + } + #ifdef CONFIG_NVGPU_SM_DIVERSITY nvgpu_gr_ctx_set_sm_diversity_config(tsg->gr_ctx, NVGPU_INVALID_SM_CONFIG_ID); @@ -970,6 +983,10 @@ void nvgpu_tsg_release_common(struct gk20a *g, struct nvgpu_tsg *tsg) nvgpu_free_gr_ctx_struct(g, tsg->gr_ctx); tsg->gr_ctx = NULL; + if (g->ops.tsg.deinit_subctx_state != NULL) { + g->ops.tsg.deinit_subctx_state(g, tsg); + } + if (g->ops.tsg.deinit_eng_method_buffers != NULL) { g->ops.tsg.deinit_eng_method_buffers(g, tsg); } diff --git a/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c b/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c index 4864ccf72..ece7d6517 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c +++ b/drivers/gpu/nvgpu/common/fifo/tsg_subctx.c @@ -120,6 +120,10 @@ void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg, nvgpu_list_del(&ch->subctx_entry); if (nvgpu_list_empty(&subctx->ch_list)) { + if (g->ops.tsg.remove_subctx_channel_hw != NULL) { + g->ops.tsg.remove_subctx_channel_hw(ch); + } + if 
(g->ops.gr.setup.free_subctx != NULL) { g->ops.gr.setup.free_subctx(ch); subctx->gr_subctx = NULL; @@ -203,6 +207,22 @@ u32 nvgpu_tsg_subctx_get_id(struct nvgpu_tsg_subctx *subctx) return subctx->subctx_id; } +void nvgpu_tsg_subctx_set_replayable(struct nvgpu_tsg_subctx *subctx, + bool replayable) +{ + subctx->replayable = replayable; +} + +bool nvgpu_tsg_subctx_get_replayable(struct nvgpu_tsg_subctx *subctx) +{ + return subctx->replayable; +} + +struct vm_gk20a *nvgpu_tsg_subctx_get_vm(struct nvgpu_tsg_subctx *subctx) +{ + return subctx->vm; +} + struct nvgpu_gr_ctx_mappings *nvgpu_tsg_subctx_alloc_or_get_mappings( struct gk20a *g, struct nvgpu_tsg *tsg, diff --git a/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h b/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h index ed6376148..0939b7760 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h +++ b/drivers/gpu/nvgpu/common/fifo/tsg_subctx_priv.h @@ -44,6 +44,9 @@ struct nvgpu_tsg_subctx { /** Subcontext's GR ctx header and GR ctx buffers mappings. */ struct nvgpu_gr_subctx *gr_subctx; + /** Replayable faults state for a subcontext. */ + bool replayable; + /** * Subcontext's entry in TSG's (#nvgpu_tsg) subcontexts list * #subctx_list. diff --git a/drivers/gpu/nvgpu/common/gr/gr_falcon.c b/drivers/gpu/nvgpu/common/gr/gr_falcon.c index 919f1b18f..f47cd1567 100644 --- a/drivers/gpu/nvgpu/common/gr/gr_falcon.c +++ b/drivers/gpu/nvgpu/common/gr/gr_falcon.c @@ -227,7 +227,11 @@ static int nvgpu_gr_falcon_init_ctxsw_ucode_vaspace(struct gk20a *g, return err; } - g->ops.mm.init_inst_block_core(&ucode_info->inst_blk_desc, vm, 0); + err = g->ops.mm.init_inst_block_core(&ucode_info->inst_blk_desc, vm, 0); + if (err != 0) { + nvgpu_free_inst_block(g, &ucode_info->inst_blk_desc); + return err; + } /* Map ucode surface to GMMU */ ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm, diff --git a/drivers/gpu/nvgpu/common/mm/mm.c b/drivers/gpu/nvgpu/common/mm/mm.c index c4f8c99cf..035400585 100644 --- a/drivers/gpu/nvgpu/common/mm/mm.c +++ b/drivers/gpu/nvgpu/common/mm/mm.c @@ -228,7 +228,12 @@ static int nvgpu_init_system_vm(struct mm_gk20a *mm) if (err != 0) { goto clean_up_vm; } - g->ops.mm.init_inst_block_core(inst_block, mm->pmu.vm, big_page_size); + + err = g->ops.mm.init_inst_block_core(inst_block, mm->pmu.vm, big_page_size); + if (err != 0) { + nvgpu_free_inst_block(g, inst_block); + goto clean_up_vm; + } return 0; @@ -247,7 +252,12 @@ static int nvgpu_init_hwpm(struct mm_gk20a *mm) if (err != 0) { return err; } - g->ops.mm.init_inst_block_core(inst_block, mm->pmu.vm, 0); + + err = g->ops.mm.init_inst_block_core(inst_block, mm->pmu.vm, 0); + if (err != 0) { + nvgpu_free_inst_block(g, inst_block); + return err; + } return 0; } @@ -366,7 +376,12 @@ static int nvgpu_init_bar1_vm(struct mm_gk20a *mm) if (err != 0) { goto clean_up_vm; } - g->ops.mm.init_inst_block_core(inst_block, mm->bar1.vm, big_page_size); + + err = g->ops.mm.init_inst_block_core(inst_block, mm->bar1.vm, big_page_size); + if (err != 0) { + nvgpu_free_inst_block(g, inst_block); + goto clean_up_vm; + } return 0; @@ -401,7 +416,11 @@ static int nvgpu_init_engine_ucode_vm(struct gk20a *g, goto clean_up_va; } - g->ops.mm.init_inst_block_core(inst_block, ucode->vm, big_page_size); + err = g->ops.mm.init_inst_block_core(inst_block, ucode->vm, big_page_size); + if (err != 0) { + nvgpu_free_inst_block(g, inst_block); + goto clean_up_va; + } return 0; diff --git a/drivers/gpu/nvgpu/common/perf/perfbuf.c b/drivers/gpu/nvgpu/common/perf/perfbuf.c index c541a4e26..a5a1e30d5 100644 --- 
a/drivers/gpu/nvgpu/common/perf/perfbuf.c +++ b/drivers/gpu/nvgpu/common/perf/perfbuf.c @@ -72,7 +72,12 @@ int nvgpu_perfbuf_init_inst_block(struct gk20a *g) return err; } - g->ops.mm.init_inst_block_core(&mm->perfbuf.inst_block, mm->perfbuf.vm, 0); + err = g->ops.mm.init_inst_block_core(&mm->perfbuf.inst_block, mm->perfbuf.vm, 0); + if (err != 0) { + nvgpu_free_inst_block(g, &mm->perfbuf.inst_block); + return err; + } + g->ops.perf.init_inst_block(g, &mm->perfbuf.inst_block); return 0; diff --git a/drivers/gpu/nvgpu/hal/fifo/channel_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/channel_ga10b_fusa.c index 26e2014d8..8747f98f0 100644 --- a/drivers/gpu/nvgpu/hal/fifo/channel_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/channel_ga10b_fusa.c @@ -82,9 +82,19 @@ void ga10b_channel_bind(struct nvgpu_channel *ch) { struct gk20a *g = ch->g; struct nvgpu_runlist *runlist = NULL; + int err; runlist = ch->runlist; + /* Enable subcontext */ + if (g->ops.tsg.add_subctx_channel_hw != NULL) { + err = g->ops.tsg.add_subctx_channel_hw(ch, ch->replayable); + if (err != 0) { + nvgpu_err(g, "Subcontext addition failed %d", err); + return; + } + } + /* Enable channel */ nvgpu_chram_bar0_writel(g, runlist, runlist_chram_channel_r(ch->chid), runlist_chram_channel_update_f( diff --git a/drivers/gpu/nvgpu/hal/fifo/channel_gv11b.h b/drivers/gpu/nvgpu/hal/fifo/channel_gv11b.h index a563b2b27..eb0e64998 100644 --- a/drivers/gpu/nvgpu/hal/fifo/channel_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fifo/channel_gv11b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,6 +30,7 @@ struct nvgpu_channel; struct nvgpu_channel_hw_state; struct nvgpu_debug_context; +void gv11b_channel_bind(struct nvgpu_channel *ch); void gv11b_channel_unbind(struct nvgpu_channel *ch); u32 gv11b_channel_count(struct gk20a *g); void gv11b_channel_read_state(struct gk20a *g, struct nvgpu_channel *ch, diff --git a/drivers/gpu/nvgpu/hal/fifo/channel_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/channel_gv11b_fusa.c index 59edfbba0..7e52b834f 100644 --- a/drivers/gpu/nvgpu/hal/fifo/channel_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/channel_gv11b_fusa.c @@ -32,6 +32,42 @@ #include +void gv11b_channel_bind(struct nvgpu_channel *ch) +{ + struct gk20a *g = ch->g; + int err; + + u32 inst_ptr = nvgpu_inst_block_ptr(g, &ch->inst_block); + + nvgpu_log_info(g, "bind channel %d inst ptr 0x%08x", + ch->chid, inst_ptr); + + /* Enable subcontext */ + if (g->ops.tsg.add_subctx_channel_hw != NULL) { + err = g->ops.tsg.add_subctx_channel_hw(ch, ch->replayable); + if (err != 0) { + nvgpu_err(g, "Subcontext addition failed %d", err); + return; + } + } + + /* Enable channel */ + nvgpu_writel(g, ccsr_channel_inst_r(ch->chid), + ccsr_channel_inst_ptr_f(inst_ptr) | + nvgpu_aperture_mask(g, &ch->inst_block, + ccsr_channel_inst_target_sys_mem_ncoh_f(), + ccsr_channel_inst_target_sys_mem_coh_f(), + ccsr_channel_inst_target_vid_mem_f()) | + ccsr_channel_inst_bind_true_f()); + + nvgpu_writel(g, ccsr_channel_r(ch->chid), + (nvgpu_readl(g, ccsr_channel_r(ch->chid)) & + ~ccsr_channel_enable_set_f(~U32(0U))) | + ccsr_channel_enable_set_true_f()); + + nvgpu_atomic_set(&ch->bound, 1); +} + void gv11b_channel_unbind(struct nvgpu_channel *ch) { struct gk20a *g = ch->g; diff --git a/drivers/gpu/nvgpu/hal/fifo/ramfc_ga10b_fusa.c 
b/drivers/gpu/nvgpu/hal/fifo/ramfc_ga10b_fusa.c index 50840ddc6..f644bd885 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ramfc_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/ramfc_ga10b_fusa.c @@ -44,7 +44,6 @@ int ga10b_ramfc_setup(struct nvgpu_channel *ch, u64 gpfifo_base, u32 eng_intr_mask = 0U; u32 eng_intr_vector = 0U; u32 eng_bitmask = 0U; - bool replayable = false; (void)flags; @@ -65,18 +64,9 @@ int ga10b_ramfc_setup(struct nvgpu_channel *ch, u64 gpfifo_base, nvgpu_memset(g, mem, 0U, 0U, ram_fc_size_val_v()); -#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT - if ((flags & NVGPU_SETUP_BIND_FLAGS_REPLAYABLE_FAULTS_ENABLE) != 0U) { - replayable = true; - } -#endif - nvgpu_log_info(g, "%llu %u", pbdma_acquire_timeout, g->ops.pbdma.acquire_val(pbdma_acquire_timeout)); - g->ops.ramin.init_subctx_pdb(g, mem, ch->vm->pdb.mem, - replayable, nvgpu_channel_get_max_subctx_count(ch)); - nvgpu_mem_wr32(g, mem, ram_fc_gp_base_w(), g->ops.pbdma.get_gp_base(gpfifo_base)); diff --git a/drivers/gpu/nvgpu/hal/fifo/ramfc_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/ramfc_gv11b_fusa.c index b2a2245ca..f482b3f98 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ramfc_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/ramfc_gv11b_fusa.c @@ -38,7 +38,6 @@ int gv11b_ramfc_setup(struct nvgpu_channel *ch, u64 gpfifo_base, struct gk20a *g = ch->g; struct nvgpu_mem *mem = &ch->inst_block; u32 data; - bool replayable = false; (void)flags; @@ -46,18 +45,9 @@ int gv11b_ramfc_setup(struct nvgpu_channel *ch, u64 gpfifo_base, nvgpu_memset(g, mem, 0, 0, ram_fc_size_val_v()); -#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT - if ((flags & NVGPU_SETUP_BIND_FLAGS_REPLAYABLE_FAULTS_ENABLE) != 0U) { - replayable = true; - } -#endif - nvgpu_log_info(g, "%llu %u", pbdma_acquire_timeout, g->ops.pbdma.acquire_val(pbdma_acquire_timeout)); - g->ops.ramin.init_subctx_pdb(g, mem, ch->vm->pdb.mem, - replayable, nvgpu_channel_get_max_subctx_count(ch)); - nvgpu_mem_wr32(g, mem, ram_fc_gp_base_w(), g->ops.pbdma.get_gp_base(gpfifo_base)); diff --git a/drivers/gpu/nvgpu/hal/fifo/ramfc_tu104.c b/drivers/gpu/nvgpu/hal/fifo/ramfc_tu104.c index cba9b0373..4a8c577bc 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ramfc_tu104.c +++ b/drivers/gpu/nvgpu/hal/fifo/ramfc_tu104.c @@ -39,24 +39,14 @@ int tu104_ramfc_setup(struct nvgpu_channel *ch, u64 gpfifo_base, struct gk20a *g = ch->g; struct nvgpu_mem *mem = &ch->inst_block; u32 data; - bool replayable = false; nvgpu_log_fn(g, " "); nvgpu_memset(g, mem, 0, 0, ram_fc_size_val_v()); -#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT - if ((flags & NVGPU_SETUP_BIND_FLAGS_REPLAYABLE_FAULTS_ENABLE) != 0U) { - replayable = true; - } -#endif - nvgpu_log_info(g, "%llu %u", pbdma_acquire_timeout, g->ops.pbdma.acquire_val(pbdma_acquire_timeout)); - g->ops.ramin.init_subctx_pdb(g, mem, ch->vm->pdb.mem, - replayable, nvgpu_channel_get_max_subctx_count(ch)); - nvgpu_mem_wr32(g, mem, ram_fc_gp_base_w(), g->ops.pbdma.get_gp_base(gpfifo_base)); diff --git a/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b.h b/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b.h index c7321f1d0..e0c3e3702 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,9 +30,15 @@ struct nvgpu_mem; void gv11b_ramin_set_gr_ptr(struct gk20a *g, struct nvgpu_mem *inst_block, u64 gpu_va); +void gv11b_ramin_set_subctx_pdb_info(struct gk20a *g, + u32 subctx_id, struct nvgpu_mem *pdb_mem, + bool replayable, bool add, u32 *subctx_pdb_map); +void gv11b_ramin_init_subctx_pdb_map(struct gk20a *g, + u32 *subctx_pdb_map); +void gv11b_ramin_init_subctx_valid_mask(struct gk20a *g, + struct nvgpu_mem *inst_block, unsigned long *valid_subctx_mask); void gv11b_ramin_init_subctx_pdb(struct gk20a *g, - struct nvgpu_mem *inst_block, struct nvgpu_mem *pdb_mem, - bool replayable, u32 max_subctx_count); + struct nvgpu_mem *inst_block, u32 *subctx_pdb_map); void gv11b_ramin_set_eng_method_buffer(struct gk20a *g, struct nvgpu_mem *inst_block, u64 gpu_va); void gv11b_ramin_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, diff --git a/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b_fusa.c index abb5cb83f..55551e01b 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/ramin_gv11b_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -45,85 +45,89 @@ void gv11b_ramin_set_gr_ptr(struct gk20a *g, ram_in_engine_wfi_ptr_hi_f(addr_hi)); } -static void gv11b_subctx_commit_valid_mask(struct gk20a *g, - struct nvgpu_mem *inst_block, u32 max_subctx_count) +void gv11b_ramin_set_subctx_pdb_info(struct gk20a *g, + u32 subctx_id, struct nvgpu_mem *pdb_mem, + bool replayable, bool add, u32 *subctx_pdb_map) { + u32 format_word = 0; + u32 pdb_addr_lo = 0; + u32 pdb_addr_hi = 0; + u64 pdb_addr; + u32 aperture; + + if (add) { + aperture = nvgpu_aperture_mask(g, pdb_mem, + ram_in_sc_page_dir_base_target_sys_mem_ncoh_v(), + ram_in_sc_page_dir_base_target_sys_mem_coh_v(), + ram_in_sc_page_dir_base_target_vid_mem_v()); + + pdb_addr = nvgpu_mem_get_addr(g, pdb_mem); + pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); + pdb_addr_hi = u64_hi32(pdb_addr); + format_word = ram_in_sc_page_dir_base_target_f(aperture, 0U) | + ram_in_sc_page_dir_base_vol_f( + ram_in_sc_page_dir_base_vol_true_v(), 0U) | + ram_in_sc_use_ver2_pt_format_f(1U, 0U) | + ram_in_sc_big_page_size_f(1U, 0U) | + ram_in_sc_page_dir_base_lo_0_f(pdb_addr_lo); + + if (replayable) { + format_word |= + ram_in_sc_page_dir_base_fault_replay_tex_f(1U, 0U) | + ram_in_sc_page_dir_base_fault_replay_gcc_f(1U, 0U); + } + } else { + aperture = ram_in_sc_page_dir_base_target_invalid_v(); + format_word = ram_in_sc_page_dir_base_target_f(aperture, 0U); + } + + nvgpu_log(g, gpu_dbg_info, "%s subctx[%u] pdb info lo %x hi %x", + add ? "add" : "remove", subctx_id, + format_word, pdb_addr_hi); + + subctx_pdb_map[subctx_id * 4U] = format_word; + subctx_pdb_map[(subctx_id * 4U) + 1U] = pdb_addr_hi; +} + +void gv11b_ramin_init_subctx_pdb_map(struct gk20a *g, + u32 *subctx_pdb_map) +{ + u32 max_subctx_count = g->ops.gr.init.get_max_subctx_count(); + u32 i; + + /* Initially, all subcontexts are invalid in the TSG. 
*/ + for (i = 0; i < max_subctx_count; i++) { + gv11b_ramin_set_subctx_pdb_info(g, i, NULL, false, false, + subctx_pdb_map); + } +} + +void gv11b_ramin_init_subctx_valid_mask(struct gk20a *g, + struct nvgpu_mem *inst_block, unsigned long *valid_subctx_mask) +{ + u32 max_subctx_count = g->ops.gr.init.get_max_subctx_count(); u32 id; - u32 subctx_count = max_subctx_count; for (id = 0U; id < max_subctx_count; id += 32U) { - u32 subctx_mask_max_bit = ((subctx_count < 32U) ? - (subctx_count % 32U) : 0U); - u32 subctx_mask = U32_MAX; - - if (subctx_mask_max_bit != 0U) { - subctx_mask = nvgpu_safe_sub_u32( - BIT32(subctx_mask_max_bit), 1U); - } + u32 subctx_mask = ((u32 *)valid_subctx_mask)[id / 32U]; nvgpu_mem_wr32(g, inst_block, ram_in_sc_pdb_valid_long_w(id), subctx_mask); nvgpu_log(g, gpu_dbg_info | gpu_dbg_mig, - "id[%d] max_subctx_count[%u] subctx_mask_max_bit[%u] " - "subctx_count[%u] subctx_mask[%x] ", - id, max_subctx_count, subctx_mask_max_bit, - subctx_count, subctx_mask); - - if (subctx_count > 32U) { - subctx_count = nvgpu_safe_sub_u32(subctx_count, 32U); - } - } -} - -static void gv11b_subctx_commit_pdb(struct gk20a *g, - struct nvgpu_mem *inst_block, struct nvgpu_mem *pdb_mem, - bool replayable, u32 max_subctx_count) -{ - u32 lo, hi; - u32 subctx_id = 0; - u32 format_word; - u32 pdb_addr_lo, pdb_addr_hi; - u64 pdb_addr; - u32 aperture = nvgpu_aperture_mask(g, pdb_mem, - ram_in_sc_page_dir_base_target_sys_mem_ncoh_v(), - ram_in_sc_page_dir_base_target_sys_mem_coh_v(), - ram_in_sc_page_dir_base_target_vid_mem_v()); - - pdb_addr = nvgpu_mem_get_addr(g, pdb_mem); - pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); - pdb_addr_hi = u64_hi32(pdb_addr); - format_word = ram_in_sc_page_dir_base_target_f(aperture, 0U) | - ram_in_sc_page_dir_base_vol_f( - ram_in_sc_page_dir_base_vol_true_v(), 0U) | - ram_in_sc_use_ver2_pt_format_f(1U, 0U) | - ram_in_sc_big_page_size_f(1U, 0U) | - ram_in_sc_page_dir_base_lo_0_f(pdb_addr_lo); - - if (replayable) { - format_word |= - ram_in_sc_page_dir_base_fault_replay_tex_f(1U, 0U) | - ram_in_sc_page_dir_base_fault_replay_gcc_f(1U, 0U); - } - - nvgpu_log(g, gpu_dbg_info, " pdb info lo %x hi %x", - format_word, pdb_addr_hi); - for (subctx_id = 0U; subctx_id < max_subctx_count; subctx_id++) { - lo = ram_in_sc_page_dir_base_vol_w(subctx_id); - hi = ram_in_sc_page_dir_base_hi_w(subctx_id); - nvgpu_mem_wr32(g, inst_block, lo, format_word); - nvgpu_mem_wr32(g, inst_block, hi, pdb_addr_hi); + "id[%d] max_subctx_count[%u] subctx_mask[%x] ", + id, max_subctx_count, subctx_mask); } } void gv11b_ramin_init_subctx_pdb(struct gk20a *g, - struct nvgpu_mem *inst_block, struct nvgpu_mem *pdb_mem, - bool replayable, u32 max_subctx_count) + struct nvgpu_mem *inst_block, u32 *subctx_pdb_map) { - gv11b_subctx_commit_pdb(g, inst_block, pdb_mem, replayable, - max_subctx_count); - gv11b_subctx_commit_valid_mask(g, inst_block, max_subctx_count); + u32 max_subctx_count = g->ops.gr.init.get_max_subctx_count(); + u32 size = max_subctx_count * 4U * 4U; + nvgpu_mem_wr_n(g, inst_block, ram_in_sc_page_dir_base_vol_w(0) * 4U, + subctx_pdb_map, size); } void gv11b_ramin_set_eng_method_buffer(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b.h b/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b.h index f148cf23b..cb55192e0 100644 --- a/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
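One subtlety in gv11b_ramin_init_subctx_valid_mask() above: the unsigned long bitmap is read as 32-bit words via ((u32 *)valid_subctx_mask)[id / 32U]. A standalone sketch of that aliasing, assuming a little-endian LP64 build (which the supported targets are); the variable names here are illustrative:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
            unsigned long valid_subctxs[1] = { 0UL };  /* 64 VEIDs fit in one long on LP64 */
            uint32_t words[2];

            valid_subctxs[0] |= 1UL << 0;   /* VEID 0 valid */
            valid_subctxs[0] |= 1UL << 33;  /* VEID 33 valid */

            /* memcpy models the u32-word view the driver takes of the bitmap */
            memcpy(words, valid_subctxs, sizeof(words));
            printf("ram_in_sc_pdb_valid words: 0x%08x 0x%08x\n",
                   words[0], words[1]);  /* 0x00000001 0x00000002 on little-endian */
            return 0;
    }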
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -38,4 +38,61 @@ void gv11b_tsg_deinit_eng_method_buffers(struct gk20a *g, void gv11b_tsg_bind_channel_eng_method_buffers(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch); +/** + * @brief Initialize subcontext PDB map and valid mask for a TSG. + * + * @param g [in] Pointer to GPU driver struct. + * @param tsg [in] Pointer to TSG struct. + * + * - If subcontexts are enabled: + * - Allocate array of PDB configuration values for maximum supported + * subcontexts. + * - Initialize the array by calling g->ops.ramin.init_subctx_pdb_map. + * - Allocate valid subcontexts bitmask. + * + * @return 0 in case of success, < 0 in case of failure. + */ +int gv11b_tsg_init_subctx_state(struct gk20a *g, struct nvgpu_tsg *tsg); + +/** + * @brief Deinitialize subcontext PDB map and valid mask for a TSG. + * + * @param g [in] Pointer to GPU driver struct. + * @param tsg [in] Pointer to TSG struct. + * + * - If subcontexts are enabled: + * - Free array of PDB configuration values. + * - Free valid subcontexts bitmask. + */ +void gv11b_tsg_deinit_subctx_state(struct gk20a *g, struct nvgpu_tsg *tsg); + +/** + * @brief Add a subctx channel to a TSG. + * + * @param ch [in] Pointer to Channel struct. + * @param replayable [in] Replayable fault state of the channel. + * + * - If subcontexts are enabled: + * - If this is the first channel of a subcontext, update the + * subcontext info in the TSG members and update the instance + * blocks of all channels in the TSG with this information. + * - If this is a channel in an existing subcontext: + * - Update the channel instance block with subcontext info. + * + * @return 0 in case of success, < 0 in case of failure. + */ +int gv11b_tsg_add_subctx_channel_hw(struct nvgpu_channel *ch, bool replayable); + +/** + * @brief Remove a subctx channel from a TSG. + * + * @param ch [in] Pointer to Channel struct. + * + * - If subcontexts are enabled: + * - Update the subcontext info in the TSG members, as this is the last + * channel of the subcontext, and update the instance blocks of all + * channels in the TSG with this information. + */ +void gv11b_tsg_remove_subctx_channel_hw(struct nvgpu_channel *ch); + #endif /* NVGPU_TSG_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b_fusa.c index 760cef6bb..cbd77728d 100644 --- a/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/tsg_gv11b_fusa.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -174,3 +175,192 @@ void gv11b_tsg_deinit_eng_method_buffers(struct gk20a *g, nvgpu_log_info(g, "eng method buffers de-allocated"); } + +int gv11b_tsg_init_subctx_state(struct gk20a *g, struct nvgpu_tsg *tsg) +{ + u32 max_subctx_count; + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return 0; + } + + max_subctx_count = g->ops.gr.init.get_max_subctx_count(); + + /* + * Allocate an array of subctx PDB configuration values for all supported + * subcontexts. For each subctx, there are two registers to be + * configured, ram_in_sc_page_dir_base_lo_w(i) and + * ram_in_sc_page_dir_base_hi_w(i), in the instance block for the channels + * belonging to this TSG. Two more unused registers follow these for each + * subcontext. The same PDB table/array is programmed in the instance block + * of all the channels.
+ * + * As subcontexts are bound to the TSG, their configuration register + * values are added to the array and the corresponding bit is set in the + * valid_subctxs bitmask. As subcontexts are unbound from + * the TSG, their configuration register values are invalidated in the + * array and the corresponding bit is cleared in the valid_subctxs bitmask. + */ + tsg->subctx_pdb_map = nvgpu_kzalloc(g, max_subctx_count * sizeof(u32) * 4U); + if (tsg->subctx_pdb_map == NULL) { + nvgpu_err(g, "subctx_pdb_map alloc failed"); + return -ENOMEM; + } + + g->ops.ramin.init_subctx_pdb_map(g, tsg->subctx_pdb_map); + + tsg->valid_subctxs = nvgpu_kzalloc(g, + BITS_TO_LONGS(max_subctx_count) * + sizeof(unsigned long)); + if (tsg->valid_subctxs == NULL) { + nvgpu_err(g, "valid_subctxs bitmap alloc failed"); + nvgpu_kfree(g, tsg->subctx_pdb_map); + tsg->subctx_pdb_map = NULL; + return -ENOMEM; + } + + return 0; +} + +void gv11b_tsg_deinit_subctx_state(struct gk20a *g, struct nvgpu_tsg *tsg) +{ + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return; + } + + nvgpu_kfree(g, tsg->subctx_pdb_map); + tsg->subctx_pdb_map = NULL; + + nvgpu_kfree(g, tsg->valid_subctxs); + tsg->valid_subctxs = NULL; +} + +static void gv11b_tsg_update_inst_blocks_subctxs(struct nvgpu_tsg *tsg) +{ + struct gk20a *g = tsg->g; + struct nvgpu_channel *ch; + + nvgpu_list_for_each_entry(ch, &tsg->ch_list, nvgpu_channel, ch_entry) { + g->ops.ramin.init_subctx_pdb(g, &ch->inst_block, + tsg->subctx_pdb_map); + g->ops.ramin.init_subctx_mask(g, &ch->inst_block, + tsg->valid_subctxs); + } +} + +static void gv11b_tsg_update_subctxs(struct nvgpu_tsg *tsg, u32 subctx_id, + struct vm_gk20a *vm, bool replayable, bool add) +{ + struct gk20a *g = tsg->g; + + if (add) { + g->ops.ramin.set_subctx_pdb_info(g, subctx_id, vm->pdb.mem, + replayable, true, tsg->subctx_pdb_map); + nvgpu_set_bit(subctx_id, tsg->valid_subctxs); + } else { + g->ops.ramin.set_subctx_pdb_info(g, subctx_id, NULL, + false, false, tsg->subctx_pdb_map); + nvgpu_clear_bit(subctx_id, tsg->valid_subctxs); + } + + gv11b_tsg_update_inst_blocks_subctxs(tsg); +} + +static void gv11b_tsg_add_new_subctx_channel_hw(struct nvgpu_channel *ch, + bool replayable) +{ + struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + struct nvgpu_tsg_subctx *subctx = ch->subctx; + struct vm_gk20a *vm = nvgpu_tsg_subctx_get_vm(subctx); + u32 subctx_id = nvgpu_tsg_subctx_get_id(subctx); + + nvgpu_tsg_subctx_set_replayable(subctx, replayable); + + gv11b_tsg_update_subctxs(tsg, subctx_id, vm, replayable, true); +} + +static void gv11b_tsg_add_existing_subctx_channel_hw(struct nvgpu_channel *ch, + bool replayable) +{ + struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + struct nvgpu_tsg_subctx *subctx = ch->subctx; + struct gk20a *g = ch->g; + + if (nvgpu_tsg_subctx_get_replayable(subctx) != replayable) { + nvgpu_err(g, "subctx replayable mismatch. ignoring."); + } + + g->ops.ramin.init_subctx_pdb(g, &ch->inst_block, tsg->subctx_pdb_map); + g->ops.ramin.init_subctx_mask(g, &ch->inst_block, tsg->valid_subctxs); +} + +int gv11b_tsg_add_subctx_channel_hw(struct nvgpu_channel *ch, bool replayable) +{ + struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + struct gk20a *g = tsg->g; + int err; + + nvgpu_log(g, gpu_dbg_fn, " "); + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return 0; + } + + /* + * Add new subcontext to the TSG. Sequence for this is below: + * 1. Disable TSG. + * 2. Preempt TSG. + * 3. Program subctx PDBs in instance blocks of all channels in + * the TSG. + * 4. Enable TSG.
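As a standalone toy trace of what the update path below does to the map on add versus remove (the values and the INVALID_TARGET_FMT stand-in are made up; the real format word comes from the ram_in_sc_page_dir_base_* accessors):

    #include <stdio.h>
    #include <stdint.h>

    #define WORDS_PER_SUBCTX   4U
    #define INVALID_TARGET_FMT 0x0U  /* stand-in for a format word with target = invalid */

    /* Mirrors gv11b_ramin_set_subctx_pdb_info(): only words 0 and 1 of each entry carry state. */
    static void set_subctx_pdb_info(uint32_t *map, uint32_t id,
                                    uint32_t format_word, uint32_t pdb_addr_hi, int add)
    {
            map[id * WORDS_PER_SUBCTX]      = add ? format_word : INVALID_TARGET_FMT;
            map[id * WORDS_PER_SUBCTX + 1U] = add ? pdb_addr_hi : 0U;
    }

    int main(void)
    {
            uint32_t map[2U * WORDS_PER_SUBCTX] = { 0 };

            set_subctx_pdb_info(map, 1U, 0xDEADBEEFU, 0x12U, 1);  /* first channel of VEID 1 binds */
            printf("add:    fmt=0x%08x hi=0x%08x\n", map[4], map[5]);

            set_subctx_pdb_info(map, 1U, 0U, 0U, 0);              /* last channel of VEID 1 unbinds */
            printf("remove: fmt=0x%08x hi=0x%08x\n", map[4], map[5]);
            return 0;
    }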
+ * This sequence is executed while holding the TSG-level lock + * ctx_init_lock to synchronize with channels from other subcontexts. + * ctx_init_lock is reused here. It is originally there for + * synchronizing the GR context initialization by various + * channels in the TSG. + */ + + nvgpu_mutex_acquire(&tsg->ctx_init_lock); + + g->ops.tsg.disable(tsg); + err = g->ops.fifo.preempt_tsg(g, tsg); + if (err != 0) { + g->ops.tsg.enable(tsg); + nvgpu_mutex_release(&tsg->ctx_init_lock); + nvgpu_err(g, "preempt failed %d", err); + return err; + } + + nvgpu_rwsem_down_read(&tsg->ch_list_lock); + + if (!nvgpu_test_bit(ch->subctx_id, tsg->valid_subctxs)) { + gv11b_tsg_add_new_subctx_channel_hw(ch, replayable); + } else { + gv11b_tsg_add_existing_subctx_channel_hw(ch, replayable); + } + + nvgpu_rwsem_up_read(&tsg->ch_list_lock); + + + g->ops.tsg.enable(tsg); + nvgpu_mutex_release(&tsg->ctx_init_lock); + + nvgpu_log(g, gpu_dbg_fn, "done"); + + return 0; +} + +void gv11b_tsg_remove_subctx_channel_hw(struct nvgpu_channel *ch) +{ + struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + struct gk20a *g = tsg->g; + u32 subctx_id; + + if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) { + return; + } + + subctx_id = nvgpu_tsg_subctx_get_id(ch->subctx); + + gv11b_tsg_update_subctxs(tsg, subctx_id, NULL, false, false); +} diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga100.c b/drivers/gpu/nvgpu/hal/init/hal_ga100.c index 34bdb2071..01036cf87 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_ga100.c +++ b/drivers/gpu/nvgpu/hal/init/hal_ga100.c @@ -1131,7 +1131,10 @@ static const struct gops_ramin ga100_ops_ramin = { .set_gr_ptr = gv11b_ramin_set_gr_ptr, .set_big_page_size = gm20b_ramin_set_big_page_size, .init_pdb = ga10b_ramin_init_pdb, + .init_subctx_pdb_map = gv11b_ramin_init_subctx_pdb_map, + .set_subctx_pdb_info = gv11b_ramin_set_subctx_pdb_info, .init_subctx_pdb = gv11b_ramin_init_subctx_pdb, + .init_subctx_mask = gv11b_ramin_init_subctx_valid_mask, .set_adr_limit = NULL, .base_shift = gk20a_ramin_base_shift, .alloc_size = gk20a_ramin_alloc_size, @@ -1199,6 +1202,10 @@ static const struct gops_channel ga100_ops_channel = { static const struct gops_tsg ga100_ops_tsg = { .enable = gv11b_tsg_enable, .disable = nvgpu_tsg_disable, + .init_subctx_state = gv11b_tsg_init_subctx_state, + .deinit_subctx_state = gv11b_tsg_deinit_subctx_state, + .add_subctx_channel_hw = gv11b_tsg_add_subctx_channel_hw, + .remove_subctx_channel_hw = gv11b_tsg_remove_subctx_channel_hw, .init_eng_method_buffers = gv11b_tsg_init_eng_method_buffers, .deinit_eng_method_buffers = gv11b_tsg_deinit_eng_method_buffers, .bind_channel = NULL, diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c index 2342f5edb..2fbd2c163 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c @@ -1145,7 +1145,10 @@ static const struct gops_ramin ga10b_ops_ramin = { .set_gr_ptr = gv11b_ramin_set_gr_ptr, .set_big_page_size = gm20b_ramin_set_big_page_size, .init_pdb = ga10b_ramin_init_pdb, + .init_subctx_pdb_map = gv11b_ramin_init_subctx_pdb_map, + .set_subctx_pdb_info = gv11b_ramin_set_subctx_pdb_info, .init_subctx_pdb = gv11b_ramin_init_subctx_pdb, + .init_subctx_mask = gv11b_ramin_init_subctx_valid_mask, .set_adr_limit = NULL, .base_shift = gk20a_ramin_base_shift, .alloc_size = gk20a_ramin_alloc_size, @@ -1214,6 +1217,10 @@ static const struct gops_channel ga10b_ops_channel = { static const struct gops_tsg ga10b_ops_tsg = { .enable = gv11b_tsg_enable, .disable = nvgpu_tsg_disable, +
.init_subctx_state = gv11b_tsg_init_subctx_state, + .deinit_subctx_state = gv11b_tsg_deinit_subctx_state, + .add_subctx_channel_hw = gv11b_tsg_add_subctx_channel_hw, + .remove_subctx_channel_hw = gv11b_tsg_remove_subctx_channel_hw, .init_eng_method_buffers = gv11b_tsg_init_eng_method_buffers, .deinit_eng_method_buffers = gv11b_tsg_deinit_eng_method_buffers, .bind_channel = NULL, diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 35604bc97..f34b77cb9 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -803,7 +803,7 @@ static const struct gops_mm gm20b_ops_mm = { .setup_hw = nvgpu_mm_setup_hw, .is_bar1_supported = gm20b_mm_is_bar1_supported, .init_inst_block = gk20a_mm_init_inst_block, - .init_inst_block_core = gk20a_mm_init_inst_block, + .init_inst_block_core = gk20a_mm_init_inst_block_core, .get_default_va_sizes = gm20b_mm_get_default_va_sizes, #ifdef CONFIG_NVGPU_USERD .bar1_map_userd = gk20a_mm_bar1_map_userd, diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index e691fad13..d575f30ca 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -969,7 +969,10 @@ static const struct gops_ramin gv11b_ops_ramin = { .set_gr_ptr = gv11b_ramin_set_gr_ptr, .set_big_page_size = gm20b_ramin_set_big_page_size, .init_pdb = gv11b_ramin_init_pdb, + .init_subctx_pdb_map = gv11b_ramin_init_subctx_pdb_map, + .set_subctx_pdb_info = gv11b_ramin_set_subctx_pdb_info, .init_subctx_pdb = gv11b_ramin_init_subctx_pdb, + .init_subctx_mask = gv11b_ramin_init_subctx_valid_mask, .set_adr_limit = NULL, .base_shift = gk20a_ramin_base_shift, .alloc_size = gk20a_ramin_alloc_size, @@ -1013,7 +1016,7 @@ static const struct gops_userd gv11b_ops_userd = { static const struct gops_channel gv11b_ops_channel = { .alloc_inst = nvgpu_channel_alloc_inst, .free_inst = nvgpu_channel_free_inst, - .bind = gm20b_channel_bind, + .bind = gv11b_channel_bind, .unbind = gv11b_channel_unbind, .enable = gk20a_channel_enable, .disable = gk20a_channel_disable, @@ -1030,6 +1033,10 @@ static const struct gops_channel gv11b_ops_channel = { static const struct gops_tsg gv11b_ops_tsg = { .enable = gv11b_tsg_enable, .disable = nvgpu_tsg_disable, + .init_subctx_state = gv11b_tsg_init_subctx_state, + .deinit_subctx_state = gv11b_tsg_deinit_subctx_state, + .add_subctx_channel_hw = gv11b_tsg_add_subctx_channel_hw, + .remove_subctx_channel_hw = gv11b_tsg_remove_subctx_channel_hw, .init_eng_method_buffers = gv11b_tsg_init_eng_method_buffers, .deinit_eng_method_buffers = gv11b_tsg_deinit_eng_method_buffers, .bind_channel = NULL, diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index 74790477d..a22151849 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -1026,7 +1026,10 @@ static const struct gops_ramin tu104_ops_ramin = { .set_gr_ptr = gv11b_ramin_set_gr_ptr, .set_big_page_size = gm20b_ramin_set_big_page_size, .init_pdb = gv11b_ramin_init_pdb, + .init_subctx_pdb_map = gv11b_ramin_init_subctx_pdb_map, + .set_subctx_pdb_info = gv11b_ramin_set_subctx_pdb_info, .init_subctx_pdb = gv11b_ramin_init_subctx_pdb, + .init_subctx_mask = gv11b_ramin_init_subctx_valid_mask, .set_adr_limit = NULL, .base_shift = gk20a_ramin_base_shift, .alloc_size = gk20a_ramin_alloc_size, @@ -1068,7 +1071,7 @@ static const struct gops_userd tu104_ops_userd = { static const struct gops_channel tu104_ops_channel = { 
.alloc_inst = nvgpu_channel_alloc_inst, .free_inst = nvgpu_channel_free_inst, - .bind = gm20b_channel_bind, + .bind = gv11b_channel_bind, .unbind = gv11b_channel_unbind, .enable = gk20a_channel_enable, .disable = gk20a_channel_disable, @@ -1085,6 +1088,10 @@ static const struct gops_channel tu104_ops_channel = { static const struct gops_tsg tu104_ops_tsg = { .enable = gv11b_tsg_enable, .disable = nvgpu_tsg_disable, + .init_subctx_state = gv11b_tsg_init_subctx_state, + .deinit_subctx_state = gv11b_tsg_deinit_subctx_state, + .add_subctx_channel_hw = gv11b_tsg_add_subctx_channel_hw, + .remove_subctx_channel_hw = gv11b_tsg_remove_subctx_channel_hw, .init_eng_method_buffers = gv11b_tsg_init_eng_method_buffers, .deinit_eng_method_buffers = gv11b_tsg_deinit_eng_method_buffers, .bind_channel = NULL, diff --git a/drivers/gpu/nvgpu/hal/mm/mm_gk20a.c b/drivers/gpu/nvgpu/hal/mm/mm_gk20a.c index f404b2fee..fc7917cf5 100644 --- a/drivers/gpu/nvgpu/hal/mm/mm_gk20a.c +++ b/drivers/gpu/nvgpu/hal/mm/mm_gk20a.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -44,6 +44,14 @@ void gk20a_mm_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm, } } +int gk20a_mm_init_inst_block_core(struct nvgpu_mem *inst_block, + struct vm_gk20a *vm, u32 big_page_size) +{ + gk20a_mm_init_inst_block(inst_block, vm, big_page_size); + + return 0; +} + #ifdef CONFIG_NVGPU_USERD u64 gk20a_mm_bar1_map_userd(struct gk20a *g, struct nvgpu_mem *mem, u32 offset) { diff --git a/drivers/gpu/nvgpu/hal/mm/mm_gk20a.h b/drivers/gpu/nvgpu/hal/mm/mm_gk20a.h index b74049969..8bed6444a 100644 --- a/drivers/gpu/nvgpu/hal/mm/mm_gk20a.h +++ b/drivers/gpu/nvgpu/hal/mm/mm_gk20a.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,6 +30,8 @@ struct vm_gk20a; void gk20a_mm_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm, u32 big_page_size); +int gk20a_mm_init_inst_block_core(struct nvgpu_mem *inst_block, + struct vm_gk20a *vm, u32 big_page_size); u64 gk20a_mm_bar1_map_userd(struct gk20a *g, struct nvgpu_mem *mem, u32 offset); #endif diff --git a/drivers/gpu/nvgpu/hal/mm/mm_gp10b_fusa.c b/drivers/gpu/nvgpu/hal/mm/mm_gp10b_fusa.c index 53a396596..94a617abe 100644 --- a/drivers/gpu/nvgpu/hal/mm/mm_gp10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/mm/mm_gp10b_fusa.c @@ -51,7 +51,11 @@ int gp10b_mm_init_bar2_vm(struct gk20a *g) goto clean_up_va; } - g->ops.mm.init_inst_block_core(inst_block, mm->bar2.vm, big_page_size); + err = g->ops.mm.init_inst_block_core(inst_block, mm->bar2.vm, big_page_size); + if (err != 0) { + nvgpu_free_inst_block(g, inst_block); + goto clean_up_va; + } return 0; diff --git a/drivers/gpu/nvgpu/hal/mm/mm_gv11b.h b/drivers/gpu/nvgpu/hal/mm/mm_gv11b.h index 754335887..f0e0f7157 100644 --- a/drivers/gpu/nvgpu/hal/mm/mm_gv11b.h +++ b/drivers/gpu/nvgpu/hal/mm/mm_gv11b.h @@ -30,7 +30,7 @@ struct vm_gk20a; void gv11b_mm_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm, u32 big_page_size); -void gv11b_mm_init_inst_block_core(struct nvgpu_mem *inst_block, +int gv11b_mm_init_inst_block_core(struct nvgpu_mem *inst_block, struct vm_gk20a *vm, u32 big_page_size); bool gv11b_mm_is_bar1_supported(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/hal/mm/mm_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/mm/mm_gv11b_fusa.c index edf501959..2e8244c38 100644 --- a/drivers/gpu/nvgpu/hal/mm/mm_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/mm/mm_gv11b_fusa.c @@ -21,6 +21,7 @@ */ #include +#include #include #include @@ -42,14 +43,48 @@ void gv11b_mm_init_inst_block(struct nvgpu_mem *inst_block, } } -void gv11b_mm_init_inst_block_core(struct nvgpu_mem *inst_block, +int gv11b_mm_init_inst_block_core(struct nvgpu_mem *inst_block, struct vm_gk20a *vm, u32 big_page_size) { struct gk20a *g = gk20a_from_vm(vm); + u32 max_subctx_count = g->ops.gr.init.get_max_subctx_count(); + unsigned long *valid_subctxs; + u32 *subctx_pdb_map; + + subctx_pdb_map = nvgpu_kzalloc(g, max_subctx_count * sizeof(u32) * 4U); + if (subctx_pdb_map == NULL) { + nvgpu_err(g, "subctx_pdb_map alloc failed"); + return -ENOMEM; + } + + valid_subctxs = nvgpu_kzalloc(g, + BITS_TO_LONGS(max_subctx_count) * + sizeof(unsigned long)); + if (valid_subctxs == NULL) { + nvgpu_err(g, "valid_subctxs bitmask alloc failed"); + nvgpu_kfree(g, subctx_pdb_map); + return -ENOMEM; + } gv11b_mm_init_inst_block(inst_block, vm, big_page_size); - g->ops.ramin.init_subctx_pdb(g, inst_block, vm->pdb.mem, false, 1U); + /* Program subctx pdb info in the instance block */ + g->ops.ramin.init_subctx_pdb_map(g, subctx_pdb_map); + g->ops.ramin.set_subctx_pdb_info(g, CHANNEL_INFO_VEID0, vm->pdb.mem, + false, true, subctx_pdb_map); + g->ops.ramin.init_subctx_pdb(g, inst_block, subctx_pdb_map); + + /* + * Program subctx pdb valid mask in the instance block. + * Only subctx 0 is valid here. 
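Since init_inst_block_core now returns an error, every call site in this patch follows the same recovery shape; sketched here as a fragment reusing the names from the surrounding hunks, not a complete function:

    err = g->ops.mm.init_inst_block_core(inst_block, vm, big_page_size);
    if (err != 0) {
            /* The inst block was allocated before this call; release it on failure. */
            nvgpu_free_inst_block(g, inst_block);
            return err;  /* typically -ENOMEM from the subctx map allocations */
    }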
+ */ + nvgpu_set_bit(CHANNEL_INFO_VEID0, valid_subctxs); + g->ops.ramin.init_subctx_mask(g, inst_block, valid_subctxs); + + nvgpu_kfree(g, valid_subctxs); + nvgpu_kfree(g, subctx_pdb_map); + + return 0; } bool gv11b_mm_is_bar1_supported(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_ga10b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_ga10b.c index dc3244ca5..38c608e5d 100644 --- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_ga10b.c +++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_ga10b.c @@ -693,7 +693,10 @@ static const struct gops_ramin vgpu_ga10b_ops_ramin = { .set_gr_ptr = NULL, .set_big_page_size = gm20b_ramin_set_big_page_size, .init_pdb = gv11b_ramin_init_pdb, + .init_subctx_pdb_map = gv11b_ramin_init_subctx_pdb_map, + .set_subctx_pdb_info = gv11b_ramin_set_subctx_pdb_info, .init_subctx_pdb = gv11b_ramin_init_subctx_pdb, + .init_subctx_mask = gv11b_ramin_init_subctx_valid_mask, .set_adr_limit = NULL, .base_shift = gk20a_ramin_base_shift, .alloc_size = gk20a_ramin_alloc_size, @@ -744,6 +747,10 @@ static const struct gops_channel vgpu_ga10b_ops_channel = { static const struct gops_tsg vgpu_ga10b_ops_tsg = { .open = vgpu_tsg_open, .release = vgpu_tsg_release, + .init_subctx_state = NULL, + .deinit_subctx_state = NULL, + .add_subctx_channel_hw = NULL, + .remove_subctx_channel_hw = NULL, .init_eng_method_buffers = NULL, .deinit_eng_method_buffers = NULL, .enable = gv11b_tsg_enable, diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c index 770f5e7e6..c68c788b9 100644 --- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c @@ -660,7 +660,10 @@ static const struct gops_ramin vgpu_gv11b_ops_ramin = { .set_gr_ptr = NULL, .set_big_page_size = gm20b_ramin_set_big_page_size, .init_pdb = gv11b_ramin_init_pdb, + .init_subctx_pdb_map = gv11b_ramin_init_subctx_pdb_map, + .set_subctx_pdb_info = gv11b_ramin_set_subctx_pdb_info, .init_subctx_pdb = gv11b_ramin_init_subctx_pdb, + .init_subctx_mask = gv11b_ramin_init_subctx_valid_mask, .set_adr_limit = NULL, .base_shift = gk20a_ramin_base_shift, .alloc_size = gk20a_ramin_alloc_size, @@ -712,6 +715,10 @@ static const struct gops_channel vgpu_gv11b_ops_channel = { static const struct gops_tsg vgpu_gv11b_ops_tsg = { .open = vgpu_tsg_open, .release = vgpu_tsg_release, + .init_subctx_state = NULL, + .deinit_subctx_state = NULL, + .add_subctx_channel_hw = NULL, + .remove_subctx_channel_hw = NULL, .init_eng_method_buffers = NULL, .deinit_eng_method_buffers = NULL, .enable = gv11b_tsg_enable, diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index 573a2a5a8..1ce5484fb 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -501,6 +501,11 @@ struct nvgpu_channel { /** Runlist the channel will run on. */ struct nvgpu_runlist *runlist; + /** + * Replayable fault state for the channel. + */ + bool replayable; + /** * Recovery path can be entered twice for the same error in * case of mmu_nack. This flag indicates if we already recovered @@ -960,7 +965,7 @@ struct nvgpu_channel *nvgpu_channel_open_new(struct gk20a *g, pid_t pid, pid_t tid); /** - * @brief Setup and bind the channel + * @brief Setup and bind the channel and add subcontext PDB. * * @param ch [in] Channel pointer. * @param args [in] Setup bind arguments. @@ -975,6 +980,7 @@ struct nvgpu_channel *nvgpu_channel_open_new(struct gk20a *g, * provided in args. 
A submit token is passed back to be written in the * doorbell register in the usermode region to notify the GPU for new * work on this channel. + * Update the instance blocks of all channels to add the subctx pdb. * * @note An address space needs to have been bound to the channel before * calling this function. diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/mm.h b/drivers/gpu/nvgpu/include/nvgpu/gops/mm.h index 60fb44d96..a13c67bdb 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/mm.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/mm.h @@ -566,7 +566,7 @@ struct gops_mm { * - Configures the pdb base, big page size and * 0th sub context's pdb base in context's instance block memory. */ - void (*init_inst_block_core)(struct nvgpu_mem *inst_block, + int (*init_inst_block_core)(struct nvgpu_mem *inst_block, struct vm_gk20a *vm, u32 big_page_size); /** diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/ramin.h b/drivers/gpu/nvgpu/include/nvgpu/gops/ramin.h index 2fd5d017a..1e3c54678 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/ramin.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/ramin.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -90,34 +90,78 @@ struct gops_ramin { void (*init_pdb)(struct gk20a *g, struct nvgpu_mem *inst_block, u64 pdb_addr, struct nvgpu_mem *pdb_mem); + /** + * @brief Init subcontext pdb map for a TSG. + * + * @param g [in] Pointer to GPU driver struct. + * @param subctx_pdb_map [in] Memory pointing to pdb map for a TSG. + * + * This HAL configures PDB for all subcontexts of an instance block. + * It sets all PDBs invalid. + */ + void (*init_subctx_pdb_map)(struct gk20a *g, + u32 *subctx_pdb_map); + + /** + * @brief Update subcontext pdb map for subcontext addition/removal. + * + * @param g [in] Pointer to GPU driver struct. + * @param subctx_id [in] Subcontext ID. + * @param pdb_mem [in] Memory descriptor of PDB. + * @param replayable [in] Indicates if errors are replayable + * for this Subcontext. + * @param add [in] Indicate if subcontext PDB is to be + * added or removed. + * @param subctx_pdb_map [in] Memory pointing to pdb map for a TSG. + * + * This HAL configures PDB for sub-context of Instance Block: + * If adding a subcontext PDB: + * - Get aperture mask from \a pdb_mem. + * - Get physical address of \a pdb_mem. + * - Build PDB entry with defaults for PT version, big page size, + * volatile attribute, and above aperture. + * - If \a replayable is true, set replayable attribute for TEX + * and GCC faults. + * - Set lo and hi 32-bits to point to \a pdb_mem. + * - Program related entry in \a subctx_pdb_map. + * If removing a subcontext PDB: + * - Set aperture as ram_in_sc_page_dir_base_target_invalid_v(). + * - Program related entry in \a subctx_pdb_map. + */ + void (*set_subctx_pdb_info)(struct gk20a *g, + u32 subctx_id, struct nvgpu_mem *pdb_mem, + bool replayable, bool add, u32 *subctx_pdb_map); + /** * @brief Init PDB for sub-contexts. * * @param g [in] Pointer to GPU driver struct. * @param inst_block [in] Memory descriptor of Instance Block. - * @param pdb_mem [in] Memory descriptor of PDB. - * @param replayable [in] Indicates if errors are replayable - * for this Instance Block. - * @param max_subctx_count [in] Max number of sub context. 
+ * @param subctx_pdb_map [in] Memory pointing to pdb map for a TSG. * - * This HAL configures PDB for all sub-contexts of Instance Block: - * - Get max number of sub-contexts from HW. - * - Get aperture mask from \a pdb_mem. - * - Get physical address of \a pdb_mem. - * - For each sub-context: - * - Build PDB entry with defaults for PT version, big page size, - * volatile attribute, and above aperture. - * - If \a replayable is true, set replayable attribute for TEX - * and GCC faults. - * - Set lo and hi 32-bits to point to \a pdb_mem. - * - Program related entry in Instance Block. - * - * @see NVGPU_SETUP_BIND_FLAGS_REPLAYABLE_FAULTS_ENABLE + * This HAL configures PDB for all sub-contexts of Instance Block. + * It copies \a subctx_pdb_map to the offset + * ram_in_sc_page_dir_base_vol_w(0) * 4U in + * the instance block. */ void (*init_subctx_pdb)(struct gk20a *g, - struct nvgpu_mem *inst_block, - struct nvgpu_mem *pdb_mem, - bool replayable, u32 max_subctx_count); + struct nvgpu_mem *inst_block, u32 *subctx_pdb_map); + + /** + * @brief Set the valid subcontexts mask. + * + * @param g [in] Pointer to GPU driver struct. + * @param inst_block [in] Memory descriptor of Instance + * Block. + * @param valid_subctx_mask [in] Bitmask of valid subcontexts. + * + * This HAL configures the mask for all sub-contexts of Instance Block: + * - Get max number of sub-contexts from HW. + * - For each set of 32 subcontexts, set the mask from + * \a valid_subctx_mask in ram_in_sc_pdb_valid_long_w(). + */ + void (*init_subctx_mask)(struct gk20a *g, + struct nvgpu_mem *inst_block, unsigned long *valid_subctx_mask); /** * @brief Instance Block shift. diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/gops/tsg.h index aff7a4eb2..b99babb35 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/tsg.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/tsg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -55,6 +55,11 @@ struct gops_tsg { /** @cond DOXYGEN_SHOULD_SKIP_THIS */ int (*open)(struct nvgpu_tsg *tsg); void (*release)(struct nvgpu_tsg *tsg); + int (*init_subctx_state)(struct gk20a *g, struct nvgpu_tsg *tsg); + void (*deinit_subctx_state)(struct gk20a *g, struct nvgpu_tsg *tsg); + int (*add_subctx_channel_hw)(struct nvgpu_channel *ch, + bool replayable); + void (*remove_subctx_channel_hw)(struct nvgpu_channel *ch); int (*init_eng_method_buffers)(struct gk20a *g, struct nvgpu_tsg *tsg); void (*deinit_eng_method_buffers)(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/tsg.h index 9b6280df1..cd92c8f27 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h +++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h @@ -117,6 +117,10 @@ struct nvgpu_tsg { */ struct nvgpu_ref refcount; + /** Subcontext PDB entries (four words per subcontext) mirrored into + * the instance blocks of all channels in this TSG. */ + u32 *subctx_pdb_map; + + /** Bitmask of valid subcontexts bound to this TSG, one bit per VEID. */ + unsigned long *valid_subctxs; + /** * List of subcontexts (#nvgpu_tsg_subctx) bound to this TSG. * Accessed by holding #ch_list_lock from TSG.
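Taken together, the ramin HALs documented above are always invoked in a fixed order; a condensed sketch of that sequence, mirroring gv11b_mm_init_inst_block_core() and the ramin unit test below, with veid, vm, ch, and the two buffers standing in for caller state:

    /* 1. Start from an all-invalid shadow map for the TSG. */
    g->ops.ramin.init_subctx_pdb_map(g, subctx_pdb_map);

    /* 2. For each bound subcontext, fill its PDB entry and set its valid bit. */
    g->ops.ramin.set_subctx_pdb_info(g, veid, vm->pdb.mem,
                                     replayable, true, subctx_pdb_map);
    nvgpu_set_bit(veid, valid_subctxs);

    /* 3. Copy the whole map into the channel's instance block. */
    g->ops.ramin.init_subctx_pdb(g, &ch->inst_block, subctx_pdb_map);

    /* 4. Publish the valid-subcontext mask. */
    g->ops.ramin.init_subctx_mask(g, &ch->inst_block, valid_subctxs);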
diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h b/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h index 73cfd444e..4ce1f7826 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h +++ b/drivers/gpu/nvgpu/include/nvgpu/tsg_subctx.h @@ -60,6 +60,8 @@ int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg, * - Validate that #subctx is allocated for the channel #ch. * - Remove the channel from the subctx #ch_list. * - If the subctx #ch_list is empty + * - Update the instance blocks of all channels to remove the + * subctx pdb. * - Invoke g->ops.gr.setup.free_subctx to free the GR subcontext * struct (and GR subcontext mappings struct). * - Remove the subctx from the TSG #subctx_list. @@ -120,6 +122,35 @@ struct nvgpu_gr_subctx *nvgpu_tsg_subctx_get_gr_subctx( */ u32 nvgpu_tsg_subctx_get_id(struct nvgpu_tsg_subctx *tsg_subctx); +/** + * @brief Set replayable state for a TSG subcontext. + * + * @param subctx [in] Pointer to TSG subcontext. + * @param replayable [in] Replayable state for the subcontext. + * + * - Set #replayable in #nvgpu_tsg_subctx. + */ +void nvgpu_tsg_subctx_set_replayable(struct nvgpu_tsg_subctx *subctx, + bool replayable); + +/** + * @brief Get replayable state for a TSG subcontext. + * + * @param subctx [in] Pointer to TSG subcontext. + * + * - Return #replayable from #nvgpu_tsg_subctx. + */ +bool nvgpu_tsg_subctx_get_replayable(struct nvgpu_tsg_subctx *subctx); + +/** + * @brief Get VM for a TSG subcontext. + * + * @param subctx [in] Pointer to TSG subcontext. + * + * - Return #vm from #nvgpu_tsg_subctx. + */ +struct vm_gk20a *nvgpu_tsg_subctx_get_vm(struct nvgpu_tsg_subctx *subctx); + /** * @brief Allocate or get the mappings struct for the TSG subcontext. * diff --git a/userspace/units/acr/nvgpu-acr.c b/userspace/units/acr/nvgpu-acr.c index 27ea3f0fa..8532f9748 100644 --- a/userspace/units/acr/nvgpu-acr.c +++ b/userspace/units/acr/nvgpu-acr.c @@ -826,24 +826,24 @@ int test_acr_prepare_ucode_blob(struct unit_module *m, nvgpu_posix_enable_fault_injection(kmem_fi, false, 0); - nvgpu_posix_enable_fault_injection(kmem_fi, true, 17); + nvgpu_posix_enable_fault_injection(kmem_fi, true, 19); - unit_info(m, " kmem counter 17\n"); + unit_info(m, " kmem counter 19\n"); err = g->acr->prepare_ucode_blob(g); if (err != -ENOENT) { - unit_return_fail(m, "kmem count 17 test did not fail as expected\n"); + unit_return_fail(m, "kmem count 19 test did not fail as expected\n"); } /* - * the kmem counter is decreased after 17th count + * the kmem counter is decreased after 19th count * because in the first attempt new memory is allocated and mapped for * page directories but after that since memory is already allocated it * is just mapped. Thus, number of kmallocs decrease. */ nvgpu_posix_enable_fault_injection(kmem_fi, false, 0); - for (i = 9; i < 17; i++) { + for (i = 9; i < 19; i++) { unit_info(m, "kmem counter %d\n", i); nvgpu_posix_enable_fault_injection(kmem_fi, true, i); err = g->acr->prepare_ucode_blob(g); diff --git a/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c b/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c index 8b4631f43..096fb5dee 100644 --- a/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c +++ b/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
diff --git a/userspace/units/acr/nvgpu-acr.c b/userspace/units/acr/nvgpu-acr.c
index 27ea3f0fa..8532f9748 100644
--- a/userspace/units/acr/nvgpu-acr.c
+++ b/userspace/units/acr/nvgpu-acr.c
@@ -826,24 +826,24 @@ int test_acr_prepare_ucode_blob(struct unit_module *m,
 
 	nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
 
-	nvgpu_posix_enable_fault_injection(kmem_fi, true, 17);
+	nvgpu_posix_enable_fault_injection(kmem_fi, true, 19);
 
-	unit_info(m, " kmem counter 17\n");
+	unit_info(m, " kmem counter 19\n");
 	err = g->acr->prepare_ucode_blob(g);
 	if (err != -ENOENT) {
-		unit_return_fail(m, "kmem count 17 test did not fail as expected\n");
+		unit_return_fail(m, "kmem count 19 test did not fail as expected\n");
 	}
 
 	/*
-	 * the kmem counter is decreased after 17th count
+	 * the kmem counter is decreased after the 19th count
 	 * because in the first attempt new memory is allocated and mapped for
 	 * page directories but after that since memory is already allocated it
 	 * is just mapped. Thus, number of kmallocs decrease.
 	 */
 	nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
 
-	for (i = 9; i < 17; i++) {
+	for (i = 9; i < 19; i++) {
 		unit_info(m, "kmem counter %d\n", i);
 		nvgpu_posix_enable_fault_injection(kmem_fi, true, i);
 		err = g->acr->prepare_ucode_blob(g);
diff --git a/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c b/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c
index 8b4631f43..096fb5dee 100644
--- a/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c
+++ b/userspace/units/fifo/channel/gv11b/nvgpu-channel-gv11b.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -69,6 +69,12 @@ struct unit_ctx {
 	size_t size;
 };
 
+static int stub_add_subctx_channel_hw(struct nvgpu_channel *ch,
+		bool replayable)
+{
+	return 0;
+}
+
 int test_gv11b_channel_unbind(struct unit_module *m,
 		struct gk20a *g, void *args)
 {
@@ -77,6 +83,8 @@ int test_gv11b_channel_unbind(struct unit_module *m,
 	struct nvgpu_channel *ch;
 	int ret = UNIT_FAIL;
 
+	g->ops.tsg.add_subctx_channel_hw = stub_add_subctx_channel_hw;
+
 	ch = nvgpu_channel_open_new(g, runlist_id,
 			privileged, getpid(), getpid());
 	unit_assert(ch, goto done);
diff --git a/userspace/units/fifo/channel/nvgpu-channel.c b/userspace/units/fifo/channel/nvgpu-channel.c
index 3d390f88a..85729ff42 100644
--- a/userspace/units/fifo/channel/nvgpu-channel.c
+++ b/userspace/units/fifo/channel/nvgpu-channel.c
@@ -1562,6 +1562,12 @@ static void stub_channel_work_completion_cancel_sync(struct nvgpu_channel *ch)
 }
 #endif
 
+static int stub_add_subctx_channel_hw(struct nvgpu_channel *ch,
+		bool replayable)
+{
+	return 0;
+}
+
 int test_channel_suspend_resume_serviceable_chs(struct unit_module *m,
 		struct gk20a *g, void *vargs)
 {
@@ -1594,6 +1600,7 @@ int test_channel_suspend_resume_serviceable_chs(struct unit_module *m,
 	g->ops.fifo.preempt_tsg = stub_fifo_preempt_tsg;
 	g->ops.fifo.preempt_channel = stub_fifo_preempt_channel;
 	g->ops.runlist.reload = stub_runlist_reload;
+	g->ops.tsg.add_subctx_channel_hw = stub_add_subctx_channel_hw;
 
 	orig_ch_tsgid = ch->tsgid;
 
 	for (branches = 0U; branches < F_CHANNEL_SUSPEND_RESUME_CHS_LAST;
diff --git a/userspace/units/fifo/ramfc/gv11b/nvgpu-ramfc-gv11b.c b/userspace/units/fifo/ramfc/gv11b/nvgpu-ramfc-gv11b.c
index bae3ab33d..3168f6d1c 100644
--- a/userspace/units/fifo/ramfc/gv11b/nvgpu-ramfc-gv11b.c
+++ b/userspace/units/fifo/ramfc/gv11b/nvgpu-ramfc-gv11b.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -131,13 +131,6 @@ static int stub_ramfc_commit_userd(struct nvgpu_channel *ch)
 	return 0;
 }
 
-static void stub_ramin_init_subctx_pdb(struct gk20a *g,
-		struct nvgpu_mem *inst_block, struct nvgpu_mem *pdb_mem,
-		bool replayable, u32 max_subctx_count)
-{
-	global_count++;
-}
-
 #define F_RAMFC_SETUP_PRIVILEDGED_CH	BIT(0)
 #define F_RAMFC_SETUP_LAST		BIT(1)
 
@@ -156,7 +149,6 @@ int test_gv11b_ramfc_setup(struct unit_module *m, struct gk20a *g, void *args)
 	g->ops.ramin.alloc_size = gk20a_ramin_alloc_size;
 	g->ops.pbdma.acquire_val = stub_pbdma_acquire_val;
-	g->ops.ramin.init_subctx_pdb = stub_ramin_init_subctx_pdb;
 	g->ops.pbdma.get_gp_base = stub_pbdma_get_gp_base;
 	g->ops.pbdma.get_gp_base_hi = stub_pbdma_get_gp_base_hi;
 	g->ops.pbdma.get_signature = stub_pbdma_get_signature;
@@ -199,9 +191,9 @@ int test_gv11b_ramfc_setup(struct unit_module *m, struct gk20a *g, void *args)
 			ram_fc_config_w()) == 5U, goto done);
 
 		if (branches & F_RAMFC_SETUP_PRIVILEDGED_CH) {
-			unit_assert(global_count == 15U, goto done);
+			unit_assert(global_count == 14U, goto done);
 		} else {
-			unit_assert(global_count == 13U, goto done);
+			unit_assert(global_count == 12U, goto done);
 		}
 	}
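The updated ramin test below drives the new map and mask HALs directly. For
reference, the valid-mask write that gops_ramin.init_subctx_mask is
documented to perform (one 32-bit word per 32 subcontexts) amounts to
something like the following sketch; the ram_in_sc_pdb_valid_long_w() word
indexing is schematic, not taken from the HW headers, and 64-bit longs are
assumed:

static void example_init_subctx_valid_mask(struct gk20a *g,
		struct nvgpu_mem *inst_block, unsigned long *valid_subctxs,
		u32 max_subctx_count)
{
	u32 id;

	for (id = 0U; id < max_subctx_count; id += 32U) {
		/* Extract 32 valid bits starting at subcontext "id". */
		u32 mask = (u32)(valid_subctxs[id / BITS_PER_LONG] >>
				(id % BITS_PER_LONG));

		/* The word index into the instance block is schematic. */
		nvgpu_mem_wr32(g, inst_block,
				ram_in_sc_pdb_valid_long_w() + (id / 32U),
				mask);
	}
}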
diff --git a/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.c b/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.c
index a22a0b183..d464d1fd1 100644
--- a/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.c
+++ b/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -109,6 +109,24 @@ int test_gv11b_ramin_init_subctx_pdb(struct unit_module *m, struct gk20a *g,
 	u64 pdb_addr;
 	u32 max_subctx_count = ram_in_sc_page_dir_base_target__size_1_v();
 	u32 aperture = ram_in_sc_page_dir_base_target_sys_mem_ncoh_v();
+	unsigned long *valid_subctxs;
+	u32 *subctx_pdb_map;
+
+	subctx_pdb_map = nvgpu_kzalloc(g, max_subctx_count * sizeof(u32) * 4U);
+	if (subctx_pdb_map == NULL) {
+		nvgpu_err(g, "subctx_pdb_map alloc failed");
+		return UNIT_FAIL;
+	}
+
+	valid_subctxs = nvgpu_kzalloc(g,
+			BITS_TO_LONGS(max_subctx_count) *
+			sizeof(unsigned long));
+	if (valid_subctxs == NULL) {
+		nvgpu_err(g, "valid_subctxs bitmap alloc failed");
+		nvgpu_kfree(g, subctx_pdb_map);
+		subctx_pdb_map = NULL;
+		return UNIT_FAIL;
+	}
 
 	g->ops.ramin.alloc_size = gk20a_ramin_alloc_size;
 
@@ -146,8 +164,21 @@ int test_gv11b_ramin_init_subctx_pdb(struct unit_module *m, struct gk20a *g,
 			1U, 0U);
 	}
 
-	gv11b_ramin_init_subctx_pdb(g, &inst_block, &pdb_mem,
-			replayable, 64);
+	g->ops.ramin.init_subctx_pdb_map(g, subctx_pdb_map);
+	for (subctx_id = 0; subctx_id < max_subctx_count; subctx_id++) {
+		g->ops.ramin.set_subctx_pdb_info(g, subctx_id,
+				&pdb_mem, replayable, true, subctx_pdb_map);
+		nvgpu_set_bit(subctx_id, valid_subctxs);
+	}
+
+	/* Program subctx pdb info in the instance block */
+	g->ops.ramin.init_subctx_pdb(g, &inst_block, subctx_pdb_map);
+
+	/*
+	 * Program subctx pdb valid mask in the instance block.
+	 * All subcontexts are marked valid here.
+	 */
+	g->ops.ramin.init_subctx_mask(g, &inst_block, valid_subctxs);
 
 	for (subctx_id = 0; subctx_id < max_subctx_count; subctx_id++) {
 		addr_lo = ram_in_sc_page_dir_base_vol_w(subctx_id);
diff --git a/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.h b/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.h
index 65e96e0c6..def991470 100644
--- a/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.h
+++ b/userspace/units/fifo/ramin/gv11b/ramin-gv11b-fusa.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -63,7 +63,7 @@ int test_gv11b_ramin_set_gr_ptr(struct unit_module *m, struct gk20a *g,
 * Test Type: Feature
 *
 * Targets: gops_ramin.init_subctx_pdb, gv11b_ramin_init_subctx_pdb,
- * gv11b_subctx_commit_pdb, gv11b_subctx_commit_valid_mask
+ * gops_ramin.init_subctx_mask, gv11b_ramin_init_subctx_valid_mask
 *
 * Input: None
 *