diff --git a/drivers/gpu/nvgpu/common/regops/regops.c b/drivers/gpu/nvgpu/common/regops/regops.c
index 02d23cb8f..e4d74da2f 100644
--- a/drivers/gpu/nvgpu/common/regops/regops.c
+++ b/drivers/gpu/nvgpu/common/regops/regops.c
@@ -81,14 +81,14 @@ static bool validate_reg_ops(struct gk20a *g,
 			     u32 *ctx_rd_count, u32 *ctx_wr_count,
 			     struct nvgpu_dbg_reg_op *ops,
 			     u32 op_count,
-			     bool is_profiler);
+			     bool valid_ctx,
+			     u32 *flags);
 
 int exec_regops_gk20a(struct gk20a *g,
-		      struct nvgpu_channel *ch,
+		      struct nvgpu_tsg *tsg,
 		      struct nvgpu_dbg_reg_op *ops,
 		      u32 num_ops,
-		      bool is_profiler,
-		      bool *is_current_ctx)
+		      u32 *flags)
 {
 	int err = 0;
 	unsigned int i;
@@ -99,20 +99,8 @@ int exec_regops_gk20a(struct gk20a *g,
 
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
 
-	/* For vgpu, the regops routines need to be handled in the
-	 * context of the server and support for that does not exist.
-	 *
-	 * The two users of the regops interface are the compute driver
-	 * and tools. The compute driver will work without a functional
-	 * regops implementation, so we return -ENOSYS. This will allow
-	 * compute apps to run with vgpu. Tools will not work in this
-	 * configuration and are not required to work at this time. */
-	if (g->is_virtual) {
-		return -ENOSYS;
-	}
-
 	ok = validate_reg_ops(g, &ctx_rd_count, &ctx_wr_count,
-			      ops, num_ops, is_profiler);
+			      ops, num_ops, tsg != NULL, flags);
 	if (!ok) {
 		nvgpu_err(g, "invalid op(s)");
 		err = -EINVAL;
@@ -211,9 +199,9 @@ int exec_regops_gk20a(struct gk20a *g,
 	}
 
 	if ((ctx_wr_count | ctx_rd_count) != 0U) {
-		err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops,
+		err = gr_gk20a_exec_ctx_ops(tsg, ops, num_ops,
 					    ctx_wr_count, ctx_rd_count,
-					    is_current_ctx);
+					    flags);
 		if (err != 0) {
 			nvgpu_warn(g, "failed to perform ctx ops\n");
 			goto clean_up;
@@ -269,7 +257,7 @@ static int validate_reg_op_info(struct nvgpu_dbg_reg_op *op)
 static bool check_whitelists(struct gk20a *g,
 			     struct nvgpu_dbg_reg_op *op,
 			     u32 offset,
-			     bool is_profiler)
+			     bool valid_ctx)
 {
 	bool valid = false;
 
@@ -283,7 +271,7 @@ static bool check_whitelists(struct gk20a *g,
 			regop_bsearch_range_cmp) != NULL);
 
 		/* if debug session, search context list */
-		if ((!valid) && (!is_profiler)) {
+		if ((!valid) && (valid_ctx)) {
 			/* binary search context list */
 			valid = (g->ops.regops.get_context_whitelist_ranges != NULL) &&
 				(nvgpu_bsearch(&offset,
@@ -294,7 +282,7 @@ static bool check_whitelists(struct gk20a *g,
 		}
 
 		/* if debug session, search runcontrol list */
-		if ((!valid) && (!is_profiler)) {
+		if ((!valid) && (valid_ctx)) {
 			valid = (g->ops.regops.get_runcontrol_whitelist != NULL) &&
 				linear_search(offset,
 					g->ops.regops.get_runcontrol_whitelist(),
@@ -310,7 +298,7 @@ static bool check_whitelists(struct gk20a *g,
 			regop_bsearch_range_cmp) != NULL);
 
 		/* if debug session, search runcontrol list */
-		if ((!valid) && (!is_profiler)) {
+		if ((!valid) && (valid_ctx)) {
 			valid = (g->ops.regops.get_runcontrol_whitelist != NULL) &&
 				linear_search(offset,
 					g->ops.regops.get_runcontrol_whitelist(),
@@ -324,7 +312,7 @@ static bool check_whitelists(struct gk20a *g,
 /* note: the op here has already been through validate_reg_op_info */
 static int validate_reg_op_offset(struct gk20a *g,
 				  struct nvgpu_dbg_reg_op *op,
-				  bool is_profiler)
+				  bool valid_ctx)
 {
 	int err;
 	u32 buf_offset_lo, buf_offset_addr, num_offsets, offset;
@@ -340,9 +328,9 @@ static int validate_reg_op_offset(struct gk20a *g,
 		return -EINVAL;
 	}
 
-	valid = check_whitelists(g, op, offset, is_profiler);
+	valid = check_whitelists(g, op, offset, valid_ctx);
 	if ((op->op == REGOP(READ_64) || op->op == REGOP(WRITE_64)) && valid) {
-		valid = check_whitelists(g, op, offset + 4U, is_profiler);
+		valid = check_whitelists(g, op, offset + 4U, valid_ctx);
 	}
 
 	if (valid && (op->type != REGOP(TYPE_GLOBAL))) {
@@ -383,19 +371,23 @@ static bool validate_reg_ops(struct gk20a *g,
 			     u32 *ctx_rd_count, u32 *ctx_wr_count,
 			     struct nvgpu_dbg_reg_op *ops,
 			     u32 op_count,
-			     bool is_profiler)
+			     bool valid_ctx,
+			     u32 *flags)
 {
-	u32 i;
-	bool ok = true;
+	bool all_or_none = (*flags) & NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 	bool gr_ctx_ops = false;
+	bool op_failed = false;
+	u32 i;
 
 	/* keep going until the end so every op can get
 	 * a separate error code if needed */
 	for (i = 0; i < op_count; i++) {
 		if (validate_reg_op_info(&ops[i]) != 0) {
-			ok = false;
-			break;
+			op_failed = true;
+			if (all_or_none) {
+				break;
+			}
 		}
 
 		if (reg_op_is_gr_ctx(ops[i].type)) {
@@ -408,28 +400,42 @@ static bool validate_reg_ops(struct gk20a *g,
 			gr_ctx_ops = true;
 		}
 
-		/* context operations are not valid on profiler session */
-		if (gr_ctx_ops && is_profiler) {
-			ok = false;
-			break;
+		/* context operations need valid context */
+		if (gr_ctx_ops && !valid_ctx) {
+			op_failed = true;
+			if (all_or_none) {
+				break;
+			}
 		}
 
 		/* if "allow_all" flag enabled, dont validate offset */
 		if (!g->allow_all) {
-			if (validate_reg_op_offset(g, &ops[i],
-						   is_profiler) != 0) {
-				ok = false;
-				break;
+			if (validate_reg_op_offset(g, &ops[i], valid_ctx) != 0) {
+				op_failed = true;
+				if (all_or_none) {
+					break;
+				}
 			}
 		}
 	}
 
-	if (ok) {
-		nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d",
-			  *ctx_wr_count, *ctx_rd_count);
+	nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d",
+		  *ctx_wr_count, *ctx_rd_count);
+
+	if (all_or_none) {
+		if (op_failed) {
+			return false;
+		} else {
+			return true;
+		}
 	}
 
-	return ok;
+	/* Continue on error */
+	if (!op_failed) {
+		*flags |= NVGPU_REG_OP_FLAG_ALL_PASSED;
+	}
+
+	return true;
 }
 
 /* exported for tools like cyclestats, etc */
diff --git a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c
index b8b851ba9..2f3d9f754 100644
--- a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c
+++ b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c
@@ -35,11 +35,10 @@
 #include "common/vgpu/ivc/comm_vgpu.h"
 
 int vgpu_exec_regops(struct gk20a *g,
-			struct nvgpu_channel *ch,
-			struct nvgpu_dbg_reg_op *ops,
-			u32 num_ops,
-			bool is_profiler,
-			bool *is_current_ctx)
+			struct nvgpu_tsg *tsg,
+			struct nvgpu_dbg_reg_op *ops,
+			u32 num_ops,
+			u32 *flags)
 {
 	struct tegra_vgpu_cmd_msg msg;
 	struct tegra_vgpu_reg_ops_params *p = &msg.params.reg_ops;
@@ -68,17 +67,15 @@ int vgpu_exec_regops(struct gk20a *g,
 
 	msg.cmd = TEGRA_VGPU_CMD_REG_OPS;
 	msg.handle = vgpu_get_handle(g);
-	p->handle = ch ? ch->virt_ctx : 0;
+	p->tsg_id = tsg ? tsg->tsgid : U32_MAX;
 	p->num_ops = num_ops;
-	p->is_profiler = is_profiler;
+	p->flags = *flags;
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
 	err = err ? err : msg.ret;
 	if (err == 0) {
 		nvgpu_memcpy((u8 *)ops, (u8 *)oob, ops_size);
-		if (is_current_ctx != NULL) {
-			*is_current_ctx = p->is_current_ctx != 0u;
-		}
 	}
+	*flags = p->flags;
 
 fail:
 	vgpu_ivc_oob_put_ptr(handle);
diff --git a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h
index 439ff0788..ebe274c8e 100644
--- a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h
+++ b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h
@@ -30,11 +30,10 @@ struct gk20a;
 struct nvgpu_channel;
 
 int vgpu_exec_regops(struct gk20a *g,
-			struct nvgpu_channel *ch,
-			struct nvgpu_dbg_reg_op *ops,
-			u32 num_ops,
-			bool is_profiler,
-			bool *is_current_ctx);
+			struct nvgpu_tsg *tsg,
+			struct nvgpu_dbg_reg_op *ops,
+			u32 num_ops,
+			u32 *flags);
 int vgpu_dbg_set_powergate(struct dbg_session_gk20a *dbg_s,
 			bool disable_powergate);
diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c
index d46c2ce49..965bd6f81 100644
--- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c
@@ -625,8 +625,7 @@ void gk20a_gr_init_ovr_sm_dsm_perf(void)
  * which makes it impossible to know externally whether a ctx
  * write will actually occur. so later we should put a lazy,
  * map-and-hold system in the patch write state */
-static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
-			    struct nvgpu_channel *ch,
+int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
 			    u32 addr, u32 data,
 			    struct nvgpu_gr_ctx *gr_ctx)
 {
@@ -663,15 +662,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
 
 		nvgpu_gr_ctx_patch_write(g, gr_ctx, addr, data, true);
 
-		if (ch->subctx != NULL) {
-			nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
-				false);
-			nvgpu_gr_subctx_set_patch_ctx(g,
-				ch->subctx, gr_ctx);
-		} else {
-			nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
+		nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
 				true);
-		}
 
 		/* we're not caching these on cpu side, but later watch for it */
 
@@ -1303,14 +1295,10 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
 	return -EINVAL;
 }
 
-bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
+static struct nvgpu_channel *gk20a_get_resident_ctx(struct gk20a *g, u32 *tsgid)
 {
 	u32 curr_gr_ctx;
-	u32 curr_gr_tsgid;
-	struct gk20a *g = ch->g;
 	struct nvgpu_channel *curr_ch;
-	bool ret = false;
-	struct nvgpu_tsg *tsg;
 
 	curr_gr_ctx = g->ops.gr.falcon.get_current_ctx(g);
 
@@ -1320,20 +1308,27 @@ bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
 	 * valid context is currently resident.
 	 */
 	if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) {
-		return false;
+		return NULL;
 	}
 
-	curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx,
-			&curr_gr_tsgid);
+	curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx, tsgid);
 
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
-		  "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
-		  " ch->chid=%d",
-		  (curr_ch != NULL) ? curr_ch->chid : U32_MAX,
-		  curr_gr_tsgid,
-		  ch->tsgid,
-		  ch->chid);
+		  "curr_gr_chid=%d curr_tsgid=%d",
+		  (curr_ch != NULL) ? curr_ch->chid : U32_MAX, *tsgid);
+	return curr_ch;
+}
+
+bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
+{
+	u32 curr_gr_tsgid;
+	struct gk20a *g = ch->g;
+	struct nvgpu_channel *curr_ch;
+	bool ret = false;
+	struct nvgpu_tsg *tsg;
+
+	curr_ch = gk20a_get_resident_ctx(g, &curr_gr_tsgid);
 
 	if (curr_ch == NULL) {
 		return false;
 	}
@@ -1351,13 +1346,33 @@ bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
 	return ret;
 }
 
-static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
+static bool gk20a_is_tsg_ctx_resident(struct nvgpu_tsg *tsg)
+{
+	u32 curr_gr_tsgid;
+	struct gk20a *g = tsg->g;
+	struct nvgpu_channel *curr_ch;
+	bool ret = false;
+
+	curr_ch = gk20a_get_resident_ctx(g, &curr_gr_tsgid);
+	if (curr_ch == NULL) {
+		return false;
+	}
+
+	if ((tsg->tsgid == curr_gr_tsgid) &&
+	    (tsg->tsgid == curr_ch->tsgid)) {
+		ret = true;
+	}
+
+	nvgpu_channel_put(curr_ch);
+	return ret;
+}
+
+static int gr_exec_ctx_ops(struct nvgpu_tsg *tsg,
			   struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
			   u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
-			   bool ch_is_curr_ctx)
+			   bool ctx_resident)
 {
-	struct gk20a *g = ch->g;
-	struct nvgpu_tsg *tsg;
+	struct gk20a *g = tsg->g;
 	struct nvgpu_gr_ctx *gr_ctx;
 	bool gr_ctx_ready = false;
 	bool pm_ctx_ready = false;
@@ -1376,14 +1391,9 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
 		  num_ctx_wr_ops, num_ctx_rd_ops);
 
-	tsg = nvgpu_tsg_from_ch(ch);
-	if (tsg == NULL) {
-		return -EINVAL;
-	}
-
 	gr_ctx = tsg->gr_ctx;
 
-	if (ch_is_curr_ctx) {
+	if (ctx_resident) {
 		for (pass = 0; pass < 2; pass++) {
 			ctx_op_nr = 0;
 			for (i = 0; i < num_ops; ++i) {
@@ -1549,10 +1559,11 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
 							offsets[j] + 4U, v);
 				}
 
-				if (current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx)) {
+				if (current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx) &&
+				    g->ops.gr.ctx_patch_smpc != NULL) {
 					/* check to see if we need to add a special
 					   WAR for some of the SMPC perf regs */
-					gr_gk20a_ctx_patch_smpc(g, ch,
+					g->ops.gr.ctx_patch_smpc(g,
 							offset_addrs[j],
 							v, gr_ctx);
 				}
@@ -1591,14 +1602,14 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
 	return err;
 }
 
-int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch,
+int gr_gk20a_exec_ctx_ops(struct nvgpu_tsg *tsg,
			  struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
			  u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
-			  bool *is_curr_ctx)
+			  u32 *flags)
 {
-	struct gk20a *g = ch->g;
+	struct gk20a *g = tsg->g;
 	int err, tmp_err;
-	bool ch_is_curr_ctx;
+	bool ctx_resident;
 
 	/* disable channel switching.
 	 * at that point the hardware state can be inspected to
@@ -1611,15 +1622,16 @@ int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch,
 		return err;
 	}
 
-	ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
-	if (is_curr_ctx != NULL) {
-		*is_curr_ctx = ch_is_curr_ctx;
+	ctx_resident = gk20a_is_tsg_ctx_resident(tsg);
+	if (ctx_resident) {
+		*flags |= NVGPU_REG_OP_FLAG_DIRECT_OPS;
 	}
 
-	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
-		  ch_is_curr_ctx);
-	err = gr_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
-			      num_ctx_rd_ops, ch_is_curr_ctx);
+	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
+		  ctx_resident);
+
+	err = gr_exec_ctx_ops(tsg, ctx_ops, num_ops, num_ctx_wr_ops,
+			      num_ctx_rd_ops, ctx_resident);
 
 	tmp_err = nvgpu_gr_enable_ctxsw(g);
 	if (tmp_err != 0) {
@@ -1865,6 +1877,12 @@ int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 	u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g);
+	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
+
+	if (tsg == NULL) {
+		return -EINVAL;
+	}
 
 	ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops));
 	if (ops == NULL) {
@@ -1910,7 +1928,7 @@ int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
 		i++;
 	}
 
-	err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
+	err = gr_gk20a_exec_ctx_ops(tsg, ops, i, i, 0, &flags);
 	if (err != 0) {
 		nvgpu_err(g, "Failed to access register");
 	}
diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h
index 607a6e030..b1fe5ff53 100644
--- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h
@@ -34,15 +34,16 @@ struct nvgpu_tsg;
 struct nvgpu_warpstate;
 struct dbg_session_gk20a;
 struct nvgpu_dbg_reg_op;
+struct nvgpu_gr_ctx;
 enum ctxsw_addr_type;
 
 /* sm */
 bool gk20a_gr_sm_debugger_attached(struct gk20a *g);
-int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch,
+int gr_gk20a_exec_ctx_ops(struct nvgpu_tsg *tsg,
			  struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
			  u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
-			  bool *is_curr_ctx);
+			  u32 *flags);
 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
				    u32 addr, u32 max_offsets,
				    u32 *offsets, u32 *offset_addrs,
@@ -57,6 +58,9 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
 int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
				    struct nvgpu_tsg *tsg,
				    u64 gpu_va, u32 mode);
+int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
+			    u32 addr, u32 data,
+			    struct nvgpu_gr_ctx *gr_ctx);
 void gk20a_gr_resume_single_sm(struct gk20a *g,
			       u32 gpc, u32 tpc, u32 sm);
 void gk20a_gr_resume_all_sms(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c
index 7e36267aa..39228be31 100644
--- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c
@@ -555,12 +555,13 @@ int gm20b_gr_set_mmu_debug_mode(struct gk20a *g,
 	};
 	int err;
 	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 
 	if (tsg == NULL) {
 		return enable ? -EINVAL : 0;
 	}
 
-	err = gr_gk20a_exec_ctx_ops(ch, &ctx_ops, 1, 1, 0, NULL);
+	err = gr_gk20a_exec_ctx_ops(tsg, &ctx_ops, 1, 1, 0, &flags);
 	if (err != 0) {
 		nvgpu_err(g, "update MMU debug mode failed");
 	}
diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c
index bf826382e..db99fcc00 100644
--- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c
@@ -867,14 +867,13 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 	unsigned int i = 0, sm_id;
 	u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g);
 	int err;
-#ifdef CONFIG_NVGPU_SM_DIVERSITY
 	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 
 	if (tsg == NULL) {
 		nvgpu_err(g, "gv11b_gr_set_sm_debug_mode failed=>tsg NULL");
 		return -EINVAL;
 	}
-#endif
 
 	ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops));
 	if (ops == NULL) {
@@ -945,7 +944,7 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 		i++;
 	}
 
-	err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
+	err = gr_gk20a_exec_ctx_ops(tsg, ops, i, i, 0, &flags);
 	if (err != 0) {
 		nvgpu_err(g, "Failed to access register");
 	}
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
index 7d692bf2a..36f2ce437 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
@@ -212,6 +212,7 @@ static const struct gpu_ops gm20b_ops = {
 		.bpt_reg_info = gr_gm20b_bpt_reg_info,
 		.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
 		.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
+		.ctx_patch_smpc = gr_gk20a_ctx_patch_smpc,
 		.set_mmu_debug_mode = gm20b_gr_set_mmu_debug_mode,
 		.clear_sm_error_state = gm20b_gr_clear_sm_error_state,
 		.suspend_contexts = gr_gk20a_suspend_contexts,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 2b1ecb7e7..b3117d5c5 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -422,11 +422,10 @@ struct gpu_ops {
 #ifdef CONFIG_NVGPU_DEBUGGER
 	struct {
 		int (*exec_regops)(struct gk20a *g,
-				   struct nvgpu_channel *ch,
-				   struct nvgpu_dbg_reg_op *ops,
-				   u32 num_ops,
-				   bool is_profiler,
-				   bool *is_current_ctx);
+				   struct nvgpu_tsg *tsg,
+				   struct nvgpu_dbg_reg_op *ops,
+				   u32 num_ops,
+				   u32 *flags);
 		const struct regop_offset_range* (
				*get_global_whitelist_ranges)(void);
 		u64 (*get_global_whitelist_ranges_count)(void);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
index 3395cd5d9..6d652225b 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
@@ -1109,6 +1109,9 @@ struct gops_gr {
			struct nvgpu_tsg *tsg,
			u64 gpu_va, u32 mode);
+	int (*ctx_patch_smpc)(struct gk20a *g,
+			u32 addr, u32 data,
+			struct nvgpu_gr_ctx *gr_ctx);
 	void (*init_hwpm_pmm_register)(struct gk20a *g);
 	void (*get_num_hwpm_perfmon)(struct gk20a *g,
			u32 *num_sys_perfmon, u32 *num_fbp_perfmon,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/regops.h b/drivers/gpu/nvgpu/include/nvgpu/regops.h
index dad812e89..d60162fed 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/regops.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/regops.h
@@ -26,6 +26,11 @@
 
 #ifdef CONFIG_NVGPU_DEBUGGER
 
+#include
+
+struct gk20a;
+struct nvgpu_tsg;
+
 /*
  * Register operations
  * All operations are targeted towards first channel
@@ -57,6 +62,11 @@
 #define NVGPU_DBG_REG_OP_STATUS_UNSUPPORTED_OP 0x00000008U
 #define NVGPU_DBG_REG_OP_STATUS_INVALID_MASK 0x00000010U
 
+#define NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE BIT32(1U)
+#define NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR BIT32(2U)
+#define NVGPU_REG_OP_FLAG_ALL_PASSED BIT32(3U)
+#define NVGPU_REG_OP_FLAG_DIRECT_OPS BIT32(4U)
+
 struct nvgpu_dbg_reg_op {
 	u8 op;
 	u8 type;
@@ -77,11 +87,10 @@ struct regop_offset_range {
 };
 
 int exec_regops_gk20a(struct gk20a *g,
-		      struct nvgpu_channel *ch,
+		      struct nvgpu_tsg *tsg,
		      struct nvgpu_dbg_reg_op *ops,
		      u32 num_ops,
-		      bool is_profiler,
-		      bool *is_current_ctx);
+		      u32 *flags);
 
 /* turn seriously unwieldy names -> something shorter */
 #define REGOP(x) NVGPU_DBG_REG_OP_##x
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 8de8e99ae..14fe9cc25 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -328,10 +328,9 @@ struct tegra_vgpu_reg_op {
 };
 
 struct tegra_vgpu_reg_ops_params {
-	u64 handle;
 	u64 num_ops;
-	u32 is_profiler;
-	u8 is_current_ctx;
+	u32 tsg_id;
+	u32 flags;
 };
 
 struct tegra_vgpu_channel_priority_params {
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index a32267f60..04503267d 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -778,12 +778,10 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 {
 	int err = 0, powergate_err = 0;
 	bool is_pg_disabled = false;
-
 	struct gk20a *g = dbg_s->g;
 	struct nvgpu_channel *ch;
-
-	bool is_current_ctx = false;
-
+	struct nvgpu_tsg *tsg = NULL;
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 
 	nvgpu_log_fn(g, "%d ops, max fragment %d", args->num_ops,
		     g->dbg_regops_tmp_buf_ops);
@@ -813,6 +811,14 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 		return -EINVAL;
 	}
 
+	if (ch != NULL) {
+		tsg = nvgpu_tsg_from_ch(ch);
+		if (tsg == NULL) {
+			nvgpu_err(g, "channel not bound to TSG");
+			return -EINVAL;
+		}
+	}
+
 	/* since exec_reg_ops sends methods to the ucode, it must take the
	 * global gpu lock to protect against mixing methods from debug sessions
	 * on other channels */
@@ -869,16 +875,16 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 		if (err)
 			break;
 
-		err = g->ops.regops.exec_regops(g, ch,
-			g->dbg_regops_tmp_buf, num_ops,
-			dbg_s->is_profiler, &is_current_ctx);
+		err = g->ops.regops.exec_regops(g, tsg,
+			g->dbg_regops_tmp_buf, num_ops, &flags);
 		if (err) {
 			break;
 		}
 
 		if (ops_offset == 0) {
-			args->gr_ctx_resident = is_current_ctx;
+			args->gr_ctx_resident =
+				flags & NVGPU_REG_OP_FLAG_DIRECT_OPS;
 		}
 
 		err = nvgpu_get_regops_data_linux(g->dbg_regops_tmp_buf,
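Note on the new flags contract (illustration only, not part of the patch): the old is_profiler/is_current_ctx pair is replaced by a single u32 passed through exec_regops. The caller sets a mode bit going in (NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE, as ioctl_dbg.c and the SM/MMU debug-mode helpers do, or NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR), and status bits come back in the same word: NVGPU_REG_OP_FLAG_ALL_PASSED when every op validated in continue-on-error mode, and NVGPU_REG_OP_FLAG_DIRECT_OPS when the TSG context was resident and the ops were applied directly. The sketch below is a self-contained, user-space mock of that handshake; request_reg_ops() is a hypothetical stand-in for g->ops.regops.exec_regops(g, tsg, ops, num_ops, &flags), and BIT32() is assumed to expand to a 32-bit shift as in the nvgpu headers.

/* Standalone mock of the NVGPU_REG_OP_FLAG_* handshake introduced above.
 * Flag names mirror include/nvgpu/regops.h; request_reg_ops() is a
 * hypothetical stand-in for the driver call and simply pretends the ops
 * ran against the resident context and all passed. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT32(i)					(1U << (i))
#define NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE		BIT32(1U)
#define NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR	BIT32(2U)
#define NVGPU_REG_OP_FLAG_ALL_PASSED			BIT32(3U)
#define NVGPU_REG_OP_FLAG_DIRECT_OPS			BIT32(4U)

static int request_reg_ops(uint32_t *flags)
{
	bool continue_on_error =
		(*flags & NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR) != 0U;

	/* In continue-on-error mode the callee reports whether every op
	 * validated; in all-or-none mode a failure is returned instead. */
	if (continue_on_error) {
		*flags |= NVGPU_REG_OP_FLAG_ALL_PASSED;
	}
	/* Set when the ops were applied to the currently resident context. */
	*flags |= NVGPU_REG_OP_FLAG_DIRECT_OPS;
	return 0;
}

int main(void)
{
	/* Caller picks the validation mode before the call. */
	uint32_t flags = NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR;

	if (request_reg_ops(&flags) != 0) {
		return 1;
	}

	/* On return the same word carries the results. */
	printf("ctx resident (direct ops): %d\n",
	       (flags & NVGPU_REG_OP_FLAG_DIRECT_OPS) != 0U);
	printf("all ops passed:            %d\n",
	       (flags & NVGPU_REG_OP_FLAG_ALL_PASSED) != 0U);
	return 0;
}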