From 6daa0636d1292e8533dfeb3ee0e38d2637c948a1 Mon Sep 17 00:00:00 2001
From: Deepak Nibade 
Date: Wed, 29 Jul 2020 18:25:19 +0530
Subject: [PATCH] gpu: nvgpu: rework regops execution API

Rework the regops execution API to accommodate the following updates
for the new profiler design:
- gops.regops.exec_regops() should accept a TSG pointer instead of a
  channel pointer.
- Remove the individual boolean parameters and add a single flags
  field. The following new flags are added to this API:
  NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE
  NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR
  NVGPU_REG_OP_FLAG_ALL_PASSED
  NVGPU_REG_OP_FLAG_DIRECT_OPS

Update the other APIs, e.g. gr_gk20a_exec_ctx_ops() and
validate_reg_ops(), as per the new API changes.

Add a new API gk20a_is_tsg_ctx_resident() to check context residency
from a TSG pointer.

Convert gr_gk20a_ctx_patch_smpc() to a HAL gops.gr.ctx_patch_smpc().
Set this HAL only for gm20b since it is not required for later chips.
Also, remove the subcontext code from this function since gm20b does
not support subcontexts.

Remove the stale comment about missing vGPU support in
exec_regops_gk20a().

Bug 2510974
Jira NVGPU-5360

Change-Id: I3c25c34277b5ca88484da1e20d459118f15da102
Signed-off-by: Deepak Nibade 
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2389733
Tested-by: mobile promotions 
Reviewed-by: mobile promotions 
---
 drivers/gpu/nvgpu/common/regops/regops.c      |  92 +++++++-------
 drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c |  17 ++-
 drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h |   9 +-
 drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c        | 114 ++++++++++--------
 drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h        |   8 +-
 drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c        |   3 +-
 drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c        |   5 +-
 drivers/gpu/nvgpu/hal/init/hal_gm20b.c        |   1 +
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h       |   9 +-
 drivers/gpu/nvgpu/include/nvgpu/gops_gr.h     |   3 +
 drivers/gpu/nvgpu/include/nvgpu/regops.h      |  15 ++-
 .../gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h |   5 +-
 drivers/gpu/nvgpu/os/linux/ioctl_dbg.c        |  22 ++--
 13 files changed, 172 insertions(+), 131 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/regops/regops.c b/drivers/gpu/nvgpu/common/regops/regops.c
index 02d23cb8f..e4d74da2f 100644
--- a/drivers/gpu/nvgpu/common/regops/regops.c
+++ b/drivers/gpu/nvgpu/common/regops/regops.c
@@ -81,14 +81,14 @@ static bool validate_reg_ops(struct gk20a *g,
 			    u32 *ctx_rd_count, u32 *ctx_wr_count,
 			    struct nvgpu_dbg_reg_op *ops, u32 op_count,
-			    bool is_profiler);
+			    bool valid_ctx,
+			    u32 *flags);
 
 int exec_regops_gk20a(struct gk20a *g,
-		      struct nvgpu_channel *ch,
+		      struct nvgpu_tsg *tsg,
 		      struct nvgpu_dbg_reg_op *ops,
 		      u32 num_ops,
-		      bool is_profiler,
-		      bool *is_current_ctx)
+		      u32 *flags)
 {
 	int err = 0;
 	unsigned int i;
@@ -99,20 +99,8 @@ int exec_regops_gk20a(struct gk20a *g,
 
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
 
-	/* For vgpu, the regops routines need to be handled in the
-	 * context of the server and support for that does not exist.
-	 *
-	 * The two users of the regops interface are the compute driver
-	 * and tools. The compute driver will work without a functional
-	 * regops implementation, so we return -ENOSYS. This will allow
-	 * compute apps to run with vgpu. Tools will not work in this
-	 * configuration and are not required to work at this time.
*/ - if (g->is_virtual) { - return -ENOSYS; - } - ok = validate_reg_ops(g, &ctx_rd_count, &ctx_wr_count, - ops, num_ops, is_profiler); + ops, num_ops, tsg != NULL, flags); if (!ok) { nvgpu_err(g, "invalid op(s)"); err = -EINVAL; @@ -211,9 +199,9 @@ int exec_regops_gk20a(struct gk20a *g, } if ((ctx_wr_count | ctx_rd_count) != 0U) { - err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops, + err = gr_gk20a_exec_ctx_ops(tsg, ops, num_ops, ctx_wr_count, ctx_rd_count, - is_current_ctx); + flags); if (err != 0) { nvgpu_warn(g, "failed to perform ctx ops\n"); goto clean_up; @@ -269,7 +257,7 @@ static int validate_reg_op_info(struct nvgpu_dbg_reg_op *op) static bool check_whitelists(struct gk20a *g, struct nvgpu_dbg_reg_op *op, u32 offset, - bool is_profiler) + bool valid_ctx) { bool valid = false; @@ -283,7 +271,7 @@ static bool check_whitelists(struct gk20a *g, regop_bsearch_range_cmp) != NULL); /* if debug session, search context list */ - if ((!valid) && (!is_profiler)) { + if ((!valid) && (valid_ctx)) { /* binary search context list */ valid = (g->ops.regops.get_context_whitelist_ranges != NULL) && (nvgpu_bsearch(&offset, @@ -294,7 +282,7 @@ static bool check_whitelists(struct gk20a *g, } /* if debug session, search runcontrol list */ - if ((!valid) && (!is_profiler)) { + if ((!valid) && (valid_ctx)) { valid = (g->ops.regops.get_runcontrol_whitelist != NULL) && linear_search(offset, g->ops.regops.get_runcontrol_whitelist(), @@ -310,7 +298,7 @@ static bool check_whitelists(struct gk20a *g, regop_bsearch_range_cmp) != NULL); /* if debug session, search runcontrol list */ - if ((!valid) && (!is_profiler)) { + if ((!valid) && (valid_ctx)) { valid = (g->ops.regops.get_runcontrol_whitelist != NULL) && linear_search(offset, g->ops.regops.get_runcontrol_whitelist(), @@ -324,7 +312,7 @@ static bool check_whitelists(struct gk20a *g, /* note: the op here has already been through validate_reg_op_info */ static int validate_reg_op_offset(struct gk20a *g, struct nvgpu_dbg_reg_op *op, - bool is_profiler) + bool valid_ctx) { int err; u32 buf_offset_lo, buf_offset_addr, num_offsets, offset; @@ -340,9 +328,9 @@ static int validate_reg_op_offset(struct gk20a *g, return -EINVAL; } - valid = check_whitelists(g, op, offset, is_profiler); + valid = check_whitelists(g, op, offset, valid_ctx); if ((op->op == REGOP(READ_64) || op->op == REGOP(WRITE_64)) && valid) { - valid = check_whitelists(g, op, offset + 4U, is_profiler); + valid = check_whitelists(g, op, offset + 4U, valid_ctx); } if (valid && (op->type != REGOP(TYPE_GLOBAL))) { @@ -383,19 +371,23 @@ static bool validate_reg_ops(struct gk20a *g, u32 *ctx_rd_count, u32 *ctx_wr_count, struct nvgpu_dbg_reg_op *ops, u32 op_count, - bool is_profiler) + bool valid_ctx, + u32 *flags) { - u32 i; - bool ok = true; + bool all_or_none = (*flags) & NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE; bool gr_ctx_ops = false; + bool op_failed = false; + u32 i; /* keep going until the end so every op can get * a separate error code if needed */ for (i = 0; i < op_count; i++) { if (validate_reg_op_info(&ops[i]) != 0) { - ok = false; - break; + op_failed = true; + if (all_or_none) { + break; + } } if (reg_op_is_gr_ctx(ops[i].type)) { @@ -408,28 +400,42 @@ static bool validate_reg_ops(struct gk20a *g, gr_ctx_ops = true; } - /* context operations are not valid on profiler session */ - if (gr_ctx_ops && is_profiler) { - ok = false; - break; + /* context operations need valid context */ + if (gr_ctx_ops && !valid_ctx) { + op_failed = true; + if (all_or_none) { + break; + } } /* if "allow_all" flag enabled, 
dont validate offset */ if (!g->allow_all) { - if (validate_reg_op_offset(g, &ops[i], - is_profiler) != 0) { - ok = false; - break; + if (validate_reg_op_offset(g, &ops[i], valid_ctx) != 0) { + op_failed = true; + if (all_or_none) { + break; + } } } } - if (ok) { - nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d", - *ctx_wr_count, *ctx_rd_count); + nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d", + *ctx_wr_count, *ctx_rd_count); + + if (all_or_none) { + if (op_failed) { + return false; + } else { + return true; + } } - return ok; + /* Continue on error */ + if (!op_failed) { + *flags |= NVGPU_REG_OP_FLAG_ALL_PASSED; + } + + return true; } /* exported for tools like cyclestats, etc */ diff --git a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c index b8b851ba9..2f3d9f754 100644 --- a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c +++ b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.c @@ -35,11 +35,10 @@ #include "common/vgpu/ivc/comm_vgpu.h" int vgpu_exec_regops(struct gk20a *g, - struct nvgpu_channel *ch, - struct nvgpu_dbg_reg_op *ops, - u32 num_ops, - bool is_profiler, - bool *is_current_ctx) + struct nvgpu_tsg *tsg, + struct nvgpu_dbg_reg_op *ops, + u32 num_ops, + u32 *flags) { struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_reg_ops_params *p = &msg.params.reg_ops; @@ -68,17 +67,15 @@ int vgpu_exec_regops(struct gk20a *g, msg.cmd = TEGRA_VGPU_CMD_REG_OPS; msg.handle = vgpu_get_handle(g); - p->handle = ch ? ch->virt_ctx : 0; + p->tsg_id = tsg ? tsg->tsgid : U32_MAX; p->num_ops = num_ops; - p->is_profiler = is_profiler; + p->flags = *flags; err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); err = err ? err : msg.ret; if (err == 0) { nvgpu_memcpy((u8 *)ops, (u8 *)oob, ops_size); - if (is_current_ctx != NULL) { - *is_current_ctx = p->is_current_ctx != 0u; - } } + *flags = p->flags; fail: vgpu_ivc_oob_put_ptr(handle); diff --git a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h index 439ff0788..ebe274c8e 100644 --- a/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h +++ b/drivers/gpu/nvgpu/common/vgpu/debugger_vgpu.h @@ -30,11 +30,10 @@ struct gk20a; struct nvgpu_channel; int vgpu_exec_regops(struct gk20a *g, - struct nvgpu_channel *ch, - struct nvgpu_dbg_reg_op *ops, - u32 num_ops, - bool is_profiler, - bool *is_current_ctx); + struct nvgpu_tsg *tsg, + struct nvgpu_dbg_reg_op *ops, + u32 num_ops, + u32 *flags); int vgpu_dbg_set_powergate(struct dbg_session_gk20a *dbg_s, bool disable_powergate); diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c index d46c2ce49..965bd6f81 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c @@ -625,8 +625,7 @@ void gk20a_gr_init_ovr_sm_dsm_perf(void) * which makes it impossible to know externally whether a ctx * write will actually occur. 
so later we should put a lazy, * map-and-hold system in the patch write state */ -static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, - struct nvgpu_channel *ch, +int gr_gk20a_ctx_patch_smpc(struct gk20a *g, u32 addr, u32 data, struct nvgpu_gr_ctx *gr_ctx) { @@ -663,15 +662,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, nvgpu_gr_ctx_patch_write(g, gr_ctx, addr, data, true); - if (ch->subctx != NULL) { - nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx, - false); - nvgpu_gr_subctx_set_patch_ctx(g, - ch->subctx, gr_ctx); - } else { - nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx, + nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx, true); - } /* we're not caching these on cpu side, but later watch for it */ @@ -1303,14 +1295,10 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, return -EINVAL; } -bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch) +static struct nvgpu_channel *gk20a_get_resident_ctx(struct gk20a *g, u32 *tsgid) { u32 curr_gr_ctx; - u32 curr_gr_tsgid; - struct gk20a *g = ch->g; struct nvgpu_channel *curr_ch; - bool ret = false; - struct nvgpu_tsg *tsg; curr_gr_ctx = g->ops.gr.falcon.get_current_ctx(g); @@ -1320,20 +1308,27 @@ bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch) * valid context is currently resident. */ if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) { - return false; + return NULL; } - curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx, - &curr_gr_tsgid); + curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx, tsgid); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, - "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" - " ch->chid=%d", - (curr_ch != NULL) ? curr_ch->chid : U32_MAX, - curr_gr_tsgid, - ch->tsgid, - ch->chid); + "curr_gr_chid=%d curr_tsgid=%d", + (curr_ch != NULL) ? curr_ch->chid : U32_MAX, *tsgid); + return curr_ch; +} + +bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch) +{ + u32 curr_gr_tsgid; + struct gk20a *g = ch->g; + struct nvgpu_channel *curr_ch; + bool ret = false; + struct nvgpu_tsg *tsg; + + curr_ch = gk20a_get_resident_ctx(g, &curr_gr_tsgid); if (curr_ch == NULL) { return false; } @@ -1351,13 +1346,33 @@ bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch) return ret; } -static int gr_exec_ctx_ops(struct nvgpu_channel *ch, +static bool gk20a_is_tsg_ctx_resident(struct nvgpu_tsg *tsg) +{ + u32 curr_gr_tsgid; + struct gk20a *g = tsg->g; + struct nvgpu_channel *curr_ch; + bool ret = false; + + curr_ch = gk20a_get_resident_ctx(g, &curr_gr_tsgid); + if (curr_ch == NULL) { + return false; + } + + if ((tsg->tsgid == curr_gr_tsgid) && + (tsg->tsgid == curr_ch->tsgid)) { + ret = true; + } + + nvgpu_channel_put(curr_ch); + return ret; +} + +static int gr_exec_ctx_ops(struct nvgpu_tsg *tsg, struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops, u32 num_ctx_wr_ops, u32 num_ctx_rd_ops, - bool ch_is_curr_ctx) + bool ctx_resident) { - struct gk20a *g = ch->g; - struct nvgpu_tsg *tsg; + struct gk20a *g = tsg->g; struct nvgpu_gr_ctx *gr_ctx; bool gr_ctx_ready = false; bool pm_ctx_ready = false; @@ -1376,14 +1391,9 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d", num_ctx_wr_ops, num_ctx_rd_ops); - tsg = nvgpu_tsg_from_ch(ch); - if (tsg == NULL) { - return -EINVAL; - } - gr_ctx = tsg->gr_ctx; - if (ch_is_curr_ctx) { + if (ctx_resident) { for (pass = 0; pass < 2; pass++) { ctx_op_nr = 0; for (i = 0; i < num_ops; ++i) { @@ -1549,10 +1559,11 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch, offsets[j] + 4U, v); } - if (current_mem == 
nvgpu_gr_ctx_get_ctx_mem(gr_ctx)) { + if (current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx) && + g->ops.gr.ctx_patch_smpc != NULL) { /* check to see if we need to add a special WAR for some of the SMPC perf regs */ - gr_gk20a_ctx_patch_smpc(g, ch, + g->ops.gr.ctx_patch_smpc(g, offset_addrs[j], v, gr_ctx); } @@ -1591,14 +1602,14 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch, return err; } -int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch, +int gr_gk20a_exec_ctx_ops(struct nvgpu_tsg *tsg, struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops, u32 num_ctx_wr_ops, u32 num_ctx_rd_ops, - bool *is_curr_ctx) + u32 *flags) { - struct gk20a *g = ch->g; + struct gk20a *g = tsg->g; int err, tmp_err; - bool ch_is_curr_ctx; + bool ctx_resident; /* disable channel switching. * at that point the hardware state can be inspected to @@ -1611,15 +1622,16 @@ int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch, return err; } - ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch); - if (is_curr_ctx != NULL) { - *is_curr_ctx = ch_is_curr_ctx; + ctx_resident = gk20a_is_tsg_ctx_resident(tsg); + if (ctx_resident) { + *flags |= NVGPU_REG_OP_FLAG_DIRECT_OPS; } - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", - ch_is_curr_ctx); - err = gr_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops, - num_ctx_rd_ops, ch_is_curr_ctx); + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", + ctx_resident); + + err = gr_exec_ctx_ops(tsg, ctx_ops, num_ops, num_ctx_wr_ops, + num_ctx_rd_ops, ctx_resident); tmp_err = nvgpu_gr_enable_ctxsw(g); if (tmp_err != 0) { @@ -1865,6 +1877,12 @@ int gr_gk20a_set_sm_debug_mode(struct gk20a *g, u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g); + struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE; + + if (tsg == NULL) { + return -EINVAL; + } ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops)); if (ops == NULL) { @@ -1910,7 +1928,7 @@ int gr_gk20a_set_sm_debug_mode(struct gk20a *g, i++; } - err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL); + err = gr_gk20a_exec_ctx_ops(tsg, ops, i, i, 0, &flags); if (err != 0) { nvgpu_err(g, "Failed to access register"); } diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h index 607a6e030..b1fe5ff53 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h @@ -34,15 +34,16 @@ struct nvgpu_tsg; struct nvgpu_warpstate; struct dbg_session_gk20a; struct nvgpu_dbg_reg_op; +struct nvgpu_gr_ctx; enum ctxsw_addr_type; /* sm */ bool gk20a_gr_sm_debugger_attached(struct gk20a *g); -int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch, +int gr_gk20a_exec_ctx_ops(struct nvgpu_tsg *tsg, struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops, u32 num_ctx_wr_ops, u32 num_ctx_rd_ops, - bool *is_curr_ctx); + u32 *flags); int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, u32 addr, u32 max_offsets, u32 *offsets, u32 *offset_addrs, @@ -57,6 +58,9 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, struct nvgpu_tsg *tsg, u64 gpu_va, u32 mode); +int gr_gk20a_ctx_patch_smpc(struct gk20a *g, + u32 addr, u32 data, + struct nvgpu_gr_ctx *gr_ctx); void gk20a_gr_resume_single_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm); void gk20a_gr_resume_all_sms(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c index 
7e36267aa..39228be31 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c @@ -555,12 +555,13 @@ int gm20b_gr_set_mmu_debug_mode(struct gk20a *g, }; int err; struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE; if (tsg == NULL) { return enable ? -EINVAL : 0; } - err = gr_gk20a_exec_ctx_ops(ch, &ctx_ops, 1, 1, 0, NULL); + err = gr_gk20a_exec_ctx_ops(tsg, &ctx_ops, 1, 1, 0, &flags); if (err != 0) { nvgpu_err(g, "update MMU debug mode failed"); } diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c index bf826382e..db99fcc00 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c @@ -867,14 +867,13 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g, unsigned int i = 0, sm_id; u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g); int err; -#ifdef CONFIG_NVGPU_SM_DIVERSITY struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch); + u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE; if (tsg == NULL) { nvgpu_err(g, "gv11b_gr_set_sm_debug_mode failed=>tsg NULL"); return -EINVAL; } -#endif ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops)); if (ops == NULL) { @@ -945,7 +944,7 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g, i++; } - err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL); + err = gr_gk20a_exec_ctx_ops(tsg, ops, i, i, 0, &flags); if (err != 0) { nvgpu_err(g, "Failed to access register"); } diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 7d692bf2a..36f2ce437 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -212,6 +212,7 @@ static const struct gpu_ops gm20b_ops = { .bpt_reg_info = gr_gm20b_bpt_reg_info, .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, + .ctx_patch_smpc = gr_gk20a_ctx_patch_smpc, .set_mmu_debug_mode = gm20b_gr_set_mmu_debug_mode, .clear_sm_error_state = gm20b_gr_clear_sm_error_state, .suspend_contexts = gr_gk20a_suspend_contexts, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 2b1ecb7e7..b3117d5c5 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -422,11 +422,10 @@ struct gpu_ops { #ifdef CONFIG_NVGPU_DEBUGGER struct { int (*exec_regops)(struct gk20a *g, - struct nvgpu_channel *ch, - struct nvgpu_dbg_reg_op *ops, - u32 num_ops, - bool is_profiler, - bool *is_current_ctx); + struct nvgpu_tsg *tsg, + struct nvgpu_dbg_reg_op *ops, + u32 num_ops, + u32 *flags); const struct regop_offset_range* ( *get_global_whitelist_ranges)(void); u64 (*get_global_whitelist_ranges_count)(void); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h index 3395cd5d9..6d652225b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h @@ -1109,6 +1109,9 @@ struct gops_gr { struct nvgpu_tsg *tsg, u64 gpu_va, u32 mode); + int (*ctx_patch_smpc)(struct gk20a *g, + u32 addr, u32 data, + struct nvgpu_gr_ctx *gr_ctx); void (*init_hwpm_pmm_register)(struct gk20a *g); void (*get_num_hwpm_perfmon)(struct gk20a *g, u32 *num_sys_perfmon, u32 *num_fbp_perfmon, diff --git a/drivers/gpu/nvgpu/include/nvgpu/regops.h b/drivers/gpu/nvgpu/include/nvgpu/regops.h index dad812e89..d60162fed 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/regops.h +++ b/drivers/gpu/nvgpu/include/nvgpu/regops.h @@ -26,6 +26,11 @@ #ifdef 
CONFIG_NVGPU_DEBUGGER
 
+#include <nvgpu/types.h>
+
+struct gk20a;
+struct nvgpu_tsg;
+
 /*
  * Register operations
  * All operations are targeted towards first channel
@@ -57,6 +62,11 @@
 #define NVGPU_DBG_REG_OP_STATUS_UNSUPPORTED_OP   0x00000008U
 #define NVGPU_DBG_REG_OP_STATUS_INVALID_MASK     0x00000010U
 
+#define NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE		BIT32(1U)
+#define NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR	BIT32(2U)
+#define NVGPU_REG_OP_FLAG_ALL_PASSED			BIT32(3U)
+#define NVGPU_REG_OP_FLAG_DIRECT_OPS			BIT32(4U)
+
 struct nvgpu_dbg_reg_op {
 	u8    op;
 	u8    type;
@@ -77,11 +87,10 @@ struct regop_offset_range {
 };
 
 int exec_regops_gk20a(struct gk20a *g,
-		      struct nvgpu_channel *ch,
+		      struct nvgpu_tsg *tsg,
 		      struct nvgpu_dbg_reg_op *ops,
 		      u32 num_ops,
-		      bool is_profiler,
-		      bool *is_current_ctx);
+		      u32 *flags);
 
 /* turn seriously unwieldy names -> something shorter */
 #define REGOP(x) NVGPU_DBG_REG_OP_##x
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 8de8e99ae..14fe9cc25 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -328,10 +328,9 @@ struct tegra_vgpu_reg_op {
 };
 
 struct tegra_vgpu_reg_ops_params {
-	u64 handle;
 	u64 num_ops;
-	u32 is_profiler;
-	u8 is_current_ctx;
+	u32 tsg_id;
+	u32 flags;
 };
 
 struct tegra_vgpu_channel_priority_params {
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index a32267f60..04503267d 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -778,12 +778,10 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 {
 	int err = 0, powergate_err = 0;
 	bool is_pg_disabled = false;
-
 	struct gk20a *g = dbg_s->g;
 	struct nvgpu_channel *ch;
-
-	bool is_current_ctx = false;
-
+	struct nvgpu_tsg *tsg = NULL;
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 	nvgpu_log_fn(g, "%d ops, max fragment %d", args->num_ops,
 		g->dbg_regops_tmp_buf_ops);
 
@@ -813,6 +811,14 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 		return -EINVAL;
 	}
 
+	if (ch != NULL) {
+		tsg = nvgpu_tsg_from_ch(ch);
+		if (tsg == NULL) {
+			nvgpu_err(g, "channel not bound to TSG");
+			return -EINVAL;
+		}
+	}
+
 	/* since exec_reg_ops sends methods to the ucode, it must take the
 	 * global gpu lock to protect against mixing methods from debug sessions
 	 * on other channels */
@@ -869,16 +875,16 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 			if (err)
 				break;
 
-			err = g->ops.regops.exec_regops(g, ch,
-				g->dbg_regops_tmp_buf, num_ops,
-				dbg_s->is_profiler, &is_current_ctx);
+			err = g->ops.regops.exec_regops(g, tsg,
+				g->dbg_regops_tmp_buf, num_ops, &flags);
 			if (err) {
 				break;
 			}
 
 			if (ops_offset == 0) {
-				args->gr_ctx_resident = is_current_ctx;
+				args->gr_ctx_resident =
+					flags & NVGPU_REG_OP_FLAG_DIRECT_OPS;
 			}
 
 			err = nvgpu_get_regops_data_linux(g->dbg_regops_tmp_buf,
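
Editor's note (illustration only, not part of the change): the sketch
below shows how a caller might drive the reworked
g->ops.regops.exec_regops() through the new flags word, mirroring the
ioctl_dbg.c hunk above. The helper name demo_exec_regops and the log
messages are hypothetical; the flag names and the call signature come
from this patch, and the usual nvgpu headers (<nvgpu/gk20a.h>,
<nvgpu/regops.h>, <nvgpu/log.h>) are assumed. On input the flags word
selects ALL_OR_NONE or CONTINUE_ON_ERROR validation; on output the
implementation may set ALL_PASSED (continue-on-error mode only) and
DIRECT_OPS (set when the TSG context was resident and the ops were
executed directly).

static int demo_exec_regops(struct gk20a *g, struct nvgpu_tsg *tsg,
			    struct nvgpu_dbg_reg_op *ops, u32 num_ops)
{
	/* In: execution mode. Out: result bits set by the callee. */
	u32 flags = NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR;
	int err;

	err = g->ops.regops.exec_regops(g, tsg, ops, num_ops, &flags);
	if (err != 0) {
		return err;
	}

	/* DIRECT_OPS: the TSG context was resident, so context ops were
	 * applied directly to the hardware rather than patched into the
	 * context image. */
	if ((flags & NVGPU_REG_OP_FLAG_DIRECT_OPS) != 0U) {
		nvgpu_log_info(g, "ops executed on resident context");
	}

	/* ALL_PASSED is only reported in CONTINUE_ON_ERROR mode. */
	if ((flags & NVGPU_REG_OP_FLAG_ALL_PASSED) == 0U) {
		nvgpu_log_info(g, "some ops failed; check per-op status");
	}

	return 0;
}

The two modes differ in failure handling: in ALL_OR_NONE mode
validate_reg_ops() stops at the first invalid op and
exec_regops_gk20a() rejects the whole batch with -EINVAL, while in
CONTINUE_ON_ERROR mode every op is validated and marked individually
and ALL_PASSED summarizes whether all of them passed.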