Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git (synced 2025-12-22 17:36:20 +03:00)
gpu: nvgpu: rework regops execution API
Rework the regops execution API to accommodate the following updates for
the new profiler design:
- gops.regops.exec_regops() now accepts a TSG pointer instead of a
  channel pointer.
- The individual boolean parameters are removed and replaced with a
  single flags field. The new flags added to this API are:
  NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE
  NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR
  NVGPU_REG_OP_FLAG_ALL_PASSED
  NVGPU_REG_OP_FLAG_DIRECT_OPS

Update the other APIs, e.g. gr_gk20a_exec_ctx_ops() and
validate_reg_ops(), to match the new API.

Add a new API gk20a_is_tsg_ctx_resident() to check context residency
from a TSG pointer.

Convert gr_gk20a_ctx_patch_smpc() to a HAL, gops.gr.ctx_patch_smpc().
Set this HAL only for gm20b since it is not required for later chips.
Also remove the subcontext code from this function since gm20b does not
support subcontexts.

Remove the stale comment about missing vGPU support in
exec_regops_gk20a().

Bug 2510974
Jira NVGPU-5360

Change-Id: I3c25c34277b5ca88484da1e20d459118f15da102
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2389733
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Committed by: Alex Waterman
Parent: a73b5d3c6f
Commit: 6daa0636d1
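For illustration, a minimal sketch of how a caller might drive the reworked,
flags-based API. The helper function below is hypothetical; the exec_regops
signature, the flag names, and the meaning of NVGPU_REG_OP_FLAG_DIRECT_OPS are
taken from the diff below, which uses the same pattern in
nvgpu_ioctl_channel_reg_ops().

/*
 * Illustrative sketch only: execute a batch of regops against a TSG
 * using the reworked API. The helper name is hypothetical.
 */
static int example_exec_regops(struct gk20a *g, struct nvgpu_tsg *tsg,
			       struct nvgpu_dbg_reg_op *ops, u32 num_ops,
			       bool *ctx_resident)
{
	/* Seed the in/out flags word with the desired validation mode. */
	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
	int err;

	/* The TSG pointer replaces the old channel pointer; the boolean
	 * is_profiler/is_current_ctx parameters are gone. */
	err = g->ops.regops.exec_regops(g, tsg, ops, num_ops, &flags);
	if (err != 0) {
		nvgpu_err(g, "regops execution failed");
		return err;
	}

	/* On return the flags word reports how the ops were handled:
	 * DIRECT_OPS means the context was resident and the ops were
	 * applied directly. */
	*ctx_resident = (flags & NVGPU_REG_OP_FLAG_DIRECT_OPS) != 0U;

	return 0;
}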
@@ -81,14 +81,14 @@ static bool validate_reg_ops(struct gk20a *g,
 			      u32 *ctx_rd_count, u32 *ctx_wr_count,
 			      struct nvgpu_dbg_reg_op *ops,
 			      u32 op_count,
-			      bool is_profiler);
+			      bool valid_ctx,
+			      u32 *flags);
 
 int exec_regops_gk20a(struct gk20a *g,
-		      struct nvgpu_channel *ch,
+		      struct nvgpu_tsg *tsg,
 		      struct nvgpu_dbg_reg_op *ops,
 		      u32 num_ops,
-		      bool is_profiler,
-		      bool *is_current_ctx)
+		      u32 *flags)
 {
 	int err = 0;
 	unsigned int i;
@@ -99,20 +99,8 @@ int exec_regops_gk20a(struct gk20a *g,
 
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
 
-	/* For vgpu, the regops routines need to be handled in the
-	 * context of the server and support for that does not exist.
-	 *
-	 * The two users of the regops interface are the compute driver
-	 * and tools. The compute driver will work without a functional
-	 * regops implementation, so we return -ENOSYS. This will allow
-	 * compute apps to run with vgpu. Tools will not work in this
-	 * configuration and are not required to work at this time. */
-	if (g->is_virtual) {
-		return -ENOSYS;
-	}
-
 	ok = validate_reg_ops(g, &ctx_rd_count, &ctx_wr_count,
-			      ops, num_ops, is_profiler);
+			      ops, num_ops, tsg != NULL, flags);
 	if (!ok) {
 		nvgpu_err(g, "invalid op(s)");
 		err = -EINVAL;
@@ -211,9 +199,9 @@ int exec_regops_gk20a(struct gk20a *g,
 	}
 
 	if ((ctx_wr_count | ctx_rd_count) != 0U) {
-		err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops,
+		err = gr_gk20a_exec_ctx_ops(tsg, ops, num_ops,
 					    ctx_wr_count, ctx_rd_count,
-					    is_current_ctx);
+					    flags);
 		if (err != 0) {
 			nvgpu_warn(g, "failed to perform ctx ops\n");
 			goto clean_up;
@@ -269,7 +257,7 @@ static int validate_reg_op_info(struct nvgpu_dbg_reg_op *op)
 static bool check_whitelists(struct gk20a *g,
 			     struct nvgpu_dbg_reg_op *op,
 			     u32 offset,
-			     bool is_profiler)
+			     bool valid_ctx)
 {
 	bool valid = false;
 
@@ -283,7 +271,7 @@ static bool check_whitelists(struct gk20a *g,
 			regop_bsearch_range_cmp) != NULL);
 
 	/* if debug session, search context list */
-	if ((!valid) && (!is_profiler)) {
+	if ((!valid) && (valid_ctx)) {
 		/* binary search context list */
 		valid = (g->ops.regops.get_context_whitelist_ranges != NULL) &&
 			(nvgpu_bsearch(&offset,
@@ -294,7 +282,7 @@ static bool check_whitelists(struct gk20a *g,
 	}
 
 	/* if debug session, search runcontrol list */
-	if ((!valid) && (!is_profiler)) {
+	if ((!valid) && (valid_ctx)) {
 		valid = (g->ops.regops.get_runcontrol_whitelist != NULL) &&
 			linear_search(offset,
 				g->ops.regops.get_runcontrol_whitelist(),
@@ -310,7 +298,7 @@ static bool check_whitelists(struct gk20a *g,
 			regop_bsearch_range_cmp) != NULL);
 
 	/* if debug session, search runcontrol list */
-	if ((!valid) && (!is_profiler)) {
+	if ((!valid) && (valid_ctx)) {
 		valid = (g->ops.regops.get_runcontrol_whitelist != NULL) &&
 			linear_search(offset,
 				g->ops.regops.get_runcontrol_whitelist(),
@@ -324,7 +312,7 @@ static bool check_whitelists(struct gk20a *g,
 /* note: the op here has already been through validate_reg_op_info */
 static int validate_reg_op_offset(struct gk20a *g,
 				  struct nvgpu_dbg_reg_op *op,
-				  bool is_profiler)
+				  bool valid_ctx)
 {
 	int err;
 	u32 buf_offset_lo, buf_offset_addr, num_offsets, offset;
@@ -340,9 +328,9 @@ static int validate_reg_op_offset(struct gk20a *g,
 		return -EINVAL;
 	}
 
-	valid = check_whitelists(g, op, offset, is_profiler);
+	valid = check_whitelists(g, op, offset, valid_ctx);
 	if ((op->op == REGOP(READ_64) || op->op == REGOP(WRITE_64)) && valid) {
-		valid = check_whitelists(g, op, offset + 4U, is_profiler);
+		valid = check_whitelists(g, op, offset + 4U, valid_ctx);
 	}
 
 	if (valid && (op->type != REGOP(TYPE_GLOBAL))) {
@@ -383,20 +371,24 @@ static bool validate_reg_ops(struct gk20a *g,
 			      u32 *ctx_rd_count, u32 *ctx_wr_count,
 			      struct nvgpu_dbg_reg_op *ops,
 			      u32 op_count,
-			      bool is_profiler)
+			      bool valid_ctx,
+			      u32 *flags)
 {
-	u32 i;
-	bool ok = true;
+	bool all_or_none = (*flags) & NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 	bool gr_ctx_ops = false;
+	bool op_failed = false;
+	u32 i;
 
 	/* keep going until the end so every op can get
 	 * a separate error code if needed */
 	for (i = 0; i < op_count; i++) {
 
 		if (validate_reg_op_info(&ops[i]) != 0) {
-			ok = false;
+			op_failed = true;
+			if (all_or_none) {
 				break;
+			}
 		}
 
 		if (reg_op_is_gr_ctx(ops[i].type)) {
 			if (reg_op_is_read(ops[i].op)) {
@@ -408,28 +400,42 @@ static bool validate_reg_ops(struct gk20a *g,
 			gr_ctx_ops = true;
 		}
 
-		/* context operations are not valid on profiler session */
-		if (gr_ctx_ops && is_profiler) {
-			ok = false;
+		/* context operations need valid context */
+		if (gr_ctx_ops && !valid_ctx) {
+			op_failed = true;
+			if (all_or_none) {
+				break;
+			}
 		}
 
 		/* if "allow_all" flag enabled, dont validate offset */
 		if (!g->allow_all) {
-			if (validate_reg_op_offset(g, &ops[i],
-						   is_profiler) != 0) {
-				ok = false;
+			if (validate_reg_op_offset(g, &ops[i], valid_ctx) != 0) {
+				op_failed = true;
+				if (all_or_none) {
+					break;
+				}
 			}
 		}
 	}
 
-	if (ok) {
-		nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d",
-			*ctx_wr_count, *ctx_rd_count);
-	}
-
-	return ok;
+	nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d",
+		*ctx_wr_count, *ctx_rd_count);
+
+	if (all_or_none) {
+		if (op_failed) {
+			return false;
+		} else {
+			return true;
+		}
+	}
+
+	/* Continue on error */
+	if (!op_failed) {
+		*flags |= NVGPU_REG_OP_FLAG_ALL_PASSED;
+	}
+
+	return true;
 }
 
 /* exported for tools like cyclestats, etc */
@@ -35,11 +35,10 @@
 #include "common/vgpu/ivc/comm_vgpu.h"
 
 int vgpu_exec_regops(struct gk20a *g,
-		     struct nvgpu_channel *ch,
+		     struct nvgpu_tsg *tsg,
		     struct nvgpu_dbg_reg_op *ops,
		     u32 num_ops,
-		     bool is_profiler,
-		     bool *is_current_ctx)
+		     u32 *flags)
 {
 	struct tegra_vgpu_cmd_msg msg;
 	struct tegra_vgpu_reg_ops_params *p = &msg.params.reg_ops;
@@ -68,17 +67,15 @@ int vgpu_exec_regops(struct gk20a *g,
 
 	msg.cmd = TEGRA_VGPU_CMD_REG_OPS;
 	msg.handle = vgpu_get_handle(g);
-	p->handle = ch ? ch->virt_ctx : 0;
+	p->tsg_id = tsg ? tsg->tsgid : U32_MAX;
 	p->num_ops = num_ops;
-	p->is_profiler = is_profiler;
+	p->flags = *flags;
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
 	err = err ? err : msg.ret;
 	if (err == 0) {
 		nvgpu_memcpy((u8 *)ops, (u8 *)oob, ops_size);
-		if (is_current_ctx != NULL) {
-			*is_current_ctx = p->is_current_ctx != 0u;
-		}
 	}
+	*flags = p->flags;
 
 fail:
 	vgpu_ivc_oob_put_ptr(handle);
@@ -30,11 +30,10 @@ struct gk20a;
 struct nvgpu_channel;
 
 int vgpu_exec_regops(struct gk20a *g,
-		     struct nvgpu_channel *ch,
+		     struct nvgpu_tsg *tsg,
		     struct nvgpu_dbg_reg_op *ops,
		     u32 num_ops,
-		     bool is_profiler,
-		     bool *is_current_ctx);
+		     u32 *flags);
 int vgpu_dbg_set_powergate(struct dbg_session_gk20a *dbg_s,
 			   bool disable_powergate);
 
@@ -625,8 +625,7 @@ void gk20a_gr_init_ovr_sm_dsm_perf(void)
  * which makes it impossible to know externally whether a ctx
  * write will actually occur. so later we should put a lazy,
  * map-and-hold system in the patch write state */
-static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
-				   struct nvgpu_channel *ch,
+int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
				   u32 addr, u32 data,
				   struct nvgpu_gr_ctx *gr_ctx)
 {
@@ -663,15 +662,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
 	nvgpu_gr_ctx_patch_write(g, gr_ctx,
 				 addr, data, true);
 
-	if (ch->subctx != NULL) {
-		nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
-					   false);
-		nvgpu_gr_subctx_set_patch_ctx(g,
-					      ch->subctx, gr_ctx);
-	} else {
 	nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
 				   true);
-	}
 
 	/* we're not caching these on cpu side,
 	   but later watch for it */
@@ -1303,14 +1295,10 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
 		return -EINVAL;
 	}
 
-bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
+static struct nvgpu_channel *gk20a_get_resident_ctx(struct gk20a *g, u32 *tsgid)
 {
 	u32 curr_gr_ctx;
-	u32 curr_gr_tsgid;
-	struct gk20a *g = ch->g;
 	struct nvgpu_channel *curr_ch;
-	bool ret = false;
-	struct nvgpu_tsg *tsg;
 
 	curr_gr_ctx = g->ops.gr.falcon.get_current_ctx(g);
 
@@ -1320,20 +1308,27 @@ bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
 	 * valid context is currently resident.
 	 */
 	if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) {
-		return false;
+		return NULL;
 	}
 
-	curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx,
-			&curr_gr_tsgid);
+	curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx, tsgid);
 
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
-		"curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
-		" ch->chid=%d",
-		(curr_ch != NULL) ? curr_ch->chid : U32_MAX,
-		curr_gr_tsgid,
-		ch->tsgid,
-		ch->chid);
+		"curr_gr_chid=%d curr_tsgid=%d",
+		(curr_ch != NULL) ? curr_ch->chid : U32_MAX, *tsgid);
 
+	return curr_ch;
+}
+
+bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
+{
+	u32 curr_gr_tsgid;
+	struct gk20a *g = ch->g;
+	struct nvgpu_channel *curr_ch;
+	bool ret = false;
+	struct nvgpu_tsg *tsg;
+
+	curr_ch = gk20a_get_resident_ctx(g, &curr_gr_tsgid);
 	if (curr_ch == NULL) {
 		return false;
 	}
@@ -1351,13 +1346,33 @@ bool gk20a_is_channel_ctx_resident(struct nvgpu_channel *ch)
 	return ret;
 }
 
-static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
+static bool gk20a_is_tsg_ctx_resident(struct nvgpu_tsg *tsg)
+{
+	u32 curr_gr_tsgid;
+	struct gk20a *g = tsg->g;
+	struct nvgpu_channel *curr_ch;
+	bool ret = false;
+
+	curr_ch = gk20a_get_resident_ctx(g, &curr_gr_tsgid);
+	if (curr_ch == NULL) {
+		return false;
+	}
+
+	if ((tsg->tsgid == curr_gr_tsgid) &&
+	    (tsg->tsgid == curr_ch->tsgid)) {
+		ret = true;
+	}
+
+	nvgpu_channel_put(curr_ch);
+	return ret;
+}
+
+static int gr_exec_ctx_ops(struct nvgpu_tsg *tsg,
		struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
		u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
-		bool ch_is_curr_ctx)
+		bool ctx_resident)
 {
-	struct gk20a *g = ch->g;
-	struct nvgpu_tsg *tsg;
+	struct gk20a *g = tsg->g;
 	struct nvgpu_gr_ctx *gr_ctx;
 	bool gr_ctx_ready = false;
 	bool pm_ctx_ready = false;
@@ -1376,14 +1391,9 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
 		num_ctx_wr_ops, num_ctx_rd_ops);
 
-	tsg = nvgpu_tsg_from_ch(ch);
-	if (tsg == NULL) {
-		return -EINVAL;
-	}
-
 	gr_ctx = tsg->gr_ctx;
 
-	if (ch_is_curr_ctx) {
+	if (ctx_resident) {
 		for (pass = 0; pass < 2; pass++) {
 			ctx_op_nr = 0;
 			for (i = 0; i < num_ops; ++i) {
@@ -1549,10 +1559,11 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
 						offsets[j] + 4U, v);
 				}
 
-				if (current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx)) {
+				if (current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx) &&
+				    g->ops.gr.ctx_patch_smpc != NULL) {
 					/* check to see if we need to add a special WAR
 					   for some of the SMPC perf regs */
-					gr_gk20a_ctx_patch_smpc(g, ch,
+					g->ops.gr.ctx_patch_smpc(g,
							offset_addrs[j],
							v, gr_ctx);
 				}
@@ -1591,14 +1602,14 @@ static int gr_exec_ctx_ops(struct nvgpu_channel *ch,
 	return err;
 }
 
-int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch,
+int gr_gk20a_exec_ctx_ops(struct nvgpu_tsg *tsg,
		struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
		u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
-		bool *is_curr_ctx)
+		u32 *flags)
 {
-	struct gk20a *g = ch->g;
+	struct gk20a *g = tsg->g;
 	int err, tmp_err;
-	bool ch_is_curr_ctx;
+	bool ctx_resident;
 
 	/* disable channel switching.
 	 * at that point the hardware state can be inspected to
@@ -1611,15 +1622,16 @@ int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch,
 		return err;
 	}
 
-	ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
-	if (is_curr_ctx != NULL) {
-		*is_curr_ctx = ch_is_curr_ctx;
+	ctx_resident = gk20a_is_tsg_ctx_resident(tsg);
+	if (ctx_resident) {
+		*flags |= NVGPU_REG_OP_FLAG_DIRECT_OPS;
 	}
-	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
-		ch_is_curr_ctx);
 
-	err = gr_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
-			      num_ctx_rd_ops, ch_is_curr_ctx);
+	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
+		ctx_resident);
+
+	err = gr_exec_ctx_ops(tsg, ctx_ops, num_ops, num_ctx_wr_ops,
+			      num_ctx_rd_ops, ctx_resident);
 
 	tmp_err = nvgpu_gr_enable_ctxsw(g);
 	if (tmp_err != 0) {
@@ -1865,6 +1877,12 @@ int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 	u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g);
+	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
+
+	if (tsg == NULL) {
+		return -EINVAL;
+	}
 
 	ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops));
 	if (ops == NULL) {
@@ -1910,7 +1928,7 @@ int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
 		i++;
 	}
 
-	err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
+	err = gr_gk20a_exec_ctx_ops(tsg, ops, i, i, 0, &flags);
 	if (err != 0) {
 		nvgpu_err(g, "Failed to access register");
 	}
@@ -34,15 +34,16 @@ struct nvgpu_tsg;
 struct nvgpu_warpstate;
 struct dbg_session_gk20a;
 struct nvgpu_dbg_reg_op;
+struct nvgpu_gr_ctx;
 
 enum ctxsw_addr_type;
 
 /* sm */
 bool gk20a_gr_sm_debugger_attached(struct gk20a *g);
-int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch,
+int gr_gk20a_exec_ctx_ops(struct nvgpu_tsg *tsg,
		struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
		u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
-		bool *is_curr_ctx);
+		u32 *flags);
 int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
		u32 addr, u32 max_offsets,
		u32 *offsets, u32 *offset_addrs,
@@ -57,6 +58,9 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
 int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
		struct nvgpu_tsg *tsg,
		u64 gpu_va, u32 mode);
+int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
+		u32 addr, u32 data,
+		struct nvgpu_gr_ctx *gr_ctx);
 void gk20a_gr_resume_single_sm(struct gk20a *g,
		u32 gpc, u32 tpc, u32 sm);
 void gk20a_gr_resume_all_sms(struct gk20a *g);
@@ -555,12 +555,13 @@ int gm20b_gr_set_mmu_debug_mode(struct gk20a *g,
 	};
 	int err;
 	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 
 	if (tsg == NULL) {
 		return enable ? -EINVAL : 0;
 	}
 
-	err = gr_gk20a_exec_ctx_ops(ch, &ctx_ops, 1, 1, 0, NULL);
+	err = gr_gk20a_exec_ctx_ops(tsg, &ctx_ops, 1, 1, 0, &flags);
 	if (err != 0) {
 		nvgpu_err(g, "update MMU debug mode failed");
 	}
@@ -867,14 +867,13 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 	unsigned int i = 0, sm_id;
 	u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g);
 	int err;
-#ifdef CONFIG_NVGPU_SM_DIVERSITY
 	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 
 	if (tsg == NULL) {
 		nvgpu_err(g, "gv11b_gr_set_sm_debug_mode failed=>tsg NULL");
 		return -EINVAL;
 	}
-#endif
 
 	ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops));
 	if (ops == NULL) {
@@ -945,7 +944,7 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 		i++;
 	}
 
-	err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
+	err = gr_gk20a_exec_ctx_ops(tsg, ops, i, i, 0, &flags);
 	if (err != 0) {
 		nvgpu_err(g, "Failed to access register");
 	}
@@ -212,6 +212,7 @@ static const struct gpu_ops gm20b_ops = {
 		.bpt_reg_info = gr_gm20b_bpt_reg_info,
 		.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
 		.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
+		.ctx_patch_smpc = gr_gk20a_ctx_patch_smpc,
 		.set_mmu_debug_mode = gm20b_gr_set_mmu_debug_mode,
 		.clear_sm_error_state = gm20b_gr_clear_sm_error_state,
 		.suspend_contexts = gr_gk20a_suspend_contexts,
@@ -422,11 +422,10 @@ struct gpu_ops {
 #ifdef CONFIG_NVGPU_DEBUGGER
 	struct {
 		int (*exec_regops)(struct gk20a *g,
-				   struct nvgpu_channel *ch,
+				   struct nvgpu_tsg *tsg,
				   struct nvgpu_dbg_reg_op *ops,
				   u32 num_ops,
-				   bool is_profiler,
-				   bool *is_current_ctx);
+				   u32 *flags);
 		const struct regop_offset_range* (
				*get_global_whitelist_ranges)(void);
 		u64 (*get_global_whitelist_ranges_count)(void);
@@ -1109,6 +1109,9 @@ struct gops_gr {
			struct nvgpu_tsg *tsg,
			u64 gpu_va,
			u32 mode);
+	int (*ctx_patch_smpc)(struct gk20a *g,
+			u32 addr, u32 data,
+			struct nvgpu_gr_ctx *gr_ctx);
 	void (*init_hwpm_pmm_register)(struct gk20a *g);
 	void (*get_num_hwpm_perfmon)(struct gk20a *g, u32 *num_sys_perfmon,
				     u32 *num_fbp_perfmon,
@@ -26,6 +26,11 @@
 
 #ifdef CONFIG_NVGPU_DEBUGGER
 
+#include <nvgpu/types.h>
+
+struct gk20a;
+struct nvgpu_tsg;
+
 /*
  * Register operations
  * All operations are targeted towards first channel
@@ -57,6 +62,11 @@
 #define NVGPU_DBG_REG_OP_STATUS_UNSUPPORTED_OP   0x00000008U
 #define NVGPU_DBG_REG_OP_STATUS_INVALID_MASK     0x00000010U
 
+#define NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE       BIT32(1U)
+#define NVGPU_REG_OP_FLAG_MODE_CONTINUE_ON_ERROR BIT32(2U)
+#define NVGPU_REG_OP_FLAG_ALL_PASSED             BIT32(3U)
+#define NVGPU_REG_OP_FLAG_DIRECT_OPS             BIT32(4U)
+
 struct nvgpu_dbg_reg_op {
 	u8 op;
 	u8 type;
@@ -77,11 +87,10 @@ struct regop_offset_range {
 };
 
 int exec_regops_gk20a(struct gk20a *g,
-		      struct nvgpu_channel *ch,
+		      struct nvgpu_tsg *tsg,
		      struct nvgpu_dbg_reg_op *ops,
		      u32 num_ops,
-		      bool is_profiler,
-		      bool *is_current_ctx);
+		      u32 *flags);
 
 /* turn seriously unwieldy names -> something shorter */
 #define REGOP(x) NVGPU_DBG_REG_OP_##x
@@ -328,10 +328,9 @@ struct tegra_vgpu_reg_op {
 };
 
 struct tegra_vgpu_reg_ops_params {
-	u64 handle;
 	u64 num_ops;
-	u32 is_profiler;
-	u8 is_current_ctx;
+	u32 tsg_id;
+	u32 flags;
 };
 
 struct tegra_vgpu_channel_priority_params {
@@ -778,12 +778,10 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 {
 	int err = 0, powergate_err = 0;
 	bool is_pg_disabled = false;
 
 	struct gk20a *g = dbg_s->g;
 	struct nvgpu_channel *ch;
 
-	bool is_current_ctx = false;
-
+	struct nvgpu_tsg *tsg = NULL;
+	u32 flags = NVGPU_REG_OP_FLAG_MODE_ALL_OR_NONE;
 
 	nvgpu_log_fn(g, "%d ops, max fragment %d", args->num_ops, g->dbg_regops_tmp_buf_ops);
@@ -813,6 +811,14 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 		return -EINVAL;
 	}
 
+	if (ch != NULL) {
+		tsg = nvgpu_tsg_from_ch(ch);
+		if (tsg == NULL) {
+			nvgpu_err(g, "channel not bound to TSG");
+			return -EINVAL;
+		}
+	}
+
 	/* since exec_reg_ops sends methods to the ucode, it must take the
 	 * global gpu lock to protect against mixing methods from debug sessions
 	 * on other channels */
@@ -869,16 +875,16 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
 		if (err)
 			break;
 
-		err = g->ops.regops.exec_regops(g, ch,
-			g->dbg_regops_tmp_buf, num_ops,
-			dbg_s->is_profiler, &is_current_ctx);
+		err = g->ops.regops.exec_regops(g, tsg,
+			g->dbg_regops_tmp_buf, num_ops, &flags);
 
 		if (err) {
 			break;
 		}
 
 		if (ops_offset == 0) {
-			args->gr_ctx_resident = is_current_ctx;
+			args->gr_ctx_resident =
+				flags & NVGPU_REG_OP_FLAG_DIRECT_OPS;
 		}
 
 		err = nvgpu_get_regops_data_linux(g->dbg_regops_tmp_buf,