gpu: nvgpu: Move SM_MASK_TYPE setting to TSG level

Move the SM_MASK_TYPE variable from the GR struct to
the TSG struct, since the SM error registers are
context based.

In the dbg_session SET_SM_EXCEPTION_TYPE_MASK IOCTL,
the kernel code now finds the TSG associated with the
session's first channel and sets the mask type on that
context.

Bug 200412641

Change-Id: Ic91944037ad2447f403b4803d5266ae6250ba4c9
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1809322
Reviewed-by: svc-misra-checker <svc-misra-checker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Vinod G <vinodg@nvidia.com>
Authored: 2018-08-29 12:32:25 -07:00
Committed-by: mobile promotions
Parent: b25d5d86ca
Commit: f187e0bf44

6 changed files with 64 additions and 61 deletions
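For orientation before the diff: a debugger drives this path from user space roughly as sketched below. This is a hypothetical example, not part of the commit; it assumes the nvgpu uapi header exposes the NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK request macro and the matching args struct, while the exception_type_mask field and the _FATAL/_NONE values are confirmed by the hunks that follow.

/*
 * Hypothetical user-space sketch (not part of this commit).  Assumes
 * <linux/nvgpu.h> provides the dbg-gpu ioctl macros and the
 * nvgpu_dbg_gpu_set_sm_exception_type_mask_args struct.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed uapi header */

static int set_fatal_sm_mask(int dbg_fd)
{
	struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args args;

	memset(&args, 0, sizeof(args));
	args.exception_type_mask =
		NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL;

	/*
	 * After this commit the kernel resolves the TSG from the
	 * session's first bound channel, so a channel must already be
	 * bound to dbg_fd or the ioctl fails with -EINVAL.
	 */
	if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK,
		  &args) != 0) {
		perror("SET_SM_EXCEPTION_TYPE_MASK");
		return -1;
	}
	return 0;
}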


@@ -72,12 +72,6 @@ struct dbg_session_gk20a {
 	bool broadcast_stop_trigger;
 
 	struct nvgpu_mutex ioctl_lock;
-
-	/*
-	 * sm set exception type mask flag, to check whether
-	 * exception type mask is requested or not.
-	 */
-	bool is_sm_exception_type_mask_set;
 };
 
 struct dbg_session_data {


@@ -420,11 +420,6 @@ struct gr_gk20a {
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
 
-#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
-#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0)
-	u32 sm_exception_mask_type;
-	u32 sm_exception_mask_refcount;
-
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	struct nvgpu_mutex cs_lock;
 	struct gk20a_cs_snapshot *cs_data;


@@ -304,6 +304,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 	tsg->timeslice_scale = 0;
 	tsg->runlist_id = ~0;
 	tsg->tgid = pid;
+	tsg->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
 
 	if (g->ops.fifo.init_eng_method_buffers)
 		g->ops.fifo.init_eng_method_buffers(g, tsg);
@@ -373,6 +374,7 @@ void gk20a_tsg_release(struct nvgpu_ref *ref)
 	release_used_tsg(&g->fifo, tsg);
 	tsg->runlist_id = ~0;
+	tsg->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
 
 	nvgpu_log(g, gpu_dbg_fn, "tsg released %d\n", tsg->tsgid);
 }


@@ -78,6 +78,10 @@ struct tsg_gk20a {
 	bool in_use;
 
 	struct nvgpu_tsg_sm_error_state *sm_error_states;
+
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0)
+	u32 sm_exception_mask_type;
 };
 
 int gk20a_enable_tsg(struct tsg_gk20a *tsg);


@@ -2239,7 +2239,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
 static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 {
 	u32 index = 0U;
-	u32 esr_err = gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f();
+	bool esr_err = false;
 
 	struct warp_esr_error_table_s {
 		u32 error_value;
@@ -2285,7 +2285,7 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 	for (index = 0; index < ARRAY_SIZE(warp_esr_error_table); index++) {
 		if (warp_esr_error_table[index].error_value == warp_esr_error) {
-			esr_err = warp_esr_error_table[index].error_value;
+			esr_err = true;
 			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
 				"WARP_ESR %s(0x%x)",
 				warp_esr_error_table[index].error_name,
@@ -2294,8 +2294,9 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 		}
 	}
-	return (esr_err == 0U) ? false : true;
+
+	return esr_err;
 }
 
 static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
 		u32 gpc, u32 tpc, u32 sm,
 		u32 warp_esr_error,
@@ -2316,24 +2317,24 @@ static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
 		return 0;
 	}
 
-	/*
-	 * Check SET_EXCEPTION_TYPE_MASK is being set.
-	 * If set, skip the recovery and trigger CILP
-	 * If not set, trigger the recovery.
-	 */
-	if ((g->gr.sm_exception_mask_type &
-			NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
-			NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
-		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
-			"SM Exception Type Mask set %d,"
-			"skip recovery",
-			g->gr.sm_exception_mask_type);
-		return 0;
-	}
-
 	if (fault_ch) {
 		tsg = &g->fifo.tsg[fault_ch->tsgid];
+
+		/*
+		 * Check SET_EXCEPTION_TYPE_MASK is being set.
+		 * If set, skip the recovery and trigger CILP
+		 * If not set, trigger the recovery.
+		 */
+		if ((tsg->sm_exception_mask_type &
+				NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
+				NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
+			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
+				"SM Exception Type Mask set %d,"
+				"skip recovery",
+				tsg->sm_exception_mask_type);
+			return 0;
+		}
+
 		nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 		nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
 				channel_gk20a, ch_entry) {
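Distilled, the relocated check above reduces to the following sketch. The helper name is hypothetical and the body is simplified from the hunk, not the literal driver code:

/*
 * Simplified sketch of the per-TSG decision introduced above: the mask
 * is now read from the faulting channel's TSG rather than from the
 * global struct gr_gk20a, so one debugger masking fatal SM exceptions
 * no longer affects unrelated contexts.
 */
static bool sm_fatal_mask_set_for_fault(struct gk20a *g,
					struct channel_gk20a *fault_ch)
{
	struct tsg_gk20a *tsg;

	if (fault_ch == NULL)
		return false;	/* no context, nothing to mask */

	tsg = &g->fifo.tsg[fault_ch->tsgid];
	return (tsg->sm_exception_mask_type &
		NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) != 0U;
}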


@@ -223,10 +223,6 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 			nvgpu_kfree(g, prof_obj);
 		}
 	}
-
-	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
-		NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
-
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	nvgpu_mutex_destroy(&dbg_s->ch_list_lock);
@@ -499,7 +495,6 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->is_profiler = is_profiler;
 	dbg_s->is_pg_disabled = false;
 	dbg_s->is_timeout_disabled = false;
-	dbg_s->is_sm_exception_type_mask_set = false;
 
 	nvgpu_cond_init(&dbg_s->dbg_events.wait_queue);
 	nvgpu_init_list_node(&dbg_s->ch_list);
@@ -512,9 +507,6 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->dbg_events.events_enabled = false;
 	dbg_s->dbg_events.num_pending_events = 0;
 
-	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
-		NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
-
 	return 0;
 
 err_destroy_lock:
@@ -1887,34 +1879,29 @@ static int nvgpu_set_sm_exception_type_mask_locked(
 		u32 exception_mask)
 {
 	struct gk20a *g = dbg_s->g;
-	struct gr_gk20a *gr = &g->gr;
 	int err = 0;
+	struct channel_gk20a *ch = NULL;
 
-	switch (exception_mask) {
-	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL:
-		gr->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;
-		if (dbg_s->is_sm_exception_type_mask_set == false) {
-			gr->sm_exception_mask_refcount++;
-			dbg_s->is_sm_exception_type_mask_set = true;
-		}
-		break;
-	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE:
-		if (dbg_s->is_sm_exception_type_mask_set) {
-			gr->sm_exception_mask_refcount--;
-			dbg_s->is_sm_exception_type_mask_set = false;
-		}
-		if (gr->sm_exception_mask_refcount == 0)
-			gr->sm_exception_mask_type =
-				NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
-		break;
-	default:
-		nvgpu_err(g,
-			"unrecognized dbg sm exception type mask: 0x%x",
-			exception_mask);
-		err = -EINVAL;
-		break;
-	}
+	/*
+	 * Obtain the first channel from the channel list in
+	 * dbg_session, find the context associated with channel
+	 * and set the sm_mask_type to that context
+	 */
+	ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
+	if (ch != NULL) {
+		struct tsg_gk20a *tsg;
+
+		tsg = tsg_gk20a_from_ch(ch);
+		if (tsg != NULL) {
+			tsg->sm_exception_mask_type = exception_mask;
+			goto type_mask_end;
+		}
+	}
+	nvgpu_log_fn(g, "unable to find the TSG\n");
+	err = -EINVAL;
 
+type_mask_end:
 	return err;
 }
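A practical consequence of the new lookup: the ioctl only succeeds once the session can resolve a TSG, so a channel must be bound first. A hypothetical user-space ordering sketch (the NVGPU_DBG_GPU_IOCTL_BIND_CHANNEL macro and its args struct are assumptions modeled on the nvgpu uapi; set_fatal_sm_mask() is the sketch shown earlier, with the same includes):

/*
 * Hypothetical ordering sketch: bind a channel to the dbg session
 * before setting the mask, since nvgpu_set_sm_exception_type_mask_locked()
 * above derives the TSG from the session's first channel.
 */
static int enable_fatal_mask(int dbg_fd, int channel_fd)
{
	struct nvgpu_dbg_gpu_bind_channel_args bind;	/* layout assumed */

	memset(&bind, 0, sizeof(bind));
	bind.channel_fd = channel_fd;

	/* Without a bound channel the mask ioctl now fails with -EINVAL. */
	if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_BIND_CHANNEL, &bind) != 0) {
		perror("BIND_CHANNEL");
		return -1;
	}

	return set_fatal_sm_mask(dbg_fd);	/* sketch above */
}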
@@ -1924,10 +1911,30 @@ static int nvgpu_dbg_gpu_set_sm_exception_type_mask(
 {
 	int err = 0;
 	struct gk20a *g = dbg_s->g;
+	u32 sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
+
+	switch (args->exception_type_mask) {
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL:
+		sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;
+		break;
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE:
+		sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
+		break;
+	default:
+		nvgpu_err(g,
+			"unrecognized dbg sm exception type mask: 0x%x",
+			args->exception_type_mask);
+		err = -EINVAL;
+		break;
+	}
+
+	if (err != 0) {
+		return err;
+	}
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 	err = nvgpu_set_sm_exception_type_mask_locked(dbg_s,
-		args->exception_type_mask);
+		sm_exception_mask_type);
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	return err;