gpu: nvgpu: Add basic recovery debugging messages

Add basic recovery messages that describe what's happening during the recovery process. Hide this under a new recovery specific GPU debug log flag. The logs look like: [ 276.000733] nvgpu: 17000000.gv11b gv11b_fifo_recover:162 [DBG] REC | Recovery starting [ 276.000737] nvgpu: 17000000.gv11b gv11b_fifo_recover:163 [DBG] REC | ID = 0 [ 276.000741] nvgpu: 17000000.gv11b gv11b_fifo_recover:164 [DBG] REC | id_type = TSG [ 276.000745] nvgpu: 17000000.gv11b gv11b_fifo_recover:165 [DBG] REC | rc_type = MMU fault [ 276.000748] nvgpu: 17000000.gv11b gv11b_fifo_recover:166 [DBG] REC | Engine bitmask: 0x0 [ 276.000753] nvgpu: 17000000.gv11b gv11b_fifo_recover:170 [DBG] REC | Acquiring engines_reset_mutex [ 276.000756] nvgpu: 17000000.gv11b gv11b_fifo_recover:174 [DBG] REC | Acquiring runlist_lock for active runlists [ 276.000764] nvgpu: 17000000.gv11b gv11b_fifo_recover:185 [DBG] REC | Channels bound to this TSG: [ 276.000767] nvgpu: 17000000.gv11b gv11b_fifo_recover:190 [DBG] REC | 0 | chid 511 [ 276.001098] nvgpu: 17000000.gv11b gv11b_fifo_recover:222 [DBG] REC | PBDMA Bitmask: 0x1 [ 276.001102] nvgpu: 17000000.gv11b gv11b_fifo_recover:228 [DBG] REC | Runlist Bitmask: 0x1 [ 276.001106] nvgpu: 17000000.gv11b gv11b_fifo_recover:240 [DBG] REC | Disabling RL scheduler now [ 276.001126] nvgpu: 17000000.gv11b gv11b_fifo_recover:246 [DBG] REC | Disabling CG/PG now [ 276.189348] nvgpu: 17000000.gv11b gv11b_fifo_recover:259 [DBG] REC | Clearing PBDMA_FAULTED, ENG_FAULTED in CCSR register [ 276.191972] nvgpu: 17000000.gv11b gv11b_fifo_recover:264 [DBG] REC | Disabling TSG [ 276.191983] nvgpu: 17000000.gv11b gv11b_fifo_recover:279 [DBG] REC | Preempting runlists for RC [ 276.192001] nvgpu: 17000000.gv11b gv11b_fifo_recover:288 [DBG] REC | Polling for TSG to be off PBDMA [ 276.192012] nvgpu: 17000000.gv11b gv11b_fifo_recover:296 [DBG] REC | Done! 
[ 276.192016] nvgpu: 17000000.gv11b gv11b_fifo_recover:306 [DBG] REC | Resetting relevant engines [ 276.192020] nvgpu: 17000000.gv11b gv11b_fifo_recover:318 [DBG] REC | Engine bitmask for RL 0: 0xd [ 276.192024] nvgpu: 17000000.gv11b gv11b_fifo_recover:323 [DBG] REC | > Restting engine: ID=0 [ 276.209567] nvgpu: 17000000.gv11b gv11b_fifo_recover:347 [DBG] REC | Done! [ 276.209572] nvgpu: 17000000.gv11b gv11b_fifo_recover:323 [DBG] REC | > Restting engine: ID=2 [ 276.214290] nvgpu: 17000000.gv11b gv11b_fifo_recover:347 [DBG] REC | Done! [ 276.214295] nvgpu: 17000000.gv11b gv11b_fifo_recover:323 [DBG] REC | > Restting engine: ID=3 [ 276.224986] nvgpu: 17000000.gv11b gv11b_fifo_recover:347 [DBG] REC | Done! [ 276.225013] nvgpu: 17000000.gv11b gv11b_fifo_recover:377 [DBG] REC | Re-enabling runlists [ 276.225034] nvgpu: 17000000.gv11b gv11b_fifo_recover:383 [DBG] REC | Re-enabling CG/PG [ 276.225134] nvgpu: 17000000.gv11b gv11b_fifo_recover:394 [DBG] REC | Releasing engines reset mutex Note the "REC |" which lets one easily do: $ dmesg | grep "REC |" to get a clear, unobstructed view of the recovery progress in the dmesg log. JIRA NVGPU-5606 Change-Id: I183f2b5ac54edc60ee894a82111723e27aa5c46b Signed-off-by: Alex Waterman <alexw@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2392991 Reviewed-by: automaticguardword <automaticguardword@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: Konsta Holtta <kholtta@nvidia.com> Reviewed-by: Tejal Kudav <tkudav@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
2025-12-22 17:36:20 +03:00 · 2020-07-24 11:06:14 -05:00
parent fcbd807842
commit 7e99a68e34
4 changed files with 115 additions and 12 deletions
--- a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c
@@ -159,13 +159,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	bool deferred_reset_pending = false;
 #endif
 	dbg_rec(g, "Recovery starting");
 	dbg_rec(g, "  ID      = %u", id);
 	dbg_rec(g, "  id_type = %s", nvgpu_id_type_to_str(id_type));
 	dbg_rec(g, "  rc_type = %s", nvgpu_rc_type_to_str(rc_type));
 	dbg_rec(g, "  Engine bitmask: 0x%x", act_eng_bitmask);
 	nvgpu_swprofile_begin_sample(prof);
-	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	dbg_rec(g, "Acquiring engines_reset_mutex");
 	nvgpu_mutex_acquire(&f->engines_reset_mutex);
 	/* acquire runlist_lock for num_runlists */
-	nvgpu_log_fn(g, "acquire runlist_lock for active runlists");
+	dbg_rec(g, "Acquiring runlist_lock for active runlists");
 	nvgpu_runlist_lock_active_runlists(g);
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ACQ_ACTIVE_RL);
@@ -174,13 +180,16 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	/* get tsg */
 	if (id != INVAL_ID && id_type == ID_TYPE_TSG) {
 		struct nvgpu_channel *c;
 		tsg = &g->fifo.tsg[id];
-	}
+		dbg_rec(g, "Channels bound to this TSG:");
-	/* get runlists mask */
+		i = 0U;
-	nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
+		nvgpu_list_for_each_entry(c, &tsg->ch_list,
-			"act_eng_bitmask = 0x%x, mmufault ptr = 0x%p",
+					  nvgpu_channel, ch_entry) {
-			 id, id_type, rc_type, act_eng_bitmask, mmufault);
+			dbg_rec(g, " %2u | chid %u", i++, c->chid);
 		}
 	}
 	/* Set unserviceable flag right at start of recovery to reduce
 	 * the window of race between job submit and recovery on same
@@ -205,13 +214,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	}
 	if (rc_type == RC_TYPE_MMU_FAULT && mmufault != NULL) {
-		if(mmufault->faulted_pbdma != INVAL_ID) {
+		if (mmufault->faulted_pbdma != INVAL_ID) {
 			pbdma_bitmask = BIT32(mmufault->faulted_pbdma);
 		}
 	}
 	dbg_rec(g, "PBDMA   Bitmask: 0x%x", pbdma_bitmask);
 	/* get runlists mask */
 	runlists_mask = nvgpu_runlist_get_runlists_mask(g, id, id_type,
 				act_eng_bitmask, pbdma_bitmask);
 	dbg_rec(g, "Runlist Bitmask: 0x%x", runlists_mask);
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_GET_RL_MASK);
 	/*
@@ -220,12 +235,15 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	 */
 	nvgpu_runlist_unlock_runlists(g, ~runlists_mask);
 	/* Disable runlist scheduler */
 	dbg_rec(g, "Disabling RL scheduler now");
 	nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_DISABLED);
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DISABLE_RL);
 #ifdef CONFIG_NVGPU_NON_FUSA
 	dbg_rec(g, "Disabling CG/PG now");
 	if (nvgpu_cg_pg_disable(g) != 0) {
 		nvgpu_warn(g, "fail to disable power mgmt");
 	}
@@ -238,10 +256,12 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 #ifdef CONFIG_NVGPU_DEBUGGER
 		client_type = mmufault->client_type;
 #endif
 		dbg_rec(g, "Clearing PBDMA_FAULTED, ENG_FAULTED in CCSR register");
 		nvgpu_tsg_reset_faulted_eng_pbdma(g, tsg, true, true);
 	}
 	if (tsg != NULL) {
 		dbg_rec(g, "Disabling TSG");
 		g->ops.tsg.disable(tsg);
 	}
@@ -256,6 +276,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	 * that all PBDMAs serving the engine are not loaded when engine is
 	 * reset.
 	 */
 	dbg_rec(g, "Preempting runlists for RC");
 	nvgpu_fifo_preempt_runlists_for_rc(g, runlists_mask);
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_PREEMPT_RL);
@@ -264,6 +285,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	 * For each PBDMA which serves the runlist, poll to verify the TSG is no
 	 * longer on the PBDMA and the engine phase of the preempt has started.
 	 */
 	dbg_rec(g, "Polling for TSG to be off PBDMA");
 	if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) {
 		nvgpu_err(g, "TSG preemption on PBDMA failed; "
 			"PBDMA seems stuck; cannot recover stuck PBDMA.");
@@ -271,6 +293,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 		nvgpu_sw_quiesce(g);
 		return;
 	}
 	dbg_rec(g, "  Done!");
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_POLL_TSG_ON_PBDMA);
@@ -280,6 +303,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	nvgpu_mutex_release(&f->deferred_reset_mutex);
 #endif
 	dbg_rec(g, "Resetting relevant engines");
 	/* check if engine reset should be deferred */
 	for (i = 0U; i < f->num_runlists; i++) {
 		runlist = &f->active_runlist_info[i];
@@ -290,14 +314,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 		}
 		bitmask = runlist->reset_eng_bitmask;
 		dbg_rec(g, "  Engine bitmask for RL %u: 0x%lx",
 		       runlist->runlist_id, bitmask);
 		for_each_set_bit(bit, &bitmask, f->max_engines) {
 			engine_id = U32(bit);
 			dbg_rec(g, "  > Restting engine: ID=%u", engine_id);
 #ifdef CONFIG_NVGPU_DEBUGGER
-			if ((tsg != NULL) && nvgpu_engine_should_defer_reset(g,
+			if ((tsg != NULL) &&
-					engine_id, client_type, false)) {
+			    nvgpu_engine_should_defer_reset(g, engine_id,
 							    client_type, false)) {
 				dbg_rec(g, "    (deferred)");
 				f->deferred_fault_engines |= BIT64(engine_id);
@@ -315,6 +344,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 #endif
 #ifdef CONFIG_NVGPU_ENGINE_RESET
 				nvgpu_engine_reset(g, engine_id);
 				dbg_rec(g, "    Done!");
 #endif
 #ifdef CONFIG_NVGPU_DEBUGGER
 			}
@@ -344,11 +374,13 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 			runlists_mask);
 	}
 	dbg_rec(g, "Re-enabling runlists");
 	nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_ENABLED);
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ENABLE_RL);
 #ifdef CONFIG_NVGPU_NON_FUSA
 	dbg_rec(g, "Re-enabling CG/PG");
 	if (nvgpu_cg_pg_enable(g) != 0) {
 		nvgpu_warn(g, "fail to enable power mgmt");
 	}
@@ -359,7 +391,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 	/* release runlist_lock for the recovered runlists */
 	nvgpu_runlist_unlock_runlists(g, runlists_mask);
-	nvgpu_log_info(g, "release engines_reset_mutex");
+	dbg_rec(g, "Releasing engines reset mutex");
 	nvgpu_mutex_release(&f->engines_reset_mutex);
 	nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DONE);
--- a/drivers/gpu/nvgpu/include/nvgpu/fifo.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/fifo.h
@@ -397,6 +397,28 @@ struct nvgpu_fifo {
 	u32 channel_base;
 };
 /*
 * Translate an ID_TYPE_* value into a short human-readable name for log
 * output.
 *
 * id_type: the ID type to describe.
 *
 * Returns "Channel", "TSG" or "Runlist" for ID_TYPE_CHANNEL, ID_TYPE_TSG
 * and ID_TYPE_RUNLIST respectively, and "Unknown" for every other value.
 * The result always points at a string literal and is never NULL.
 */
 static inline const char *nvgpu_id_type_to_str(unsigned int id_type)
 {
 	const char *name;
 	if (id_type == ID_TYPE_CHANNEL) {
 		name = "Channel";
 	} else if (id_type == ID_TYPE_TSG) {
 		name = "TSG";
 	} else if (id_type == ID_TYPE_RUNLIST) {
 		name = "Runlist";
 	} else {
 		/* Anything unrecognised is reported generically. */
 		name = "Unknown";
 	}
 	return name;
 }
 /**
 * @brief Initialize FIFO software context.
 *
--- a/drivers/gpu/nvgpu/include/nvgpu/log_common.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log_common.h
@@ -71,5 +71,6 @@ enum nvgpu_log_type {
 #define gpu_dbg_device		BIT(32) /* Device initialization and
                                           querying. */
 #define gpu_dbg_mig		BIT(33) /* MIG info */
 #define gpu_dbg_rec		BIT(34) /* Recovery sequence debugging. */
 #endif
--- a/drivers/gpu/nvgpu/include/nvgpu/rc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h
@@ -37,6 +37,14 @@
 #define INVAL_ID			(~U32(0U))
 /*
 * Recovery debug logging helper: forwards to nvgpu_log() with the
 * gpu_dbg_rec mask and prepends "REC | " to the message.
 *
 * Requires a string literal for the format - the prefix is attached by
 * compile-time string literal concatenation ("REC | " fmt), so a
 * non-literal format argument will not compile.
 */
 #define dbg_rec(g, fmt, args...)					\
 	nvgpu_log((g), gpu_dbg_rec, "REC | " fmt, ##args)
 struct gk20a;
 struct nvgpu_fifo;
 struct nvgpu_tsg;
@@ -44,6 +52,46 @@ struct nvgpu_channel;
 struct nvgpu_pbdma_status_info;
 struct mmu_fault_info;
 /*
 * Translate an RC_TYPE_* recovery type into a short human-readable name
 * for log output.
 *
 * rc_type: the recovery type to describe.
 *
 * Returns the name paired with the matching RC_TYPE_* constant in the
 * table below, or "Unknown" when rc_type matches none of them. The result
 * always points at a string literal and is never NULL.
 */
 static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type)
 {
 	/* One entry per recognised recovery type. */
 	static const struct {
 		unsigned int type;
 		const char *name;
 	} rc_names[] = {
 		{ RC_TYPE_NO_RC,			"None" },
 		{ RC_TYPE_MMU_FAULT,			"MMU fault" },
 		{ RC_TYPE_PBDMA_FAULT,			"PBDMA fault" },
 		{ RC_TYPE_GR_FAULT,			"GR fault" },
 		{ RC_TYPE_PREEMPT_TIMEOUT,		"Preemption timeout" },
 		{ RC_TYPE_CTXSW_TIMEOUT,		"CTXSW timeout" },
 		{ RC_TYPE_RUNLIST_UPDATE_TIMEOUT,	"RL Update timeout" },
 		{ RC_TYPE_FORCE_RESET,			"Force reset" },
 		{ RC_TYPE_SCHED_ERR,			"Sched err" },
 	};
 	/* Fallback used when no table entry matches. */
 	const char *str = "Unknown";
 	unsigned int i;
 	for (i = 0U; i < (sizeof(rc_names) / sizeof(rc_names[0])); i++) {
 		if (rc_names[i].type == rc_type) {
 			str = rc_names[i].name;
 			break;
 		}
 	}
 	return str;
 }
 void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 				struct nvgpu_tsg *tsg, bool debug_dump);