gpu: nvgpu: Add basic recovery debugging messages

Add basic recovery messages that describe what's happening during
the recovery process. Hide this under a new recovery specific GPU
debug log flag. The logs look like:

[  276.000733] nvgpu: 17000000.gv11b                gv11b_fifo_recover:162  [DBG]  REC | Recovery starting
[  276.000737] nvgpu: 17000000.gv11b                gv11b_fifo_recover:163  [DBG]  REC |   ID      = 0
[  276.000741] nvgpu: 17000000.gv11b                gv11b_fifo_recover:164  [DBG]  REC |   id_type = TSG
[  276.000745] nvgpu: 17000000.gv11b                gv11b_fifo_recover:165  [DBG]  REC |   rc_type = MMU fault
[  276.000748] nvgpu: 17000000.gv11b                gv11b_fifo_recover:166  [DBG]  REC |   Engine bitmask: 0x0
[  276.000753] nvgpu: 17000000.gv11b                gv11b_fifo_recover:170  [DBG]  REC | Acquiring engines_reset_mutex
[  276.000756] nvgpu: 17000000.gv11b                gv11b_fifo_recover:174  [DBG]  REC | Acquiring runlist_lock for active runlists
[  276.000764] nvgpu: 17000000.gv11b                gv11b_fifo_recover:185  [DBG]  REC | Channels bound to this TSG:
[  276.000767] nvgpu: 17000000.gv11b                gv11b_fifo_recover:190  [DBG]  REC |   0 | chid 511
[  276.001098] nvgpu: 17000000.gv11b                gv11b_fifo_recover:222  [DBG]  REC | PBDMA   Bitmask: 0x1
[  276.001102] nvgpu: 17000000.gv11b                gv11b_fifo_recover:228  [DBG]  REC | Runlist Bitmask: 0x1
[  276.001106] nvgpu: 17000000.gv11b                gv11b_fifo_recover:240  [DBG]  REC | Disabling RL scheduler now
[  276.001126] nvgpu: 17000000.gv11b                gv11b_fifo_recover:246  [DBG]  REC | Disabling CG/PG now
[  276.189348] nvgpu: 17000000.gv11b                gv11b_fifo_recover:259  [DBG]  REC | Clearing PBDMA_FAULTED, ENG_FAULTED in CCSR register
[  276.191972] nvgpu: 17000000.gv11b                gv11b_fifo_recover:264  [DBG]  REC | Disabling TSG
[  276.191983] nvgpu: 17000000.gv11b                gv11b_fifo_recover:279  [DBG]  REC | Preempting runlists for RC
[  276.192001] nvgpu: 17000000.gv11b                gv11b_fifo_recover:288  [DBG]  REC | Polling for TSG to be off PBDMA
[  276.192012] nvgpu: 17000000.gv11b                gv11b_fifo_recover:296  [DBG]  REC |   Done!
[  276.192016] nvgpu: 17000000.gv11b                gv11b_fifo_recover:306  [DBG]  REC | Resetting relevant engines
[  276.192020] nvgpu: 17000000.gv11b                gv11b_fifo_recover:318  [DBG]  REC |   Engine bitmask for RL 0: 0xd
[  276.192024] nvgpu: 17000000.gv11b                gv11b_fifo_recover:323  [DBG]  REC |   > Restting engine: ID=0
[  276.209567] nvgpu: 17000000.gv11b                gv11b_fifo_recover:347  [DBG]  REC |     Done!
[  276.209572] nvgpu: 17000000.gv11b                gv11b_fifo_recover:323  [DBG]  REC |   > Restting engine: ID=2
[  276.214290] nvgpu: 17000000.gv11b                gv11b_fifo_recover:347  [DBG]  REC |     Done!
[  276.214295] nvgpu: 17000000.gv11b                gv11b_fifo_recover:323  [DBG]  REC |   > Restting engine: ID=3
[  276.224986] nvgpu: 17000000.gv11b                gv11b_fifo_recover:347  [DBG]  REC |     Done!
[  276.225013] nvgpu: 17000000.gv11b                gv11b_fifo_recover:377  [DBG]  REC | Re-enabling runlists
[  276.225034] nvgpu: 17000000.gv11b                gv11b_fifo_recover:383  [DBG]  REC | Re-enabling CG/PG
[  276.225134] nvgpu: 17000000.gv11b                gv11b_fifo_recover:394  [DBG]  REC | Releasing engines reset mutex

Note the "REC |" which lets one easily do:

  $ dmesg | grep "REC |"

to get a clear, unobstructed view of the recovery progress in the dmesg
log.

JIRA NVGPU-5606

Change-Id: I183f2b5ac54edc60ee894a82111723e27aa5c46b
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2392991
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
Alex Waterman
2020-07-24 11:06:14 -05:00
parent fcbd807842
commit 7e99a68e34
4 changed files with 115 additions and 12 deletions

View File

@@ -159,13 +159,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
bool deferred_reset_pending = false; bool deferred_reset_pending = false;
#endif #endif
dbg_rec(g, "Recovery starting");
dbg_rec(g, " ID = %u", id);
dbg_rec(g, " id_type = %s", nvgpu_id_type_to_str(id_type));
dbg_rec(g, " rc_type = %s", nvgpu_rc_type_to_str(rc_type));
dbg_rec(g, " Engine bitmask: 0x%x", act_eng_bitmask);
nvgpu_swprofile_begin_sample(prof); nvgpu_swprofile_begin_sample(prof);
nvgpu_log_info(g, "acquire engines_reset_mutex"); dbg_rec(g, "Acquiring engines_reset_mutex");
nvgpu_mutex_acquire(&f->engines_reset_mutex); nvgpu_mutex_acquire(&f->engines_reset_mutex);
/* acquire runlist_lock for num_runlists */ /* acquire runlist_lock for num_runlists */
nvgpu_log_fn(g, "acquire runlist_lock for active runlists"); dbg_rec(g, "Acquiring runlist_lock for active runlists");
nvgpu_runlist_lock_active_runlists(g); nvgpu_runlist_lock_active_runlists(g);
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ACQ_ACTIVE_RL); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ACQ_ACTIVE_RL);
@@ -174,13 +180,16 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
/* get tsg */ /* get tsg */
if (id != INVAL_ID && id_type == ID_TYPE_TSG) { if (id != INVAL_ID && id_type == ID_TYPE_TSG) {
struct nvgpu_channel *c;
tsg = &g->fifo.tsg[id]; tsg = &g->fifo.tsg[id];
} dbg_rec(g, "Channels bound to this TSG:");
/* get runlists mask */ i = 0U;
nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, " nvgpu_list_for_each_entry(c, &tsg->ch_list,
"act_eng_bitmask = 0x%x, mmufault ptr = 0x%p", nvgpu_channel, ch_entry) {
id, id_type, rc_type, act_eng_bitmask, mmufault); dbg_rec(g, " %2u | chid %u", i++, c->chid);
}
}
/* Set unserviceable flag right at start of recovery to reduce /* Set unserviceable flag right at start of recovery to reduce
* the window of race between job submit and recovery on same * the window of race between job submit and recovery on same
@@ -205,13 +214,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
} }
if (rc_type == RC_TYPE_MMU_FAULT && mmufault != NULL) { if (rc_type == RC_TYPE_MMU_FAULT && mmufault != NULL) {
if(mmufault->faulted_pbdma != INVAL_ID) { if (mmufault->faulted_pbdma != INVAL_ID) {
pbdma_bitmask = BIT32(mmufault->faulted_pbdma); pbdma_bitmask = BIT32(mmufault->faulted_pbdma);
} }
} }
dbg_rec(g, "PBDMA Bitmask: 0x%x", pbdma_bitmask);
/* get runlists mask */
runlists_mask = nvgpu_runlist_get_runlists_mask(g, id, id_type, runlists_mask = nvgpu_runlist_get_runlists_mask(g, id, id_type,
act_eng_bitmask, pbdma_bitmask); act_eng_bitmask, pbdma_bitmask);
dbg_rec(g, "Runlist Bitmask: 0x%x", runlists_mask);
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_GET_RL_MASK); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_GET_RL_MASK);
/* /*
@@ -220,12 +235,15 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
*/ */
nvgpu_runlist_unlock_runlists(g, ~runlists_mask); nvgpu_runlist_unlock_runlists(g, ~runlists_mask);
/* Disable runlist scheduler */ /* Disable runlist scheduler */
dbg_rec(g, "Disabling RL scheduler now");
nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_DISABLED); nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_DISABLED);
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DISABLE_RL); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DISABLE_RL);
#ifdef CONFIG_NVGPU_NON_FUSA #ifdef CONFIG_NVGPU_NON_FUSA
dbg_rec(g, "Disabling CG/PG now");
if (nvgpu_cg_pg_disable(g) != 0) { if (nvgpu_cg_pg_disable(g) != 0) {
nvgpu_warn(g, "fail to disable power mgmt"); nvgpu_warn(g, "fail to disable power mgmt");
} }
@@ -238,10 +256,12 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
#ifdef CONFIG_NVGPU_DEBUGGER #ifdef CONFIG_NVGPU_DEBUGGER
client_type = mmufault->client_type; client_type = mmufault->client_type;
#endif #endif
dbg_rec(g, "Clearing PBDMA_FAULTED, ENG_FAULTED in CCSR register");
nvgpu_tsg_reset_faulted_eng_pbdma(g, tsg, true, true); nvgpu_tsg_reset_faulted_eng_pbdma(g, tsg, true, true);
} }
if (tsg != NULL) { if (tsg != NULL) {
dbg_rec(g, "Disabling TSG");
g->ops.tsg.disable(tsg); g->ops.tsg.disable(tsg);
} }
@@ -256,6 +276,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
* that all PBDMAs serving the engine are not loaded when engine is * that all PBDMAs serving the engine are not loaded when engine is
* reset. * reset.
*/ */
dbg_rec(g, "Preempting runlists for RC");
nvgpu_fifo_preempt_runlists_for_rc(g, runlists_mask); nvgpu_fifo_preempt_runlists_for_rc(g, runlists_mask);
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_PREEMPT_RL); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_PREEMPT_RL);
@@ -264,6 +285,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
* For each PBDMA which serves the runlist, poll to verify the TSG is no * For each PBDMA which serves the runlist, poll to verify the TSG is no
* longer on the PBDMA and the engine phase of the preempt has started. * longer on the PBDMA and the engine phase of the preempt has started.
*/ */
dbg_rec(g, "Polling for TSG to be off PBDMA");
if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) { if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) {
nvgpu_err(g, "TSG preemption on PBDMA failed; " nvgpu_err(g, "TSG preemption on PBDMA failed; "
"PBDMA seems stuck; cannot recover stuck PBDMA."); "PBDMA seems stuck; cannot recover stuck PBDMA.");
@@ -271,6 +293,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);
return; return;
} }
dbg_rec(g, " Done!");
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_POLL_TSG_ON_PBDMA); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_POLL_TSG_ON_PBDMA);
@@ -280,6 +303,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
nvgpu_mutex_release(&f->deferred_reset_mutex); nvgpu_mutex_release(&f->deferred_reset_mutex);
#endif #endif
dbg_rec(g, "Resetting relevant engines");
/* check if engine reset should be deferred */ /* check if engine reset should be deferred */
for (i = 0U; i < f->num_runlists; i++) { for (i = 0U; i < f->num_runlists; i++) {
runlist = &f->active_runlist_info[i]; runlist = &f->active_runlist_info[i];
@@ -290,14 +314,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
} }
bitmask = runlist->reset_eng_bitmask; bitmask = runlist->reset_eng_bitmask;
dbg_rec(g, " Engine bitmask for RL %u: 0x%lx",
runlist->runlist_id, bitmask);
for_each_set_bit(bit, &bitmask, f->max_engines) { for_each_set_bit(bit, &bitmask, f->max_engines) {
engine_id = U32(bit); engine_id = U32(bit);
dbg_rec(g, " > Restting engine: ID=%u", engine_id);
#ifdef CONFIG_NVGPU_DEBUGGER #ifdef CONFIG_NVGPU_DEBUGGER
if ((tsg != NULL) && nvgpu_engine_should_defer_reset(g, if ((tsg != NULL) &&
engine_id, client_type, false)) { nvgpu_engine_should_defer_reset(g, engine_id,
client_type, false)) {
dbg_rec(g, " (deferred)");
f->deferred_fault_engines |= BIT64(engine_id); f->deferred_fault_engines |= BIT64(engine_id);
@@ -315,6 +344,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
#endif #endif
#ifdef CONFIG_NVGPU_ENGINE_RESET #ifdef CONFIG_NVGPU_ENGINE_RESET
nvgpu_engine_reset(g, engine_id); nvgpu_engine_reset(g, engine_id);
dbg_rec(g, " Done!");
#endif #endif
#ifdef CONFIG_NVGPU_DEBUGGER #ifdef CONFIG_NVGPU_DEBUGGER
} }
@@ -344,11 +374,13 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
runlists_mask); runlists_mask);
} }
dbg_rec(g, "Re-enabling runlists");
nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_ENABLED); nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_ENABLED);
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ENABLE_RL); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ENABLE_RL);
#ifdef CONFIG_NVGPU_NON_FUSA #ifdef CONFIG_NVGPU_NON_FUSA
dbg_rec(g, "Re-enabling CG/PG");
if (nvgpu_cg_pg_enable(g) != 0) { if (nvgpu_cg_pg_enable(g) != 0) {
nvgpu_warn(g, "fail to enable power mgmt"); nvgpu_warn(g, "fail to enable power mgmt");
} }
@@ -359,7 +391,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
/* release runlist_lock for the recovered runlists */ /* release runlist_lock for the recovered runlists */
nvgpu_runlist_unlock_runlists(g, runlists_mask); nvgpu_runlist_unlock_runlists(g, runlists_mask);
nvgpu_log_info(g, "release engines_reset_mutex"); dbg_rec(g, "Releasing engines reset mutex");
nvgpu_mutex_release(&f->engines_reset_mutex); nvgpu_mutex_release(&f->engines_reset_mutex);
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DONE); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DONE);

View File

@@ -397,6 +397,28 @@ struct nvgpu_fifo {
u32 channel_base; u32 channel_base;
}; };
/**
 * @brief Translate an ID type into a printable name.
 *
 * @param id_type [in]	One of the ID_TYPE_* values.
 *
 * @return Pointer to a constant string: "Channel", "TSG" or "Runlist" for
 * the matching ID_TYPE_* value, and "Unknown" for anything else. The
 * returned string is a literal and must not be modified or freed.
 */
static inline const char *nvgpu_id_type_to_str(unsigned int id_type)
{
	const char *name;

	if (id_type == ID_TYPE_CHANNEL) {
		name = "Channel";
	} else if (id_type == ID_TYPE_TSG) {
		name = "TSG";
	} else if (id_type == ID_TYPE_RUNLIST) {
		name = "Runlist";
	} else {
		/* Any value without a dedicated name. */
		name = "Unknown";
	}

	return name;
}
/** /**
* @brief Initialize FIFO software context. * @brief Initialize FIFO software context.
* *

View File

@@ -71,5 +71,6 @@ enum nvgpu_log_type {
#define gpu_dbg_device BIT(32) /* Device initialization and #define gpu_dbg_device BIT(32) /* Device initialization and
querying. */ querying. */
#define gpu_dbg_mig BIT(33) /* MIG info */ #define gpu_dbg_mig BIT(33) /* MIG info */
#define gpu_dbg_rec BIT(34) /* Recovery sequence debugging. */
#endif #endif

View File

@@ -37,6 +37,14 @@
#define INVAL_ID (~U32(0U)) #define INVAL_ID (~U32(0U))
/*
* Recovery debug logging helper: forwards to nvgpu_log() with the
* gpu_dbg_rec log flag and prepends "REC | " to every message so the
* recovery messages can be filtered out of the log by that prefix.
*
* Requires a string literal for the format - the prefix is attached by
* compile-time string literal concatenation, so passing a non-literal
* format (e.g. a char pointer) will not compile.
*/
#define dbg_rec(g, fmt, args...) \
nvgpu_log((g), gpu_dbg_rec, "REC | " fmt, ##args)
struct gk20a; struct gk20a;
struct nvgpu_fifo; struct nvgpu_fifo;
struct nvgpu_tsg; struct nvgpu_tsg;
@@ -44,6 +52,46 @@ struct nvgpu_channel;
struct nvgpu_pbdma_status_info; struct nvgpu_pbdma_status_info;
struct mmu_fault_info; struct mmu_fault_info;
/**
 * @brief Translate a recovery (RC) type into a printable name.
 *
 * @param rc_type [in]	One of the RC_TYPE_* values.
 *
 * @return Pointer to a constant string naming the recovery type, or
 * "Unknown" when \a rc_type matches none of the RC_TYPE_* values listed
 * in the table below. The returned string is a literal and must not be
 * modified or freed.
 */
static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type)
{
	/* One entry per recovery type that has a dedicated name. */
	static const struct {
		unsigned int type;
		const char *name;
	} rc_names[] = {
		{ RC_TYPE_NO_RC,			"None" },
		{ RC_TYPE_MMU_FAULT,			"MMU fault" },
		{ RC_TYPE_PBDMA_FAULT,			"PBDMA fault" },
		{ RC_TYPE_GR_FAULT,			"GR fault" },
		{ RC_TYPE_PREEMPT_TIMEOUT,		"Preemption timeout" },
		{ RC_TYPE_CTXSW_TIMEOUT,		"CTXSW timeout" },
		{ RC_TYPE_RUNLIST_UPDATE_TIMEOUT,	"RL Update timeout" },
		{ RC_TYPE_FORCE_RESET,			"Force reset" },
		{ RC_TYPE_SCHED_ERR,			"Sched err" },
	};
	/* Fallback used when no table entry matches. */
	const char *name = "Unknown";
	unsigned int idx;

	for (idx = 0U; idx < (sizeof(rc_names) / sizeof(rc_names[0])); idx++) {
		if (rc_names[idx].type == rc_type) {
			name = rc_names[idx].name;
			break;
		}
	}

	return name;
}
void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
struct nvgpu_tsg *tsg, bool debug_dump); struct nvgpu_tsg *tsg, bool debug_dump);