From 7e99a68e34e643ae8213720f6e109f986ef2e5e0 Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Fri, 24 Jul 2020 11:06:14 -0500 Subject: [PATCH] gpu: nvgpu: Add basic recovery debugging messages Add basic recovery messages that describe what's happening during the recovery process. Hide this under a new recovery specific GPU debug log flag. The logs look like: [ 276.000733] nvgpu: 17000000.gv11b gv11b_fifo_recover:162 [DBG] REC | Recovery starting [ 276.000737] nvgpu: 17000000.gv11b gv11b_fifo_recover:163 [DBG] REC | ID = 0 [ 276.000741] nvgpu: 17000000.gv11b gv11b_fifo_recover:164 [DBG] REC | id_type = TSG [ 276.000745] nvgpu: 17000000.gv11b gv11b_fifo_recover:165 [DBG] REC | rc_type = MMU fault [ 276.000748] nvgpu: 17000000.gv11b gv11b_fifo_recover:166 [DBG] REC | Engine bitmask: 0x0 [ 276.000753] nvgpu: 17000000.gv11b gv11b_fifo_recover:170 [DBG] REC | Acquiring engines_reset_mutex [ 276.000756] nvgpu: 17000000.gv11b gv11b_fifo_recover:174 [DBG] REC | Acquiring runlist_lock for active runlists [ 276.000764] nvgpu: 17000000.gv11b gv11b_fifo_recover:185 [DBG] REC | Channels bound to this TSG: [ 276.000767] nvgpu: 17000000.gv11b gv11b_fifo_recover:190 [DBG] REC | 0 | chid 511 [ 276.001098] nvgpu: 17000000.gv11b gv11b_fifo_recover:222 [DBG] REC | PBDMA Bitmask: 0x1 [ 276.001102] nvgpu: 17000000.gv11b gv11b_fifo_recover:228 [DBG] REC | Runlist Bitmask: 0x1 [ 276.001106] nvgpu: 17000000.gv11b gv11b_fifo_recover:240 [DBG] REC | Disabling RL scheduler now [ 276.001126] nvgpu: 17000000.gv11b gv11b_fifo_recover:246 [DBG] REC | Disabling CG/PG now [ 276.189348] nvgpu: 17000000.gv11b gv11b_fifo_recover:259 [DBG] REC | Clearing PBDMA_FAULTED, ENG_FAULTED in CCSR register [ 276.191972] nvgpu: 17000000.gv11b gv11b_fifo_recover:264 [DBG] REC | Disabling TSG [ 276.191983] nvgpu: 17000000.gv11b gv11b_fifo_recover:279 [DBG] REC | Preempting runlists for RC [ 276.192001] nvgpu: 17000000.gv11b gv11b_fifo_recover:288 [DBG] REC | Polling for TSG to be off PBDMA [ 276.192012] nvgpu: 
17000000.gv11b gv11b_fifo_recover:296 [DBG] REC | Done! [ 276.192016] nvgpu: 17000000.gv11b gv11b_fifo_recover:306 [DBG] REC | Resetting relevant engines [ 276.192020] nvgpu: 17000000.gv11b gv11b_fifo_recover:318 [DBG] REC | Engine bitmask for RL 0: 0xd [ 276.192024] nvgpu: 17000000.gv11b gv11b_fifo_recover:323 [DBG] REC | > Restting engine: ID=0 [ 276.209567] nvgpu: 17000000.gv11b gv11b_fifo_recover:347 [DBG] REC | Done! [ 276.209572] nvgpu: 17000000.gv11b gv11b_fifo_recover:323 [DBG] REC | > Restting engine: ID=2 [ 276.214290] nvgpu: 17000000.gv11b gv11b_fifo_recover:347 [DBG] REC | Done! [ 276.214295] nvgpu: 17000000.gv11b gv11b_fifo_recover:323 [DBG] REC | > Restting engine: ID=3 [ 276.224986] nvgpu: 17000000.gv11b gv11b_fifo_recover:347 [DBG] REC | Done! [ 276.225013] nvgpu: 17000000.gv11b gv11b_fifo_recover:377 [DBG] REC | Re-enabling runlists [ 276.225034] nvgpu: 17000000.gv11b gv11b_fifo_recover:383 [DBG] REC | Re-enabling CG/PG [ 276.225134] nvgpu: 17000000.gv11b gv11b_fifo_recover:394 [DBG] REC | Releasing engines reset mutex Note the "REC |" which lets one easily do: $ dmesg | grep "REC |" To get a clear unobstructed view of the recovery progress in the dmesg log. 
JIRA NVGPU-5606 Change-Id: I183f2b5ac54edc60ee894a82111723e27aa5c46b Signed-off-by: Alex Waterman Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2392991 Reviewed-by: automaticguardword Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-cert Reviewed-by: Konsta Holtta Reviewed-by: Tejal Kudav Reviewed-by: mobile promotions Tested-by: mobile promotions GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/hal/rc/rc_gv11b.c | 56 +++++++++++++++----- drivers/gpu/nvgpu/include/nvgpu/fifo.h | 22 ++++++++ drivers/gpu/nvgpu/include/nvgpu/log_common.h | 1 + drivers/gpu/nvgpu/include/nvgpu/rc.h | 48 +++++++++++++++++ 4 files changed, 115 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c index 49a99970c..e0d13f89f 100644 --- a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c @@ -143,7 +143,7 @@ static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g, void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, u32 id, unsigned int id_type, unsigned int rc_type, - struct mmu_fault_info *mmufault) + struct mmu_fault_info *mmufault) { struct nvgpu_tsg *tsg = NULL; u32 runlists_mask, i; @@ -159,13 +159,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, bool deferred_reset_pending = false; #endif + dbg_rec(g, "Recovery starting"); + dbg_rec(g, " ID = %u", id); + dbg_rec(g, " id_type = %s", nvgpu_id_type_to_str(id_type)); + dbg_rec(g, " rc_type = %s", nvgpu_rc_type_to_str(rc_type)); + dbg_rec(g, " Engine bitmask: 0x%x", act_eng_bitmask); + nvgpu_swprofile_begin_sample(prof); - nvgpu_log_info(g, "acquire engines_reset_mutex"); + dbg_rec(g, "Acquiring engines_reset_mutex"); nvgpu_mutex_acquire(&f->engines_reset_mutex); /* acquire runlist_lock for num_runlists */ - nvgpu_log_fn(g, "acquire runlist_lock for active runlists"); + dbg_rec(g, "Acquiring runlist_lock for active runlists"); nvgpu_runlist_lock_active_runlists(g); 
nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ACQ_ACTIVE_RL); @@ -174,13 +180,16 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, /* get tsg */ if (id != INVAL_ID && id_type == ID_TYPE_TSG) { + struct nvgpu_channel *c; tsg = &g->fifo.tsg[id]; - } + dbg_rec(g, "Channels bound to this TSG:"); - /* get runlists mask */ - nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, " - "act_eng_bitmask = 0x%x, mmufault ptr = 0x%p", - id, id_type, rc_type, act_eng_bitmask, mmufault); + i = 0U; + nvgpu_list_for_each_entry(c, &tsg->ch_list, + nvgpu_channel, ch_entry) { + dbg_rec(g, " %2u | chid %u", i++, c->chid); + } + } /* Set unserviceable flag right at start of recovery to reduce * the window of race between job submit and recovery on same @@ -205,13 +214,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, } if (rc_type == RC_TYPE_MMU_FAULT && mmufault != NULL) { - if(mmufault->faulted_pbdma != INVAL_ID) { + if (mmufault->faulted_pbdma != INVAL_ID) { pbdma_bitmask = BIT32(mmufault->faulted_pbdma); } } + + dbg_rec(g, "PBDMA Bitmask: 0x%x", pbdma_bitmask); + + /* get runlists mask */ runlists_mask = nvgpu_runlist_get_runlists_mask(g, id, id_type, act_eng_bitmask, pbdma_bitmask); + dbg_rec(g, "Runlist Bitmask: 0x%x", runlists_mask); + nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_GET_RL_MASK); /* @@ -220,12 +235,15 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, */ nvgpu_runlist_unlock_runlists(g, ~runlists_mask); + /* Disable runlist scheduler */ + dbg_rec(g, "Disabling RL scheduler now"); nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_DISABLED); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DISABLE_RL); #ifdef CONFIG_NVGPU_NON_FUSA + dbg_rec(g, "Disabling CG/PG now"); if (nvgpu_cg_pg_disable(g) != 0) { nvgpu_warn(g, "fail to disable power mgmt"); } @@ -238,10 +256,12 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, #ifdef CONFIG_NVGPU_DEBUGGER client_type = mmufault->client_type; #endif + dbg_rec(g, 
"Clearing PBDMA_FAULTED, ENG_FAULTED in CCSR register"); nvgpu_tsg_reset_faulted_eng_pbdma(g, tsg, true, true); } if (tsg != NULL) { + dbg_rec(g, "Disabling TSG"); g->ops.tsg.disable(tsg); } @@ -256,6 +276,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, * that all PBDMAs serving the engine are not loaded when engine is * reset. */ + dbg_rec(g, "Preempting runlists for RC"); nvgpu_fifo_preempt_runlists_for_rc(g, runlists_mask); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_PREEMPT_RL); @@ -264,6 +285,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, * For each PBDMA which serves the runlist, poll to verify the TSG is no * longer on the PBDMA and the engine phase of the preempt has started. */ + dbg_rec(g, "Polling for TSG to be off PBDMA"); if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) { nvgpu_err(g, "TSG preemption on PBDMA failed; " "PBDMA seems stuck; cannot recover stuck PBDMA."); @@ -271,6 +293,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, nvgpu_sw_quiesce(g); return; } + dbg_rec(g, " Done!"); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_POLL_TSG_ON_PBDMA); @@ -280,6 +303,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, nvgpu_mutex_release(&f->deferred_reset_mutex); #endif + dbg_rec(g, "Resetting relevant engines"); /* check if engine reset should be deferred */ for (i = 0U; i < f->num_runlists; i++) { runlist = &f->active_runlist_info[i]; @@ -290,14 +314,19 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, } bitmask = runlist->reset_eng_bitmask; + dbg_rec(g, " Engine bitmask for RL %u: 0x%lx", + runlist->runlist_id, bitmask); for_each_set_bit(bit, &bitmask, f->max_engines) { engine_id = U32(bit); + dbg_rec(g, " > Restting engine: ID=%u", engine_id); #ifdef CONFIG_NVGPU_DEBUGGER - if ((tsg != NULL) && nvgpu_engine_should_defer_reset(g, - engine_id, client_type, false)) { + if ((tsg != NULL) && + nvgpu_engine_should_defer_reset(g, engine_id, + 
client_type, false)) { + dbg_rec(g, " (deferred)"); f->deferred_fault_engines |= BIT64(engine_id); @@ -315,6 +344,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, #endif #ifdef CONFIG_NVGPU_ENGINE_RESET nvgpu_engine_reset(g, engine_id); + dbg_rec(g, " Done!"); #endif #ifdef CONFIG_NVGPU_DEBUGGER } @@ -344,11 +374,13 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, runlists_mask); } + dbg_rec(g, "Re-enabling runlists"); nvgpu_runlist_set_state(g, runlists_mask, RUNLIST_ENABLED); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_ENABLE_RL); #ifdef CONFIG_NVGPU_NON_FUSA + dbg_rec(g, "Re-enabling CG/PG"); if (nvgpu_cg_pg_enable(g) != 0) { nvgpu_warn(g, "fail to enable power mgmt"); } @@ -359,7 +391,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, /* release runlist_lock for the recovered runlists */ nvgpu_runlist_unlock_runlists(g, runlists_mask); - nvgpu_log_info(g, "release engines_reset_mutex"); + dbg_rec(g, "Releasing engines reset mutex"); nvgpu_mutex_release(&f->engines_reset_mutex); nvgpu_swprofile_snapshot(prof, PROF_RECOVERY_DONE); diff --git a/drivers/gpu/nvgpu/include/nvgpu/fifo.h b/drivers/gpu/nvgpu/include/nvgpu/fifo.h index 90bc5da94..97c987042 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/fifo.h +++ b/drivers/gpu/nvgpu/include/nvgpu/fifo.h @@ -397,6 +397,28 @@ struct nvgpu_fifo { u32 channel_base; }; +static inline const char *nvgpu_id_type_to_str(unsigned int id_type) +{ + const char *str = NULL; + + switch (id_type) { + case ID_TYPE_CHANNEL: + str = "Channel"; + break; + case ID_TYPE_TSG: + str = "TSG"; + break; + case ID_TYPE_RUNLIST: + str = "Runlist"; + break; + default: + str = "Unknown"; + break; + } + + return str; +} + /** * @brief Initialize FIFO software context. 
* diff --git a/drivers/gpu/nvgpu/include/nvgpu/log_common.h b/drivers/gpu/nvgpu/include/nvgpu/log_common.h index b569d4721..b2623f07a 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log_common.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log_common.h @@ -71,5 +71,6 @@ enum nvgpu_log_type { #define gpu_dbg_device BIT(32) /* Device initialization and querying. */ #define gpu_dbg_mig BIT(33) /* MIG info */ +#define gpu_dbg_rec BIT(34) /* Recovery sequence debugging. */ #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h index 5389f2ce9..3dc5dde15 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/rc.h +++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h @@ -37,6 +37,14 @@ #define INVAL_ID (~U32(0U)) +/* + * Requires a string literal for the format - notice the string + * concatination. + */ +#define dbg_rec(g, fmt, args...) \ + nvgpu_log((g), gpu_dbg_rec, "REC | " fmt, ##args) + + struct gk20a; struct nvgpu_fifo; struct nvgpu_tsg; @@ -44,6 +52,46 @@ struct nvgpu_channel; struct nvgpu_pbdma_status_info; struct mmu_fault_info; +static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type) +{ + const char *str = NULL; + + switch (rc_type) { + case RC_TYPE_NO_RC: + str = "None"; + break; + case RC_TYPE_MMU_FAULT: + str = "MMU fault"; + break; + case RC_TYPE_PBDMA_FAULT: + str = "PBDMA fault"; + break; + case RC_TYPE_GR_FAULT: + str = "GR fault"; + break; + case RC_TYPE_PREEMPT_TIMEOUT: + str = "Preemption timeout"; + break; + case RC_TYPE_CTXSW_TIMEOUT: + str = "CTXSW timeout"; + break; + case RC_TYPE_RUNLIST_UPDATE_TIMEOUT: + str = "RL Update timeout"; + break; + case RC_TYPE_FORCE_RESET: + str = "Force reset"; + break; + case RC_TYPE_SCHED_ERR: + str = "Sched err"; + break; + default: + str = "Unknown"; + break; + } + + return str; +} + void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, struct nvgpu_tsg *tsg, bool debug_dump);