gpu: nvgpu: Enable GPCCS debug data logging.

Currently, in case of any FECS error, we only dump the FECS
ctxsw firmware-related registers, mailboxes and trace registers.
With this change, we also dump the GPCCS register space.
This will help in debugging ctxsw-related
failures.

JIRA NVGPU-9560
Bug 3907163 

Change-Id: I61e25883da4455ea1412ca70c5fc3377d9a786a3
Signed-off-by: Kishan <kpalankar@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2850402
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
This commit is contained in:
Kishan
2023-01-30 10:47:55 +00:00
committed by mobile promotions
parent 49a6676ef6
commit 5adf709506
16 changed files with 175 additions and 3 deletions

View File

@@ -51,6 +51,7 @@ void nvgpu_pmu_dump_falcon_stats(struct nvgpu_pmu *pmu)
nvgpu_err(g, "pmu state: %d", pmu->pmu_state); nvgpu_err(g, "pmu state: %d", pmu->pmu_state);
nvgpu_err(g, "elpg state: %d", pmu->elpg_stat); nvgpu_err(g, "elpg state: %d", pmu->elpg_stat);
/* PMU may crash due to FECS crash. Dump FECS status */ /* PMU may crash due to FECS crash. Dump FECS and GPCCS status */
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
} }

View File

@@ -1407,6 +1407,7 @@ static void gk20a_fifo_handle_chsw_fault(struct gk20a *g)
intr = gk20a_readl(g, fifo_intr_chsw_error_r()); intr = gk20a_readl(g, fifo_intr_chsw_error_r());
nvgpu_err(g, "chsw: %08x", intr); nvgpu_err(g, "chsw: %08x", intr);
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
gk20a_writel(g, fifo_intr_chsw_error_r(), intr); gk20a_writel(g, fifo_intr_chsw_error_r(), intr);
} }
@@ -1723,6 +1724,7 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
if (ctxsw) { if (ctxsw) {
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
nvgpu_err(g, "gr_status_r : 0x%x", nvgpu_err(g, "gr_status_r : 0x%x",
gk20a_readl(g, gr_status_r())); gk20a_readl(g, gr_status_r()));
} }

View File

@@ -121,6 +121,83 @@ int gr_gk20a_get_ctx_id(struct gk20a *g,
return 0; return 0;
} }
void gk20a_gpccs_dump_falcon_stats(struct gk20a *g)
{
unsigned int i;
nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqstat : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_irqstat_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqmode : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_irqmode_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqmask : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_irqmask_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_irqdest : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_irqdest_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_debug1 : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_debug1_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_debuginfo : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_debuginfo_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_engctl : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_engctl_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_curctx : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_curctx_r()));
nvgpu_err(g, "gr_gpc0_gpccs_falcon_nxtctx : %d",
gk20a_readl(g, gr_gpc0_gpccs_falcon_nxtctx_r()));
nvgpu_err(g, "gr_gpc0_gpccs_ctxsw_status_1 : %d",
gk20a_readl(g, gr_gpc0_gpccs_ctxsw_status_1_r()));
for (i = 0; i < g->ops.gr.gpc0_gpccs_ctxsw_mailbox_size(); i++) {
nvgpu_err(g, "gr_gpc0_gpccs_ctxsw_mailbox_r(%d) : 0x%x",
i, gk20a_readl(g, gr_gpc0_gpccs_ctxsw_mailbox_r(i)));
}
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_IMB : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_DMB : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_CSW : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_CTX : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_EXCI : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
for (i = 0; i < 4U; i++) {
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_PC));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_PC : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
gk20a_writel(g, gr_gpc0_gpccs_falcon_icd_cmd_r(),
gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f() |
gr_gpc0_gpccs_falcon_icd_cmd_idx_f(PMU_FALCON_REG_SP));
nvgpu_err(g, "GPC0_GPCCS_FALCON_REG_SP : 0x%x",
gk20a_readl(g, gr_gpc_gpccs_falcon_icd_rdata_r()));
}
}
void gk20a_fecs_dump_falcon_stats(struct gk20a *g) void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
{ {
unsigned int i; unsigned int i;
@@ -527,6 +604,7 @@ int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
"timeout waiting on mailbox=%d value=0x%08x", "timeout waiting on mailbox=%d value=0x%08x",
mailbox_id, reg); mailbox_id, reg);
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
gk20a_gr_debug_dump(g); gk20a_gr_debug_dump(g);
return -1; return -1;
} else if (check == WAIT_UCODE_ERROR) { } else if (check == WAIT_UCODE_ERROR) {
@@ -534,6 +612,7 @@ int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
"ucode method failed on mailbox=%d value=0x%08x", "ucode method failed on mailbox=%d value=0x%08x",
mailbox_id, reg); mailbox_id, reg);
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
return -1; return -1;
} }
@@ -5297,9 +5376,10 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
} else if ((gr_fecs_intr & } else if ((gr_fecs_intr &
gr_fecs_host_int_status_watchdog_active_f()) != 0U) { gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
/* currently, recovery is not initiated */ /* currently, recovery is not initiated */
nvgpu_err(g, "fecs watchdog triggered for channel %u, " nvgpu_err(g, "fecs watchdog triggered for channel %u", chid);
"cannot ctxsw anymore !!", chid);
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
gk20a_gr_debug_dump(g);
} else if ((gr_fecs_intr & } else if ((gr_fecs_intr &
gr_fecs_host_int_status_ctxsw_intr_f(CTXSW_INTR0)) != 0U) { gr_fecs_host_int_status_ctxsw_intr_f(CTXSW_INTR0)) != 0U) {
u32 mailbox_value = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)); u32 mailbox_value = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6));
@@ -5323,6 +5403,7 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
"unhandled fecs error interrupt 0x%08x for channel %u", "unhandled fecs error interrupt 0x%08x for channel %u",
gr_fecs_intr, chid); gr_fecs_intr, chid);
gk20a_fecs_dump_falcon_stats(g); gk20a_fecs_dump_falcon_stats(g);
gk20a_gpccs_dump_falcon_stats(g);
} }
gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr); gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr);

View File

@@ -438,6 +438,7 @@ struct gr_gk20a {
}; };
void gk20a_fecs_dump_falcon_stats(struct gk20a *g); void gk20a_fecs_dump_falcon_stats(struct gk20a *g);
void gk20a_gpccs_dump_falcon_stats(struct gk20a *g);
/* contexts associated with a TSG */ /* contexts associated with a TSG */
struct nvgpu_gr_ctx { struct nvgpu_gr_ctx {

View File

@@ -326,6 +326,7 @@ static const struct gpu_ops gm20b_ops = {
gr_gm20b_get_pmm_per_chiplet_offset, gr_gm20b_get_pmm_per_chiplet_offset,
.split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr, .split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr,
.fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v, .fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v,
.gpc0_gpccs_ctxsw_mailbox_size = gr_gpc0_gpccs_ctxsw_mailbox__size_1_v,
.alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers, .alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers,
.map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers, .map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers,
.commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers, .commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers,

View File

@@ -398,6 +398,7 @@ static const struct gpu_ops gp106_ops = {
gr_gm20b_get_pmm_per_chiplet_offset, gr_gm20b_get_pmm_per_chiplet_offset,
.split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr, .split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr,
.fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v, .fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v,
.gpc0_gpccs_ctxsw_mailbox_size = gr_gpc0_gpccs_ctxsw_mailbox__size_1_v,
.alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers, .alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers,
.map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers, .map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers,
.commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers, .commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers,

View File

@@ -361,6 +361,7 @@ static const struct gpu_ops gp10b_ops = {
gr_gm20b_get_pmm_per_chiplet_offset, gr_gm20b_get_pmm_per_chiplet_offset,
.split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr, .split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr,
.fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v, .fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v,
.gpc0_gpccs_ctxsw_mailbox_size = gr_gpc0_gpccs_ctxsw_mailbox__size_1_v,
.alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers, .alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers,
.map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers, .map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers,
.commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers, .commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers,

View File

@@ -483,6 +483,7 @@ static const struct gpu_ops gv100_ops = {
gr_gv11b_get_pmm_per_chiplet_offset, gr_gv11b_get_pmm_per_chiplet_offset,
.split_fbpa_broadcast_addr = gr_gv100_split_fbpa_broadcast_addr, .split_fbpa_broadcast_addr = gr_gv100_split_fbpa_broadcast_addr,
.fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v, .fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v,
.gpc0_gpccs_ctxsw_mailbox_size = gr_gpc0_gpccs_ctxsw_mailbox__size_1_v,
.alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers, .alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers,
.map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers, .map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers,
.commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers, .commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers,

View File

@@ -445,6 +445,7 @@ static const struct gpu_ops gv11b_ops = {
gr_gv11b_get_pmm_per_chiplet_offset, gr_gv11b_get_pmm_per_chiplet_offset,
.split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr, .split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr,
.fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v, .fecs_ctxsw_mailbox_size = gr_fecs_ctxsw_mailbox__size_1_v,
.gpc0_gpccs_ctxsw_mailbox_size = gr_gpc0_gpccs_ctxsw_mailbox__size_1_v,
.alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers, .alloc_global_ctx_buffers = gr_gk20a_alloc_global_ctx_buffers,
.map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers, .map_global_ctx_buffers = gr_gk20a_map_global_ctx_buffers,
.commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers, .commit_global_ctx_buffers = gr_gk20a_commit_global_ctx_buffers,

View File

@@ -517,6 +517,7 @@ struct gpu_ops {
u32 *priv_addr_table, u32 *priv_addr_table,
u32 *priv_addr_table_index); u32 *priv_addr_table_index);
u32 (*fecs_ctxsw_mailbox_size)(void); u32 (*fecs_ctxsw_mailbox_size)(void);
u32 (*gpc0_gpccs_ctxsw_mailbox_size)(void);
int (*init_sw_bundle64)(struct gk20a *g); int (*init_sw_bundle64)(struct gk20a *g);
int (*alloc_global_ctx_buffers)(struct gk20a *g); int (*alloc_global_ctx_buffers)(struct gk20a *g);
int (*map_global_ctx_buffers)(struct gk20a *g, int (*map_global_ctx_buffers)(struct gk20a *g,

View File

@@ -1380,6 +1380,10 @@ static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
{ {
return 0x00502400U; return 0x00502400U;
} }
/*
 * Returns the constant 0x10 (16).
 * By name, the number of entries addressable through
 * gr_gpc0_gpccs_ctxsw_mailbox_r(i) - confirm against the HW manual.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox__size_1_v(void)
{
return 0x00000010U;
}
static inline u32 gr_fecs_ctxsw_idlestate_r(void) static inline u32 gr_fecs_ctxsw_idlestate_r(void)
{ {
return 0x00409420U; return 0x00409420U;
@@ -3804,4 +3808,61 @@ static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f(void)
{ {
return 0x40000000U; return 0x40000000U;
} }
/*
 * Returns the constant 0x00502008U; by name, the offset of the GPC0 GPCCS
 * falcon IRQSTAT register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_irqstat_r(void)
{
return 0x00502008U;
}
/*
 * Returns the constant 0x0050200cU; by name, the offset of the GPC0 GPCCS
 * falcon IRQMODE register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_irqmode_r(void)
{
return 0x0050200cU;
}
/*
 * Returns the constant 0x00502018U; by name, the offset of the GPC0 GPCCS
 * falcon IRQMASK register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_irqmask_r(void)
{
return 0x00502018U;
}
/*
 * Returns the constant 0x0050201cU; by name, the offset of the GPC0 GPCCS
 * falcon IRQDEST register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_irqdest_r(void)
{
return 0x0050201cU;
}
/*
 * Returns the constant 0x00502090U; by name, the offset of the GPC0 GPCCS
 * falcon DEBUG1 register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_debug1_r(void)
{
return 0x00502090U;
}
/*
 * Returns the constant 0x00502094U; by name, the offset of the GPC0 GPCCS
 * falcon DEBUGINFO register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_debuginfo_r(void)
{
return 0x00502094U;
}
/*
 * Returns the constant 0x005020a4U; by name, the offset of the GPC0 GPCCS
 * falcon ENGCTL register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_engctl_r(void)
{
return 0x005020a4U;
}
/*
 * Returns the constant 0x00502050U; by name, the offset of the GPC0 GPCCS
 * falcon CURCTX (current context) register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_curctx_r(void)
{
return 0x00502050U;
}
/*
 * Returns the constant 0x00502054U; by name, the offset of the GPC0 GPCCS
 * falcon NXTCTX (next context) register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_nxtctx_r(void)
{
return 0x00502054U;
}
/*
 * Returns 0x00502800U + i * 4: entry i of an array of values spaced 4 bytes
 * apart, starting at 0x00502800. By name, the offset of GPC0 GPCCS ctxsw
 * mailbox register i.
 *
 * No range check is done on i; the caller is responsible for keeping it
 * within the valid mailbox count.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox_r(u32 i)
{
return 0x00502800U + i*4U;
}
/*
 * Returns the constant 0x00502200U; by name, the offset of the GPC0 GPCCS
 * falcon ICD_CMD register (confirm against the HW manual).
 */
static inline u32 gr_gpc0_gpccs_falcon_icd_cmd_r(void)
{
return 0x00502200U;
}
/*
 * Returns the constant 0x8U; by name, the ICD_CMD opcode field value that
 * requests a register read (RREG) - confirm against the HW manual.
 */
static inline u32 gr_gpc0_gpccs_falcon_icd_cmd_opc_rreg_f(void)
{
return 0x8U;
}
/*
 * Masks v to its low 5 bits (0x1f) and shifts the result left by 8, i.e.
 * places v in bits 12:8 of the returned word; higher bits of v are dropped.
 * By name, this builds the IDX field of the ICD_CMD register.
 */
static inline u32 gr_gpc0_gpccs_falcon_icd_cmd_idx_f(u32 v)
{
return (v & 0x1fU) << 8U;
}
/*
 * Returns the constant 0x0050220cU; by name, the offset of the GPCCS falcon
 * ICD_RDATA register (confirm against the HW manual).
 *
 * NOTE(review): the name uses a "gr_gpc_" prefix while the other GPCCS
 * accessors here use "gr_gpc0_", although the value lies 0xc above
 * gr_gpc0_gpccs_falcon_icd_cmd_r() (0x00502200). Confirm whether the
 * different prefix is intentional before renaming; callers depend on it.
 */
static inline u32 gr_gpc_gpccs_falcon_icd_rdata_r(void)
{
return 0x0050220cU;
}
#endif #endif

View File

@@ -1396,6 +1396,10 @@ static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
{ {
return 0x00502400U; return 0x00502400U;
} }
/*
 * Returns the constant 0x10 (16).
 * By name, the number of GPC0 GPCCS ctxsw mailbox registers - confirm
 * against the HW manual for this chip.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox__size_1_v(void)
{
return 0x00000010U;
}
static inline u32 gr_fecs_ctxsw_idlestate_r(void) static inline u32 gr_fecs_ctxsw_idlestate_r(void)
{ {
return 0x00409420U; return 0x00409420U;

View File

@@ -1508,6 +1508,10 @@ static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
{ {
return 0x00502400U; return 0x00502400U;
} }
/*
 * Returns the constant 0x10 (16).
 * By name, the number of GPC0 GPCCS ctxsw mailbox registers - confirm
 * against the HW manual for this chip.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox__size_1_v(void)
{
return 0x00000010U;
}
static inline u32 gr_fecs_ctxsw_idlestate_r(void) static inline u32 gr_fecs_ctxsw_idlestate_r(void)
{ {
return 0x00409420U; return 0x00409420U;

View File

@@ -1584,6 +1584,10 @@ static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
{ {
return 0x00502400U; return 0x00502400U;
} }
/*
 * Returns the constant 0x10 (16).
 * By name, the number of GPC0 GPCCS ctxsw mailbox registers - confirm
 * against the HW manual for this chip.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox__size_1_v(void)
{
return 0x00000010U;
}
static inline u32 gr_fecs_ctxsw_idlestate_r(void) static inline u32 gr_fecs_ctxsw_idlestate_r(void)
{ {
return 0x00409420U; return 0x00409420U;

View File

@@ -1816,6 +1816,10 @@ static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
{ {
return 0x00502400U; return 0x00502400U;
} }
/*
 * Returns the constant 0x10 (16).
 * By name, the number of GPC0 GPCCS ctxsw mailbox registers - confirm
 * against the HW manual for this chip.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox__size_1_v(void)
{
return 0x00000010U;
}
static inline u32 gr_fecs_ctxsw_idlestate_r(void) static inline u32 gr_fecs_ctxsw_idlestate_r(void)
{ {
return 0x00409420U; return 0x00409420U;

View File

@@ -2420,6 +2420,10 @@ static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
{ {
return 0x00502400U; return 0x00502400U;
} }
/*
 * Returns the constant 0x10 (16).
 * By name, the number of GPC0 GPCCS ctxsw mailbox registers - confirm
 * against the HW manual for this chip.
 */
static inline u32 gr_gpc0_gpccs_ctxsw_mailbox__size_1_v(void)
{
return 0x00000010U;
}
static inline u32 gr_fecs_ctxsw_idlestate_r(void) static inline u32 gr_fecs_ctxsw_idlestate_r(void)
{ {
return 0x00409420U; return 0x00409420U;