gpu: nvgpu: report fecs ctxsw errors

Introduce hooks for reporting the following ctxsw errors:
CTXSW_WATCHDOG
CTXSW_CRC_MISMATCH
FAULT_DURING_CTXSW

Add the missing accessors for the CTXSW interrupt registers and the
mailbox enumeration value for the CRC (checksum mismatch) error.

Jira NVGPU-1860
Jira NVGPU-1865
Jira NVGPU-1862

Change-Id: I1a4953b874bdb212497f12ec1493bed30d9a0f67
Signed-off-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2017998
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Antony Clince Alex
2019-02-13 11:24:23 +05:30
committed by mobile promotions
parent 1fb5517a96
commit 09d5059369
7 changed files with 97 additions and 6 deletions

View File

@@ -93,6 +93,29 @@ void nvgpu_report_gr_exception(struct gk20a *g, u32 inst,
static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
/*
 * Sample the FECS context-switch state and hand it to the ctxsw error
 * reporting hook (g->ops.gr.err_ops.report_ctxsw_err), if one is installed.
 *
 * @g:             GPU instance whose FECS registers are read.
 * @err_type:      error id forwarded to the hook as its err_id argument.
 * @chid:          channel id recorded in the report.
 * @mailbox_value: mailbox value recorded in the report.
 *
 * The three FECS registers are read unconditionally, before the hook pointer
 * is checked. A non-zero return from the hook is only logged; nothing is
 * returned to the caller.
 */
static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
		u32 mailbox_value)
{
	int ret = 0;
	struct ctxsw_err_info err_info;

	/* Keep the register reads as separate, ordered statements. */
	err_info.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
	err_info.ctxsw_status0 = gk20a_readl(g, gr_fecs_ctxsw_status_fe_0_r());
	err_info.ctxsw_status1 = gk20a_readl(g, gr_fecs_ctxsw_status_1_r());
	err_info.mailbox_value = mailbox_value;
	err_info.chid = chid;

	if (g->ops.gr.err_ops.report_ctxsw_err != NULL) {
		ret = g->ops.gr.err_ops.report_ctxsw_err(g,
				NVGPU_ERR_MODULE_FECS,
				err_type, (void *)&err_info);
		if (ret != 0) {
			/* err_type is a u32, so print it as unsigned. */
			nvgpu_err(g, "Failed to report FECS CTXSW error: %u",
					err_type);
		}
	}
}
void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
{
unsigned int i;
@@ -3206,6 +3229,8 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
ret = -1;
} else if ((gr_fecs_intr &
gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
gr_report_ctxsw_error(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
chid, 0);
/* currently, recovery is not initiated */
nvgpu_err(g, "fecs watchdog triggered for channel %u, "
"cannot ctxsw anymore !!", chid);
@@ -3220,18 +3245,40 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
nvgpu_info(g, "ctxsw intr0 set by ucode, "
"timestamp buffer full");
nvgpu_gr_fecs_trace_reset_buffer(g);
} else
#endif
/*
* The mailbox values may vary across chips hence keeping it
* as a HAL.
*/
if (g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val
!= NULL && mailbox_value ==
g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val()) {
gr_report_ctxsw_error(g, GPU_FECS_CTXSW_CRC_MISMATCH,
chid, mailbox_value);
nvgpu_err(g, "ctxsw intr0 set by ucode, "
"ctxsw checksum mismatch");
ret = -1;
} else {
/*
* Other errors are also treated as fatal and channel
* recovery is initiated and error is reported to
* 3LSS.
*/
gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
chid, mailbox_value);
nvgpu_err(g,
"ctxsw intr0 set by ucode, error_code: 0x%08x",
mailbox_value);
ret = -1;
}
#else
nvgpu_err(g,
"ctxsw intr0 set by ucode, error_code: 0x%08x",
mailbox_value);
} else if ((gr_fecs_intr &
gr_fecs_host_int_status_fault_during_ctxsw_f(1)) != 0U) {
gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
chid, 0);
nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
ret = -1;
#endif
} else {
nvgpu_err(g,
"unhandled fecs error interrupt 0x%08x for channel %u",

View File

@@ -78,6 +78,12 @@
*/
#define GR_TPCS_INFO_FOR_MAPREGISTER 6U
/*
 * Return the FECS mailbox value that denotes a ctxsw checksum mismatch,
 * as provided by gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v().
 */
u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void)
{
	const u32 mismatch_val =
		gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v();

	return mismatch_val;
}
bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num)
{
bool valid = false;
@@ -424,6 +430,7 @@ void gr_gv11b_enable_hww_exceptions(struct gk20a *g)
void gr_gv11b_fecs_host_int_enable(struct gk20a *g)
{
gk20a_writel(g, gr_fecs_host_int_enable_r(),
gr_fecs_host_int_enable_ctxsw_intr0_enable_f() |
gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |

View File

@@ -76,7 +76,7 @@ void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
u32 *zcull_map_tiles);
void gr_gv11b_create_sysfs(struct gk20a *g);
void gr_gv11b_remove_sysfs(struct gk20a *g);
u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void);
bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num);
bool gr_gv11b_is_valid_gfx_class(struct gk20a *g, u32 class_num);
bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num);

View File

@@ -455,6 +455,8 @@ static const struct gpu_ops gv11b_ops = {
gk20a_gr_get_global_ctx_cb_buffer_size,
.get_global_ctx_pagepool_buffer_size =
gk20a_gr_get_global_ctx_pagepool_buffer_size,
.get_ctxsw_checksum_mismatch_mailbox_val =
gr_gv11b_ctxsw_checksum_mismatch_mailbox_val,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,

View File

@@ -688,6 +688,7 @@ struct gpu_ops {
bool enable);
} init;
u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void);
u32 (*fecs_falcon_base_addr)(void);
u32 (*gpccs_falcon_base_addr)(void);
@@ -699,6 +700,8 @@ struct gpu_ops {
int (*report_gr_err)(struct gk20a *g,
u32 hw_id, u32 inst, u32 err_id,
u32 status);
int (*report_ctxsw_err)(struct gk20a *g,
u32 hw_id, u32 err_id, void *data);
} err_ops;
} gr;
struct {

View File

@@ -2026,6 +2026,10 @@ static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_f(u32 v)
{
return (v & 0x1U) << 16U;
}
/*
 * Returns the constant 0x1.
 * NOTE(review): going by the name, this is the "active" value of the
 * fault_during_ctxsw field of gr_fecs_host_int_status - confirm against
 * the hardware manual.
 */
static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_active_v(void)
{
	const u32 active_val = 0x1U;

	return active_val;
}
static inline u32 gr_fecs_host_int_status_umimp_firmware_method_f(u32 v)
{
return (v & 0x1U) << 17U;
@@ -2062,6 +2066,10 @@ static inline u32 gr_fecs_host_int_clear_r(void)
{
return 0x00409c20U;
}
/*
 * Returns the constant 0x1.
 * NOTE(review): going by the name, this is the "clear" value of the
 * ctxsw_intr0 field of gr_fecs_host_int_clear - confirm against the
 * hardware manual.
 */
static inline u32 gr_fecs_host_int_clear_ctxsw_intr0_clear_v(void)
{
	const u32 clear_val = 0x1U;

	return clear_val;
}
static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_f(u32 v)
{
return (v & 0x1U) << 1U;
@@ -2070,10 +2078,18 @@ static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_clear_f(void)
{
return 0x2U;
}
/*
 * Returns the constant 0x1.
 * NOTE(review): going by the name, this is the "clear" value of the
 * fault_during_ctxsw field of gr_fecs_host_int_clear - confirm against
 * the hardware manual.
 */
static inline u32 gr_fecs_host_int_clear_fault_during_ctxsw_clear_v(void)
{
	const u32 clear_val = 0x1U;

	return clear_val;
}
/*
 * Returns the constant 0x00409c24.
 * NOTE(review): going by the _r suffix, this is the register offset of
 * gr_fecs_host_int_enable - confirm against the hardware manual.
 */
static inline u32 gr_fecs_host_int_enable_r(void)
{
	const u32 reg_offset = 0x00409c24U;

	return reg_offset;
}
/*
 * Returns the constant 0x1 (bit 0 set).
 * NOTE(review): going by the name, this is the in-place field value that
 * enables ctxsw_intr0 in gr_fecs_host_int_enable - confirm against the
 * hardware manual.
 */
static inline u32 gr_fecs_host_int_enable_ctxsw_intr0_enable_f(void)
{
	const u32 enable_field = 0x1U << 0U;

	return enable_field;
}
static inline u32 gr_fecs_host_int_enable_ctxsw_intr1_enable_f(void)
{
return 0x2U;
@@ -2198,6 +2214,10 @@ static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void)
{
return 0x00000002U;
}
/*
 * Returns the constant 0x21.
 * NOTE(review): going by the name, this is the FECS ctxsw mailbox value
 * that signals a ctxsw checksum mismatch - confirm against the hardware
 * manual.
 */
static inline u32 gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(void)
{
	const u32 checksum_mismatch = 0x21U;

	return checksum_mismatch;
}
static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i)
{
return 0x004098c0U + i*4U;

View File

@@ -81,6 +81,18 @@
#define GPU_FECS_FALCON_DMEM_ECC_CORRECTED 2U
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED 3U
#define GPU_FECS_HOST_INT_EXCEPTION 4U
/*
 * FECS context-switch error ids. The WATCHDOG_TIMEOUT, CRC_MISMATCH and
 * FAULT_DURING_CTXSW values are the ones passed as the error type when a
 * ctxsw error is reported from the FECS error interrupt handler.
 * NOTE(review): CTXSW_INIT_ERROR and INVALID_ERROR are not referenced in
 * the code visible alongside this change - confirm their users.
 */
#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT 5U
#define GPU_FECS_CTXSW_CRC_MISMATCH 6U
#define GPU_FECS_FAULT_DURING_CTXSW 7U
#define GPU_FECS_CTXSW_INIT_ERROR 8U
#define GPU_FECS_INVALID_ERROR 9U
/*
 * FECS context-switch error details passed (as an opaque pointer) to the
 * report_ctxsw_err hook.
 */
struct ctxsw_err_info {
/* Value read from the gr_fecs_current_ctx register. */
u32 curr_ctx;
/* Value read from the gr_fecs_ctxsw_status_fe_0 register. */
u32 ctxsw_status0;
/* Value read from the gr_fecs_ctxsw_status_1 register. */
u32 ctxsw_status1;
/* Id of the channel the error is reported against. */
u32 chid;
/* Mailbox value supplied by the reporter (0 when none is passed). */
u32 mailbox_value;
};
#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED 0U
#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED 1U