mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 02:22:34 +03:00
gpu: nvgpu: report fecs ctxsw errors
Introduce hooks for reporting the following ctxsw errors: CTXSW_WATCHDOG, CTXSW_CRC_MISMATCH, and FAULT_DURING_CTXSW. Add the missing accessors for the CTXSW interrupt registers and the CRC error mailbox enumeration type. Jira NVGPU-1860 Jira NVGPU-1865 Jira NVGPU-1862 Change-Id: I1a4953b874bdb212497f12ec1493bed30d9a0f67 Signed-off-by: Antony Clince Alex <aalex@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2017998 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
1fb5517a96
commit
09d5059369
@@ -93,6 +93,29 @@ void nvgpu_report_gr_exception(struct gk20a *g, u32 inst,
|
||||
static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
|
||||
|
||||
|
||||
/*
 * Report a FECS context-switch error through the optional
 * g->ops.gr.err_ops.report_ctxsw_err hook.
 *
 * A snapshot of the FECS state (current context register and the two ctxsw
 * status registers) is captured together with the caller-supplied channel id
 * and mailbox value, and handed to the hook as a struct ctxsw_err_info.
 *
 * @g:             GPU instance the error was raised on.
 * @err_type:      error identifier passed to the hook as err_id
 *                 (callers in this file pass GPU_FECS_* values).
 * @chid:          channel id stored in the error record.
 * @mailbox_value: mailbox value stored in the error record.
 *
 * Nothing is returned: if the hook is not installed the snapshot is simply
 * discarded, and a hook failure is only logged.
 */
static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
		u32 mailbox_value)
{
	int ret;
	/*
	 * Zero the record first so that no indeterminate bytes are handed to
	 * the hook through the untyped pointer, even if the struct later
	 * grows new members.
	 */
	struct ctxsw_err_info err_info = {0};

	/* Registers are read in the same order as before. */
	err_info.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
	err_info.ctxsw_status0 = gk20a_readl(g, gr_fecs_ctxsw_status_fe_0_r());
	err_info.ctxsw_status1 = gk20a_readl(g, gr_fecs_ctxsw_status_1_r());
	err_info.mailbox_value = mailbox_value;
	err_info.chid = chid;

	if (g->ops.gr.err_ops.report_ctxsw_err != NULL) {
		ret = g->ops.gr.err_ops.report_ctxsw_err(g,
				NVGPU_ERR_MODULE_FECS,
				err_type, &err_info);
		if (ret != 0) {
			/* err_type is u32, so it must be printed with %u. */
			nvgpu_err(g, "Failed to report FECS CTXSW error: %u",
					err_type);
		}
	}
}
|
||||
|
||||
void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
|
||||
{
|
||||
unsigned int i;
|
||||
@@ -3206,6 +3229,8 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
|
||||
ret = -1;
|
||||
} else if ((gr_fecs_intr &
|
||||
gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
|
||||
gr_report_ctxsw_error(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
|
||||
chid, 0);
|
||||
/* currently, recovery is not initiated */
|
||||
nvgpu_err(g, "fecs watchdog triggered for channel %u, "
|
||||
"cannot ctxsw anymore !!", chid);
|
||||
@@ -3220,18 +3245,40 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
|
||||
nvgpu_info(g, "ctxsw intr0 set by ucode, "
|
||||
"timestamp buffer full");
|
||||
nvgpu_gr_fecs_trace_reset_buffer(g);
|
||||
} else
|
||||
#endif
|
||||
/*
|
||||
* The mailbox values may vary across chips hence keeping it
|
||||
* as a HAL.
|
||||
*/
|
||||
if (g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val
|
||||
!= NULL && mailbox_value ==
|
||||
g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val()) {
|
||||
|
||||
gr_report_ctxsw_error(g, GPU_FECS_CTXSW_CRC_MISMATCH,
|
||||
chid, mailbox_value);
|
||||
nvgpu_err(g, "ctxsw intr0 set by ucode, "
|
||||
"ctxsw checksum mismatch");
|
||||
ret = -1;
|
||||
} else {
|
||||
/*
|
||||
* Other errors are also treated as fatal and channel
|
||||
* recovery is initiated and error is reported to
|
||||
* 3LSS.
|
||||
*/
|
||||
gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
|
||||
chid, mailbox_value);
|
||||
nvgpu_err(g,
|
||||
"ctxsw intr0 set by ucode, error_code: 0x%08x",
|
||||
mailbox_value);
|
||||
ret = -1;
|
||||
}
|
||||
#else
|
||||
nvgpu_err(g,
|
||||
"ctxsw intr0 set by ucode, error_code: 0x%08x",
|
||||
mailbox_value);
|
||||
} else if ((gr_fecs_intr &
|
||||
gr_fecs_host_int_status_fault_during_ctxsw_f(1)) != 0U) {
|
||||
gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
|
||||
chid, 0);
|
||||
nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
|
||||
ret = -1;
|
||||
#endif
|
||||
} else {
|
||||
nvgpu_err(g,
|
||||
"unhandled fecs error interrupt 0x%08x for channel %u",
|
||||
|
||||
@@ -78,6 +78,12 @@
|
||||
*/
|
||||
#define GR_TPCS_INFO_FOR_MAPREGISTER 6U
|
||||
|
||||
|
||||
/*
 * gv11b implementation of the get_ctxsw_checksum_mismatch_mailbox_val HAL.
 *
 * Returns the value produced by
 * gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(); the FECS error
 * handler compares the mailbox value against it to recognise a ctxsw
 * checksum mismatch.
 */
u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void)
{
	u32 mismatch_val;

	mismatch_val = gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v();

	return mismatch_val;
}
|
||||
|
||||
bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num)
|
||||
{
|
||||
bool valid = false;
|
||||
@@ -424,6 +430,7 @@ void gr_gv11b_enable_hww_exceptions(struct gk20a *g)
|
||||
void gr_gv11b_fecs_host_int_enable(struct gk20a *g)
|
||||
{
|
||||
gk20a_writel(g, gr_fecs_host_int_enable_r(),
|
||||
gr_fecs_host_int_enable_ctxsw_intr0_enable_f() |
|
||||
gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
|
||||
gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
|
||||
gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
|
||||
|
||||
@@ -76,7 +76,7 @@ void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
|
||||
u32 *zcull_map_tiles);
|
||||
void gr_gv11b_create_sysfs(struct gk20a *g);
|
||||
void gr_gv11b_remove_sysfs(struct gk20a *g);
|
||||
|
||||
u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void);
|
||||
bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num);
|
||||
bool gr_gv11b_is_valid_gfx_class(struct gk20a *g, u32 class_num);
|
||||
bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num);
|
||||
|
||||
@@ -455,6 +455,8 @@ static const struct gpu_ops gv11b_ops = {
|
||||
gk20a_gr_get_global_ctx_cb_buffer_size,
|
||||
.get_global_ctx_pagepool_buffer_size =
|
||||
gk20a_gr_get_global_ctx_pagepool_buffer_size,
|
||||
.get_ctxsw_checksum_mismatch_mailbox_val =
|
||||
gr_gv11b_ctxsw_checksum_mismatch_mailbox_val,
|
||||
.ctxsw_prog = {
|
||||
.hw_get_fecs_header_size =
|
||||
gm20b_ctxsw_prog_hw_get_fecs_header_size,
|
||||
|
||||
@@ -688,6 +688,7 @@ struct gpu_ops {
|
||||
bool enable);
|
||||
} init;
|
||||
|
||||
u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void);
|
||||
u32 (*fecs_falcon_base_addr)(void);
|
||||
u32 (*gpccs_falcon_base_addr)(void);
|
||||
|
||||
@@ -699,6 +700,8 @@ struct gpu_ops {
|
||||
int (*report_gr_err)(struct gk20a *g,
|
||||
u32 hw_id, u32 inst, u32 err_id,
|
||||
u32 status);
|
||||
int (*report_ctxsw_err)(struct gk20a *g,
|
||||
u32 hw_id, u32 err_id, void *data);
|
||||
} err_ops;
|
||||
} gr;
|
||||
struct {
|
||||
|
||||
@@ -2026,6 +2026,10 @@ static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_f(u32 v)
|
||||
{
|
||||
return (v & 0x1U) << 16U;
|
||||
}
|
||||
/*
 * Returns the constant 0x00000001U.
 *
 * NOTE(review): going by the accessor name this is the "active" value of the
 * fault_during_ctxsw field of the FECS host interrupt status register --
 * confirm against the hardware manual.
 */
static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_active_v(void)
{
	const u32 active_val = 0x00000001U;

	return active_val;
}
|
||||
static inline u32 gr_fecs_host_int_status_umimp_firmware_method_f(u32 v)
|
||||
{
|
||||
return (v & 0x1U) << 17U;
|
||||
@@ -2062,6 +2066,10 @@ static inline u32 gr_fecs_host_int_clear_r(void)
|
||||
{
|
||||
return 0x00409c20U;
|
||||
}
|
||||
/*
 * Returns the constant 0x00000001U.
 *
 * NOTE(review): going by the accessor name this is the "clear" value of the
 * ctxsw_intr0 field of the FECS host interrupt clear register -- confirm
 * against the hardware manual.
 */
static inline u32 gr_fecs_host_int_clear_ctxsw_intr0_clear_v(void)
{
	const u32 clear_val = 0x00000001U;

	return clear_val;
}
|
||||
static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_f(u32 v)
|
||||
{
|
||||
return (v & 0x1U) << 1U;
|
||||
@@ -2070,10 +2078,18 @@ static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_clear_f(void)
|
||||
{
|
||||
return 0x2U;
|
||||
}
|
||||
/*
 * Returns the constant 0x00000001U.
 *
 * NOTE(review): going by the accessor name this is the "clear" value of the
 * fault_during_ctxsw field of the FECS host interrupt clear register --
 * confirm against the hardware manual.
 */
static inline u32 gr_fecs_host_int_clear_fault_during_ctxsw_clear_v(void)
{
	const u32 clear_val = 0x00000001U;

	return clear_val;
}
|
||||
/*
 * Returns 0x00409c24U, the value gr_gv11b_fecs_host_int_enable() passes to
 * gk20a_writel() as the register to program with the FECS host interrupt
 * enable bits.
 */
static inline u32 gr_fecs_host_int_enable_r(void)
{
	const u32 reg = 0x00409c24U;

	return reg;
}
|
||||
/*
 * Returns 0x1U, one of the bit values gr_gv11b_fecs_host_int_enable() ORs
 * into the word it writes to the register given by
 * gr_fecs_host_int_enable_r().
 *
 * NOTE(review): by name this enables the ctxsw_intr0 interrupt -- confirm
 * against the hardware manual.
 */
static inline u32 gr_fecs_host_int_enable_ctxsw_intr0_enable_f(void)
{
	const u32 enable_bit = 0x1U;

	return enable_bit;
}
|
||||
static inline u32 gr_fecs_host_int_enable_ctxsw_intr1_enable_f(void)
|
||||
{
|
||||
return 0x2U;
|
||||
@@ -2198,6 +2214,10 @@ static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void)
|
||||
{
|
||||
return 0x00000002U;
|
||||
}
|
||||
/*
 * Returns 0x00000021U, the value gr_gv11b_ctxsw_checksum_mismatch_mailbox_val()
 * hands back to the FECS error handler for comparison with the mailbox value
 * when checking for a ctxsw checksum mismatch.
 */
static inline u32 gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(void)
{
	const u32 mismatch_val = 0x00000021U;

	return mismatch_val;
}
|
||||
static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i)
|
||||
{
|
||||
return 0x004098c0U + i*4U;
|
||||
|
||||
@@ -81,6 +81,18 @@
|
||||
#define GPU_FECS_FALCON_DMEM_ECC_CORRECTED 2U
|
||||
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED 3U
|
||||
#define GPU_FECS_HOST_INT_EXCEPTION 4U
|
||||
#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT 5U
|
||||
#define GPU_FECS_CTXSW_CRC_MISMATCH 6U
|
||||
#define GPU_FECS_FAULT_DURING_CTXSW 7U
|
||||
#define GPU_FECS_CTXSW_INIT_ERROR 8U
|
||||
#define GPU_FECS_INVALID_ERROR 9U
|
||||
struct ctxsw_err_info {
|
||||
u32 curr_ctx;
|
||||
u32 ctxsw_status0;
|
||||
u32 ctxsw_status1;
|
||||
u32 chid;
|
||||
u32 mailbox_value;
|
||||
};
|
||||
|
||||
#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED 0U
|
||||
#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED 1U
|
||||
|
||||
Reference in New Issue
Block a user