From 09d50593699ea8ca6ae1ed17b534c596a8d971d1 Mon Sep 17 00:00:00 2001 From: Antony Clince Alex Date: Wed, 13 Feb 2019 11:24:23 +0530 Subject: [PATCH] gpu: nvgpu: report fecs ctxsw errors Introduce hooks for reporting the following ctxsw errors. CTXSW_WATCHDOG CTXSW_CRC_MISMATCH FAULT_DURING_CTXSW Add missing accessors for CTXSW interrupt registers and CRC error mailbox enumeration type. Jira NVGPU-1860 Jira NVGPU-1865 Jira NVGPU-1862 Change-Id: I1a4953b874bdb212497f12ec1493bed30d9a0f67 Signed-off-by: Antony Clince Alex Reviewed-on: https://git-master.nvidia.com/r/2017998 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 57 +++++++++++++++++-- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 7 +++ drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 2 +- drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 2 + drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 3 + .../include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 20 +++++++ drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 12 ++++ 7 files changed, 97 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index f54108028..c94e69ab2 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -93,6 +93,29 @@ void nvgpu_report_gr_exception(struct gk20a *g, u32 inst, static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g); +static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid, + u32 mailbox_value) +{ + int ret = 0; + struct ctxsw_err_info err_info; + + err_info.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); + err_info.ctxsw_status0 = gk20a_readl(g, gr_fecs_ctxsw_status_fe_0_r()); + err_info.ctxsw_status1 = gk20a_readl(g, gr_fecs_ctxsw_status_1_r()); + err_info.mailbox_value = mailbox_value; + err_info.chid = chid; + + if (g->ops.gr.err_ops.report_ctxsw_err != NULL) { + ret = g->ops.gr.err_ops.report_ctxsw_err(g, + NVGPU_ERR_MODULE_FECS, + err_type, (void *)&err_info); + if (ret != 0) { + nvgpu_err(g, "Failed to report FECS CTXSW error: %d", + err_type); + } + } +} + void gk20a_fecs_dump_falcon_stats(struct gk20a *g) { unsigned int i; @@ -3206,6 +3229,8 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch, ret = -1; } else if ((gr_fecs_intr & gr_fecs_host_int_status_watchdog_active_f()) != 0U) { + gr_report_ctxsw_error(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT, + chid, 0); /* currently, recovery is not initiated */ nvgpu_err(g, "fecs watchdog triggered for channel %u, " "cannot ctxsw anymore !!", chid); @@ -3220,18 +3245,40 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch, nvgpu_info(g, "ctxsw intr0 set by ucode, " "timestamp buffer full"); nvgpu_gr_fecs_trace_reset_buffer(g); + } else +#endif + /* + * The mailbox values may vary across chips hence keeping it + * as a HAL. + */ + if (g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val + != NULL && mailbox_value == + g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val()) { + + gr_report_ctxsw_error(g, GPU_FECS_CTXSW_CRC_MISMATCH, + chid, mailbox_value); + nvgpu_err(g, "ctxsw intr0 set by ucode, " + "ctxsw checksum mismatch"); + ret = -1; } else { + /* + * Other errors are also treated as fatal and channel + * recovery is initiated and error is reported to + * 3LSS. + */ + gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW, + chid, mailbox_value); nvgpu_err(g, "ctxsw intr0 set by ucode, error_code: 0x%08x", mailbox_value); ret = -1; } -#else - nvgpu_err(g, - "ctxsw intr0 set by ucode, error_code: 0x%08x", - mailbox_value); + } else if ((gr_fecs_intr & + gr_fecs_host_int_status_fault_during_ctxsw_f(1)) != 0U) { + gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW, + chid, 0); + nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid); ret = -1; -#endif } else { nvgpu_err(g, "unhandled fecs error interrupt 0x%08x for channel %u", diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 378e03695..526c9cc2f 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -78,6 +78,12 @@ */ #define GR_TPCS_INFO_FOR_MAPREGISTER 6U + +u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void) +{ + return gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(); +} + bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num) { bool valid = false; @@ -424,6 +430,7 @@ void gr_gv11b_enable_hww_exceptions(struct gk20a *g) void gr_gv11b_fecs_host_int_enable(struct gk20a *g) { gk20a_writel(g, gr_fecs_host_int_enable_r(), + gr_fecs_host_int_enable_ctxsw_intr0_enable_f() | gr_fecs_host_int_enable_ctxsw_intr1_enable_f() | gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() | gr_fecs_host_int_enable_umimp_firmware_method_enable_f() | diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 45b979fcb..10f2c8b59 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -76,7 +76,7 @@ void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries, u32 *zcull_map_tiles); void gr_gv11b_create_sysfs(struct gk20a *g); void gr_gv11b_remove_sysfs(struct gk20a *g); - +u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void); bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num); bool gr_gv11b_is_valid_gfx_class(struct gk20a *g, u32 class_num); bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num); diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index 564ada89e..83ff2c837 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -455,6 +455,8 @@ static const struct gpu_ops gv11b_ops = { gk20a_gr_get_global_ctx_cb_buffer_size, .get_global_ctx_pagepool_buffer_size = gk20a_gr_get_global_ctx_pagepool_buffer_size, + .get_ctxsw_checksum_mismatch_mailbox_val = + gr_gv11b_ctxsw_checksum_mismatch_mailbox_val, .ctxsw_prog = { .hw_get_fecs_header_size = gm20b_ctxsw_prog_hw_get_fecs_header_size, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index b0436bea7..7ec9a2b80 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -688,6 +688,7 @@ struct gpu_ops { bool enable); } init; + u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void); u32 (*fecs_falcon_base_addr)(void); u32 (*gpccs_falcon_base_addr)(void); @@ -699,6 +700,8 @@ struct gpu_ops { int (*report_gr_err)(struct gk20a *g, u32 hw_id, u32 inst, u32 err_id, u32 status); + int (*report_ctxsw_err)(struct gk20a *g, + u32 hw_id, u32 err_id, void *data); } err_ops; } gr; struct { diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h index 5874bc4e9..2769483a8 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h @@ -2026,6 +2026,10 @@ static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_f(u32 v) { return (v & 0x1U) << 16U; } +static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_active_v(void) +{ + return 0x00000001U; +} static inline u32 gr_fecs_host_int_status_umimp_firmware_method_f(u32 v) { return (v & 0x1U) << 17U; @@ -2062,6 +2066,10 @@ static inline u32 gr_fecs_host_int_clear_r(void) { return 0x00409c20U; } +static inline u32 gr_fecs_host_int_clear_ctxsw_intr0_clear_v(void) +{ + return 0x00000001U; +} static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_f(u32 v) { return (v & 0x1U) << 1U; @@ -2070,10 +2078,18 @@ static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_clear_f(void) { return 0x2U; } +static inline u32 gr_fecs_host_int_clear_fault_during_ctxsw_clear_v(void) +{ + return 0x00000001U; +} static inline u32 gr_fecs_host_int_enable_r(void) { return 0x00409c24U; } +static inline u32 gr_fecs_host_int_enable_ctxsw_intr0_enable_f(void) +{ + return 0x1U; +} static inline u32 gr_fecs_host_int_enable_ctxsw_intr1_enable_f(void) { return 0x2U; @@ -2198,6 +2214,10 @@ static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void) { return 0x00000002U; } +static inline u32 gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(void) +{ + return 0x00000021U; +} static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i) { return 0x004098c0U + i*4U; diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 11a4d1ece..29309489c 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -81,6 +81,18 @@ #define GPU_FECS_FALCON_DMEM_ECC_CORRECTED 2U #define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED 3U #define GPU_FECS_HOST_INT_EXCEPTION 4U +#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT 5U +#define GPU_FECS_CTXSW_CRC_MISMATCH 6U +#define GPU_FECS_FAULT_DURING_CTXSW 7U +#define GPU_FECS_CTXSW_INIT_ERROR 8U +#define GPU_FECS_INVALID_ERROR 9U +struct ctxsw_err_info { + u32 curr_ctx; + u32 ctxsw_status0; + u32 ctxsw_status1; + u32 chid; + u32 mailbox_value; +}; #define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED 0U #define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED 1U