gpu: nvgpu: report fecs ctxsw errors

Introduce hooks for reporting the following ctxsw errors: CTXSW_WATCHDOG, CTXSW_CRC_MISMATCH, FAULT_DURING_CTXSW. Add missing accessors for the CTXSW interrupt registers and the CRC error mailbox enumeration type. Jira NVGPU-1860 Jira NVGPU-1865 Jira NVGPU-1862 Change-Id: I1a4953b874bdb212497f12ec1493bed30d9a0f67 Signed-off-by: Antony Clince Alex <aalex@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2017998 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 02:22:34 +03:00 · 2019-02-13 11:24:23 +05:30
parent 1fb5517a96
commit 09d5059369
7 changed files with 97 additions and 6 deletions
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -93,6 +93,29 @@ void nvgpu_report_gr_exception(struct gk20a *g, u32 inst,
 static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);


+static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
+		u32 mailbox_value)
+{	/* Collect FECS ctxsw state and pass it to the optional err_ops.report_ctxsw_err hook. */
+	int ret = 0;
+	struct ctxsw_err_info err_info;	/* every field is assigned below before use */
+
+	err_info.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
+	err_info.ctxsw_status0 = gk20a_readl(g, gr_fecs_ctxsw_status_fe_0_r());
+	err_info.ctxsw_status1 = gk20a_readl(g, gr_fecs_ctxsw_status_1_r());
+	err_info.mailbox_value = mailbox_value;	/* caller-supplied, stored unmodified */
+	err_info.chid = chid;	/* caller-supplied, stored unmodified */
+
+	if (g->ops.gr.err_ops.report_ctxsw_err != NULL) {	/* hook is optional; nothing is reported when it is unset */
+		ret = g->ops.gr.err_ops.report_ctxsw_err(g,
+				NVGPU_ERR_MODULE_FECS,
+				err_type, (void *)&err_info);	/* the hook receives the info as an opaque void *data */
+		if (ret != 0) {	/* a reporting failure is only logged; this function returns void */
+			nvgpu_err(g, "Failed to report FECS CTXSW error: %u",
+					err_type);	/* err_type is u32, hence %u */
+		}
+	}
+}
+
 void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
 {
 	unsigned int i;
@@ -3206,6 +3229,8 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
 		ret = -1;
 	} else if ((gr_fecs_intr &
 			gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
+		gr_report_ctxsw_error(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
+				chid, 0);
 		/* currently, recovery is not initiated */
 		nvgpu_err(g, "fecs watchdog triggered for channel %u, "
 				"cannot ctxsw anymore !!", chid);
@@ -3220,18 +3245,40 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
 			nvgpu_info(g, "ctxsw intr0 set by ucode, "
 					"timestamp buffer full");
 			nvgpu_gr_fecs_trace_reset_buffer(g);
+		} else
+#endif
+		/*
+		 * The mailbox values may vary across chips hence keeping it
+		 * as a HAL.
+		 */
+		if (g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val
+				!= NULL && mailbox_value ==
+				g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val()) {
+
+			gr_report_ctxsw_error(g, GPU_FECS_CTXSW_CRC_MISMATCH,
+					chid, mailbox_value);
+			nvgpu_err(g, "ctxsw intr0 set by ucode, "
+					"ctxsw checksum mismatch");
+			ret = -1;
 		} else {
+			/*
+			 * Other errors are also treated as fatal and channel
+			 * recovery is initiated and error is reported to
+			 * 3LSS.
+			 */
+			gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
+					chid, mailbox_value);
 			nvgpu_err(g,
 				 "ctxsw intr0 set by ucode, error_code: 0x%08x",
 				 mailbox_value);
 			ret = -1;
 		}
-#else
-		nvgpu_err(g,
-			 "ctxsw intr0 set by ucode, error_code: 0x%08x",
-			 mailbox_value);
+	} else if ((gr_fecs_intr &
+			gr_fecs_host_int_status_fault_during_ctxsw_f(1)) != 0U) {
+		gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
+				chid, 0);
+		nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
 		ret = -1;
-#endif
 	} else {
 		nvgpu_err(g,
 			"unhandled fecs error interrupt 0x%08x for channel %u",
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -78,6 +78,12 @@
 */
 #define GR_TPCS_INFO_FOR_MAPREGISTER 6U

+
+u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void)
+{	/* gv11b implementation of the gr.get_ctxsw_checksum_mismatch_mailbox_val HAL op. */
+	return gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v();	/* value comes from the gv11b hw_gr header accessor */
+}
+
 bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num)
 {
 	bool valid = false;
@@ -424,6 +430,7 @@ void gr_gv11b_enable_hww_exceptions(struct gk20a *g)
 void gr_gv11b_fecs_host_int_enable(struct gk20a *g)
 {
 	gk20a_writel(g, gr_fecs_host_int_enable_r(),
+		     gr_fecs_host_int_enable_ctxsw_intr0_enable_f() |
 		     gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
 		     gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
 		     gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -76,7 +76,7 @@ void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
 					u32 *zcull_map_tiles);
 void gr_gv11b_create_sysfs(struct gk20a *g);
 void gr_gv11b_remove_sysfs(struct gk20a *g);
-
+u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void);
 bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num);
 bool gr_gv11b_is_valid_gfx_class(struct gk20a *g, u32 class_num);
 bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num);
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -455,6 +455,8 @@ static const struct gpu_ops gv11b_ops = {
 			gk20a_gr_get_global_ctx_cb_buffer_size,
 		.get_global_ctx_pagepool_buffer_size =
 			gk20a_gr_get_global_ctx_pagepool_buffer_size,
+		.get_ctxsw_checksum_mismatch_mailbox_val =
+				gr_gv11b_ctxsw_checksum_mismatch_mailbox_val,
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
 				gm20b_ctxsw_prog_hw_get_fecs_header_size,
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -688,6 +688,7 @@ struct gpu_ops {
 				bool enable);
 		} init;

+		u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void);
 		u32 (*fecs_falcon_base_addr)(void);
 		u32 (*gpccs_falcon_base_addr)(void);

@@ -699,6 +700,8 @@ struct gpu_ops {
 			int (*report_gr_err)(struct gk20a *g,
 					u32 hw_id, u32 inst, u32 err_id,
 					u32 status);
+			int (*report_ctxsw_err)(struct gk20a *g,
+					u32 hw_id, u32 err_id, void *data);
 		} err_ops;
 	} gr;
 	struct {
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
@@ -2026,6 +2026,10 @@ static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_f(u32 v)
 {
 	return (v & 0x1U) << 16U;
 }
+static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_active_v(void)
+{
+	return 0x00000001U;	/* value 1 as-is; gr_fecs_host_int_status_fault_during_ctxsw_f(v) is what places it at bit 16 */
+}
 static inline u32 gr_fecs_host_int_status_umimp_firmware_method_f(u32 v)
 {
 	return (v & 0x1U) << 17U;
@@ -2062,6 +2066,10 @@ static inline u32 gr_fecs_host_int_clear_r(void)
 {
 	return 0x00409c20U;
 }
+static inline u32 gr_fecs_host_int_clear_ctxsw_intr0_clear_v(void)
+{
+	return 0x00000001U;	/* field value 1, returned as-is (no shift applied here) */
+}
 static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_f(u32 v)
 {
 	return (v & 0x1U) << 1U;
@@ -2070,10 +2078,18 @@ static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_clear_f(void)
 {
 	return 0x2U;
 }
+static inline u32 gr_fecs_host_int_clear_fault_during_ctxsw_clear_v(void)
+{
+	return 0x00000001U;	/* field value 1, returned as-is (no shift applied here) */
+}
 static inline u32 gr_fecs_host_int_enable_r(void)
 {
 	return 0x00409c24U;
 }
+static inline u32 gr_fecs_host_int_enable_ctxsw_intr0_enable_f(void)
+{
+	return 0x1U;	/* OR-ed into the value gr_gv11b_fecs_host_int_enable() writes to gr_fecs_host_int_enable_r() */
+}
 static inline u32 gr_fecs_host_int_enable_ctxsw_intr1_enable_f(void)
 {
 	return 0x2U;
@@ -2198,6 +2214,10 @@ static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void)
 {
 	return 0x00000002U;
 }
+static inline u32 gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(void)
+{
+	return 0x00000021U;	/* handed out through gr_gv11b_ctxsw_checksum_mismatch_mailbox_val() */
+}
 static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i)
 {
 	return 0x004098c0U + i*4U;
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -81,6 +81,18 @@
 #define GPU_FECS_FALCON_DMEM_ECC_CORRECTED	2U
 #define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED	3U
 #define GPU_FECS_HOST_INT_EXCEPTION		4U
+#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT		5U	/* reported when the FECS host watchdog interrupt is active */
+#define GPU_FECS_CTXSW_CRC_MISMATCH		6U	/* reported when the ctxsw mailbox holds the checksum-mismatch value */
+#define GPU_FECS_FAULT_DURING_CTXSW		7U	/* reported for the fault_during_ctxsw interrupt and for other ctxsw intr0 mailbox errors */
+#define GPU_FECS_CTXSW_INIT_ERROR		8U
+#define GPU_FECS_INVALID_ERROR			9U
+struct ctxsw_err_info {	/* payload handed as void *data to gr.err_ops.report_ctxsw_err; filled by gr_report_ctxsw_error() */
+	u32 curr_ctx;	/* readback of gr_fecs_current_ctx_r() */
+	u32 ctxsw_status0;	/* readback of gr_fecs_ctxsw_status_fe_0_r() */
+	u32 ctxsw_status1;	/* readback of gr_fecs_ctxsw_status_1_r() */
+	u32 chid;	/* channel id supplied by the reporting caller */
+	u32 mailbox_value;	/* mailbox value supplied by the reporting caller */
+};

 #define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED	0U
 #define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED	1U