gpu: nvgpu: update sm ecc_status_error handling

Use the gv11b_gr_intr_handle_tpc_sm_ecc_exception function for future chips to avoid code duplication. Add the sm_ecc_status_errors HAL to read the ecc_status_errors. Jira NVGPU-5033. Signed-off-by: Vinod G <vinodg@nvidia.com> Change-Id: I4a25837d9b833a48307b9353b82ff6597f985e41 Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2325537 Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 02:22:34 +03:00 · 2020-04-07 18:16:23 -07:00
parent 72d01afd0c
commit 6a7bf6cdc0
5 changed files with 557 additions and 186 deletions
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h
@@ -58,6 +58,38 @@ struct nvgpu_gr_isr_data;
 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE	U32(0)
 #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_TRUE	U32(1)

+#define SHIFT_8_BITS	8U
+
+#define MAX_SM_ECC_ERR_COUNT	8U
+
+/* SM ECC error types; passed as err_type to the sm_ecc_status_errors HAL */
+enum nvgpu_gr_sm_ecc_error_types {
+	SM_LRF_ECC_ERROR = 0U,
+	SM_L1_DATA_ECC_ERROR = 1U,
+	SM_L1_TAG_ERROR = 2U,
+	SM_CBU_ECC_ERROR = 3U,
+	SM_ICACHE_ECC_ERROR = 4U,
+	SM_RAMS_ECC_ERROR = 5U
+};
+
+/* Use this struct with each SM ecc_status_error type */
+struct nvgpu_gr_sm_ecc_status {
+	/*
+	 * Total number of ECC errors reported back to SDL
+	 * from each SM exception
+	 */
+	u32 err_count;
+
+	/* Error indices reported to SDL (at most MAX_SM_ECC_ERR_COUNT) */
+	u32 err_id[MAX_SM_ECC_ERR_COUNT];
+
+	/* Corrected error status reported by the SM ecc_status */
+	u32 corrected_err_status;
+
+	/* Uncorrected error status reported by the SM ecc_status */
+	u32 uncorrected_err_status;
+};
+
 int gv11b_gr_intr_handle_fecs_error(struct gk20a *g,
 				struct nvgpu_channel *ch_ptr,
 				struct nvgpu_gr_isr_data *isr_data);
@@ -109,6 +141,9 @@ u64 gv11b_gr_intr_get_sm_hww_warp_esr_pc(struct gk20a *g, u32 offset);

 u32 gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val(void);

+bool gv11b_gr_intr_sm_ecc_status_errors(struct gk20a *g,
+	u32 ecc_status_reg, enum nvgpu_gr_sm_ecc_error_types err_type,
+	struct nvgpu_gr_sm_ecc_status *ecc_status);
 #ifdef CONFIG_NVGPU_HAL_NON_FUSA
 void gv11b_gr_intr_set_shader_exceptions(struct gk20a *g, u32 data);
 #endif
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -736,6 +736,8 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
 				gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask,
 			.get_ctxsw_checksum_mismatch_mailbox_val =
 				gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val,
+			.sm_ecc_status_errors =
+				gv11b_gr_intr_sm_ecc_status_errors,
 #ifdef CONFIG_NVGPU_HAL_NON_FUSA
 			.handle_tex_exception = NULL,
 			.set_shader_exceptions =
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -754,6 +754,8 @@ static const struct gpu_ops tu104_ops = {
 				gv11b_gr_intr_get_sm_hww_global_esr,
 			.get_sm_no_lock_down_hww_global_esr_mask =
 				gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask,
+			.sm_ecc_status_errors =
+				gv11b_gr_intr_sm_ecc_status_errors,
 #ifdef CONFIG_NVGPU_HAL_NON_FUSA
 			.handle_tex_exception = NULL,
 			.set_shader_exceptions =
--- a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
@@ -45,6 +45,9 @@ struct nvgpu_fecs_ecc_status;
 struct nvgpu_fecs_host_intr_status;
 struct netlist_av_list;
 struct nvgpu_hw_err_inject_info_desc;
+struct nvgpu_gr_sm_ecc_status;
+
+enum nvgpu_gr_sm_ecc_error_types;

 #ifdef CONFIG_NVGPU_FECS_TRACE
 struct nvgpu_gr_subctx;
@@ -504,6 +507,9 @@ struct gops_gr_intr {
 	u32 (*get_sm_no_lock_down_hww_global_esr_mask)(
 						struct gk20a *g);
 	u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void);
+	bool (*sm_ecc_status_errors)(struct gk20a *g, u32 ecc_status_reg,
+				enum nvgpu_gr_sm_ecc_error_types err_type,
+				struct nvgpu_gr_sm_ecc_status *ecc_status);
 #ifdef CONFIG_NVGPU_HAL_NON_FUSA
 	void (*handle_tex_exception)(struct gk20a *g,
 				     u32 gpc, u32 tpc);