gpu: nvgpu: Update GR intr code as per Orin HSIs

Most SM RAMs are protected with parity (except the L1 D-cache TAG memory, which is protected with SEC-DED ECC). The memory corruption errors reported by these RAMs are therefore uncorrected errors only. Remove the code that handles corrected errors from GR SM ECC.

The SM RAMS ECC errors are currently reported to SDL using the IDs GPU_SM_L1_TAG_ECC_(UN)CORRECTED. Update the error reporting to use the newly created error IDs for Drive 6.0.

JIRA NVGPU-7987

Change-Id: Ic426d45f851d87aafaa7963b937535582cdafadf
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2674389
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 09:12:24 +03:00 · 2022-03-08 01:40:52 +00:00
parent 70b987dcf1
commit 9b7c8cdd8c
4 changed files with 67 additions and 97 deletions
--- a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c
@@ -228,8 +228,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
 					NULL, NULL,
 					NULL, NULL, 0, 0),
 			GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
-					GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,
-					INJECT_SW,
+					0, INJECT_NONE,
 					NULL, NULL,
 					NULL, NULL, 0, 0),
 		},
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c
@@ -527,9 +527,7 @@ static bool ga10b_gr_intr_sm_l1_tag_ecc_status_errors(struct gk20a *g,
 	bool err_status = true;

 	corr_err =  l1_tag_ecc_status &
-		(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
-		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() |
-		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m());
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m();

 	uncorr_err = l1_tag_ecc_status &
 		(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
@@ -553,24 +551,18 @@ static bool ga10b_gr_intr_sm_l1_tag_ecc_status_errors(struct gk20a *g,
 static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g,
 	u32 lrf_ecc_status, struct nvgpu_gr_sm_ecc_status *ecc_status)
 {
-	u32 corr_err, uncorr_err;
+	u32 uncorr_err;
 	bool err_status = true;

 	(void)g;

-	corr_err = lrf_ecc_status &
-		(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() |
-		 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() |
-		 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() |
-		 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m());
-
 	uncorr_err = lrf_ecc_status &
 		(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() |
 		 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() |
 		 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() |
 		 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m());

-	if ((corr_err == 0U) && (uncorr_err == 0U)) {
+	if (uncorr_err == 0U) {
 		err_status = false;
 	}

@@ -583,7 +575,7 @@ static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g,
 			nvgpu_safe_add_u32(ecc_status->err_count, 1U);
 	}

-	ecc_status->corrected_err_status = corr_err;
+	ecc_status->corrected_err_status = 0U;
 	ecc_status->uncorrected_err_status = uncorr_err;

 	return err_status;
@@ -656,45 +648,60 @@ static bool ga10b_gr_intr_sm_l1_data_ecc_status_errors(struct gk20a *g,
 	return err_status;
 }

+static void ga10b_gr_intr_set_rams_uncorrected_err(struct gk20a *g,
+	u32 rams_ecc_status, struct nvgpu_gr_sm_ecc_status *ecc_status)
+{
+	(void)g;
+
+	if ((rams_ecc_status &
+	    gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_data_m()) != 0U) {
+		ecc_status->err_id[ecc_status->err_count] =
+				GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED;
+		ecc_status->err_count =
+				nvgpu_safe_add_u32(ecc_status->err_count, 1U);
+	}
+
+	if ((rams_ecc_status &
+	    gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_predecode_m()) != 0U) {
+		ecc_status->err_id[ecc_status->err_count] =
+				GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED;
+		ecc_status->err_count =
+				nvgpu_safe_add_u32(ecc_status->err_count, 1U);
+	}
+
+	if ((rams_ecc_status &
+	     gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_urf_data_m()) != 0U) {
+		ecc_status->err_id[ecc_status->err_count] =
+				GPU_SM_RAMS_URF_ECC_UNCORRECTED;
+		ecc_status->err_count =
+				nvgpu_safe_add_u32(ecc_status->err_count, 1U);
+	}
+}
+
 static bool ga10b_gr_intr_sm_rams_ecc_status_errors(struct gk20a *g,
 	u32 rams_ecc_status, struct nvgpu_gr_sm_ecc_status *ecc_status)
 {
-	u32 corr_err, uncorr_err;
+	u32 uncorr_err;
 	bool err_status = true;

 	(void)g;

-	corr_err = rams_ecc_status &\
-		(gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_l0ic_data_m() |\
-		 gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_l0ic_predecode_m() |\
-		 gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_urf_data_m());
 	uncorr_err = rams_ecc_status &\
 		(gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_data_m() |\
 		 gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_predecode_m() |\
 		 gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_urf_data_m());

-	if ((corr_err == 0U) && (uncorr_err == 0U)) {
+	if (uncorr_err == 0U) {
 		err_status = false;
 	}

 	ecc_status->err_count = 0U;

-	if (uncorr_err != 0U) {
-		ecc_status->err_id[ecc_status->err_count] =
-				GPU_SM_RAMS_ECC_UNCORRECTED;
-		ecc_status->err_count =
-			nvgpu_safe_add_u32(ecc_status->err_count, 1U);
-	}
-	if (corr_err != 0U) {
-		ecc_status->err_id[ecc_status->err_count] =
-				GPU_SM_RAMS_ECC_CORRECTED;
-		ecc_status->err_count =
-			nvgpu_safe_add_u32(ecc_status->err_count, 1U);
-	}
-
-	ecc_status->corrected_err_status = corr_err;
+	ecc_status->corrected_err_status = 0U;
 	ecc_status->uncorrected_err_status = uncorr_err;

+	ga10b_gr_intr_set_rams_uncorrected_err(g, rams_ecc_status, ecc_status);
+
 	return err_status;
 }

@@ -744,15 +751,24 @@ static void ga10b_gr_intr_report_tpc_sm_rams_ecc_err(struct gk20a *g,
 	tpc = tpc & U8_MAX;

 	for (i = 0U; i < ecc_status->err_count; i++) {
-		if (ecc_status->err_id[i] == GPU_SM_RAMS_ECC_CORRECTED) {
+		if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) {
 			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
-					GPU_SM_L1_TAG_ECC_CORRECTED);
-			nvgpu_err(g, "sm_l1_tag_ecc_corrected. "
+					GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
+			nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. "
 					"gpc_id(%d), tpc_id(%d)", gpc, tpc);
-		} else {
+		}
+
+		if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) {
 			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
-					GPU_SM_L1_TAG_ECC_UNCORRECTED);
-			nvgpu_err(g, "sm_l1_tag_ecc_uncorrected. "
+					GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
+			nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. "
+					"gpc_id(%d), tpc_id(%d)", gpc, tpc);
+		}
+
+		if (ecc_status->err_id[i] == GPU_SM_RAMS_URF_ECC_UNCORRECTED) {
+			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
+					GPU_SM_RAMS_URF_ECC_UNCORRECTED);
+			nvgpu_err(g, "sm_rams_urf_ecc_uncorrected. "
 					"gpc_id(%d), tpc_id(%d)", gpc, tpc);
 		}
 	}
@@ -765,9 +781,7 @@ static void ga10b_gr_intr_handle_tpc_sm_rams_ecc_exception(struct gk20a *g,
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 	u32 offset;
 	u32 rams_ecc_status;
-	u32 rams_corrected_err_count_delta = 0U;
 	u32 rams_uncorrected_err_count_delta = 0U;
-	bool is_rams_ecc_corrected_total_err_overflow = false;
 	bool is_rams_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;

@@ -787,41 +801,14 @@ static void ga10b_gr_intr_handle_tpc_sm_rams_ecc_exception(struct gk20a *g,
 		return;
 	}

-	rams_corrected_err_count_delta =
-		gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_total_v(
-			nvgpu_readl(g, nvgpu_safe_add_u32(
-				gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_r(),
-				offset)));
 	rams_uncorrected_err_count_delta =
 		gr_pri_gpc0_tpc0_sm_rams_ecc_uncorrected_err_count_total_v(
 			nvgpu_readl(g, nvgpu_safe_add_u32(
 				gr_pri_gpc0_tpc0_sm_rams_ecc_uncorrected_err_count_r(),
 				offset)));
-	is_rams_ecc_corrected_total_err_overflow =
-		gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_total_counter_overflow_v(rams_ecc_status) != 0U;
 	is_rams_ecc_uncorrected_total_err_overflow =
 		gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_total_counter_overflow_v(rams_ecc_status) != 0U;

-	if ((rams_corrected_err_count_delta > 0U) || is_rams_ecc_corrected_total_err_overflow) {
-		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
-			"corrected error (SBE) detected in SM RAMS! err_mask [%08x] is_overf [%d]",
-			ecc_status.corrected_err_status, is_rams_ecc_corrected_total_err_overflow);
-
-		/* HW uses 16-bits counter */
-		if (is_rams_ecc_corrected_total_err_overflow) {
-			rams_corrected_err_count_delta =
-			   nvgpu_safe_add_u32(rams_corrected_err_count_delta,
-				BIT32(gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_total_s()));
-		}
-		g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter =
-		   nvgpu_safe_add_u32(
-			g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter,
-			rams_corrected_err_count_delta);
-		nvgpu_writel(g, nvgpu_safe_add_u32(
-			gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_r(), offset),
-			0U);
-	}
-
 	if ((rams_uncorrected_err_count_delta > 0U) || is_rams_ecc_uncorrected_total_err_overflow) {
 		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
 			"Uncorrected error (DBE) detected in SM RAMS! err_mask [%08x] is_overf [%d]",
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
@@ -1572,13 +1572,6 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g,
 			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
 					GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
 		}
-
-		if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED) {
-			nvgpu_err(g, "sm_icache_l1_predecode_ecc_uncorrected. "
-					"gpc_id(%d), tpc_id(%d)", gpc, tpc);
-			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
-					GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED);
-		}
 	}
 }

@@ -1608,13 +1601,6 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
 				GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED;
 		ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
 	}
-
-	if ((icache_ecc_status &
-	     gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) {
-		ecc_status->err_id[ecc_status->err_count] =
-				GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED;
-		ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
-	}
 }

 static bool gv11b_gr_intr_sm_icache_ecc_status_errors(struct gk20a *g,
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -89,20 +89,18 @@ struct mmu_fault_info;
 * Macros used to assign unique index to errors reported from the SM unit.
 * @{
 */
-#define GPU_SM_L1_TAG_ECC_CORRECTED			(0U)
-#define GPU_SM_L1_TAG_ECC_UNCORRECTED			(1U)
-#define GPU_SM_CBU_ECC_UNCORRECTED			(3U)
-#define GPU_SM_LRF_ECC_UNCORRECTED			(5U)
-#define GPU_SM_L1_DATA_ECC_UNCORRECTED			(7U)
-#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED		(9U)
-#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED		(11U)
-#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED	(13U)
-#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED		(15U)
-#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED	(17U)
-#define GPU_SM_MACHINE_CHECK_ERROR			(18U)
-#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED	(20U)
-#define GPU_SM_RAMS_ECC_CORRECTED			(21U)
-#define GPU_SM_RAMS_ECC_UNCORRECTED			(22U)
+#define GPU_SM_L1_TAG_ECC_CORRECTED			(0x0U)
+#define GPU_SM_L1_TAG_ECC_UNCORRECTED			(0x1U)
+#define GPU_SM_CBU_ECC_UNCORRECTED			(0x2U)
+#define GPU_SM_LRF_ECC_UNCORRECTED			(0x3U)
+#define GPU_SM_L1_DATA_ECC_UNCORRECTED			(0x4U)
+#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED		(0x5U)
+#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED		(0x6U)
+#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED	(0x7U)
+#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED		(0x8U)
+#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED	(0x9U)
+#define GPU_SM_MACHINE_CHECK_ERROR			(0xAU)
+#define GPU_SM_RAMS_URF_ECC_UNCORRECTED			(0xBU)

 /**
 * @}