From 9b7c8cdd8c96feb61d0145f19b28da89e2e3e7aa Mon Sep 17 00:00:00 2001 From: Tejal Kudav Date: Tue, 8 Mar 2022 01:40:52 +0000 Subject: [PATCH] gpu: nvgpu: Update GR intr code as per Orin HSIs Most SM RAMs are protected with parity (except L1 D-cache TAG mem which is protected with SEC-DED ECC). The memory corruption errors reported by these RAMs are therefore uncorrected errors only. Remove the code to handle corrected errors from GR SM ECC. The SM RAMS ECC errors currently report error to SDL using ID GPU_SM_L1_TAG_ECC_(UN)CORRECTED. Update the error reporting to use the newly created error IDs for Drive 6.0. JIRA NVGPU-7987 Change-Id: Ic426d45f851d87aafaa7963b937535582cdafadf Signed-off-by: Tejal Kudav Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2674389 Tested-by: mobile promotions Reviewed-by: mobile promotions --- .../gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c | 3 +- .../nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c | 121 ++++++++---------- .../nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c | 14 -- drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 26 ++-- 4 files changed, 67 insertions(+), 97 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c index 99fedd559..70ce2ca79 100644 --- a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c @@ -228,8 +228,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { NULL, NULL, NULL, NULL, 0, 0), GPU_CRITERR("icache_l1_predecode_ecc_uncorrected", - GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, - INJECT_SW, + 0, INJECT_NONE, NULL, NULL, NULL, NULL, 0, 0), }, diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c index fa3a32e4f..8224580e2 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c @@ -527,9 +527,7 @@ static bool ga10b_gr_intr_sm_l1_tag_ecc_status_errors(struct gk20a *g, bool 
err_status = true; corr_err = l1_tag_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()); + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m(); uncorr_err = l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | @@ -553,24 +551,18 @@ static bool ga10b_gr_intr_sm_l1_tag_ecc_status_errors(struct gk20a *g, static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g, u32 lrf_ecc_status, struct nvgpu_gr_sm_ecc_status *ecc_status) { - u32 corr_err, uncorr_err; + u32 uncorr_err; bool err_status = true; (void)g; - corr_err = lrf_ecc_status & - (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m()); - uncorr_err = lrf_ecc_status & (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m()); - if ((corr_err == 0U) && (uncorr_err == 0U)) { + if (uncorr_err == 0U) { err_status = false; } @@ -583,7 +575,7 @@ static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g, nvgpu_safe_add_u32(ecc_status->err_count, 1U); } - ecc_status->corrected_err_status = corr_err; + ecc_status->corrected_err_status = 0U; ecc_status->uncorrected_err_status = uncorr_err; return err_status; @@ -656,45 +648,60 @@ static bool ga10b_gr_intr_sm_l1_data_ecc_status_errors(struct gk20a *g, return err_status; } +static void ga10b_gr_intr_set_rams_uncorrected_err(struct gk20a *g, + u32 rams_ecc_status, struct nvgpu_gr_sm_ecc_status *ecc_status) +{ + (void)g; + + if ((rams_ecc_status & + 
gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_data_m()) != 0U) { + ecc_status->err_id[ecc_status->err_count] = + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED; + ecc_status->err_count = + nvgpu_safe_add_u32(ecc_status->err_count, 1U); + } + + if ((rams_ecc_status & + gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_predecode_m()) != 0U) { + ecc_status->err_id[ecc_status->err_count] = + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED; + ecc_status->err_count = + nvgpu_safe_add_u32(ecc_status->err_count, 1U); + } + + if ((rams_ecc_status & + gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_urf_data_m()) != 0U) { + ecc_status->err_id[ecc_status->err_count] = + GPU_SM_RAMS_URF_ECC_UNCORRECTED; + ecc_status->err_count = + nvgpu_safe_add_u32(ecc_status->err_count, 1U); + } +} + static bool ga10b_gr_intr_sm_rams_ecc_status_errors(struct gk20a *g, u32 rams_ecc_status, struct nvgpu_gr_sm_ecc_status *ecc_status) { - u32 corr_err, uncorr_err; + u32 uncorr_err; bool err_status = true; (void)g; - corr_err = rams_ecc_status &\ - (gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_l0ic_data_m() |\ - gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_l0ic_predecode_m() |\ - gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_urf_data_m()); uncorr_err = rams_ecc_status &\ (gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_data_m() |\ gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_l0ic_predecode_m() |\ gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_urf_data_m()); - if ((corr_err == 0U) && (uncorr_err == 0U)) { + if (uncorr_err == 0U) { err_status = false; } ecc_status->err_count = 0U; - if (uncorr_err != 0U) { - ecc_status->err_id[ecc_status->err_count] = - GPU_SM_RAMS_ECC_UNCORRECTED; - ecc_status->err_count = - nvgpu_safe_add_u32(ecc_status->err_count, 1U); - } - if (corr_err != 0U) { - ecc_status->err_id[ecc_status->err_count] = - GPU_SM_RAMS_ECC_CORRECTED; - ecc_status->err_count = - nvgpu_safe_add_u32(ecc_status->err_count, 1U); - } - - 
ecc_status->corrected_err_status = corr_err; + ecc_status->corrected_err_status = 0U; ecc_status->uncorrected_err_status = uncorr_err; + ga10b_gr_intr_set_rams_uncorrected_err(g, rams_ecc_status, ecc_status); + return err_status; } @@ -744,15 +751,24 @@ static void ga10b_gr_intr_report_tpc_sm_rams_ecc_err(struct gk20a *g, tpc = tpc & U8_MAX; for (i = 0U; i < ecc_status->err_count; i++) { - if (ecc_status->err_id[i] == GPU_SM_RAMS_ECC_CORRECTED) { + if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_TAG_ECC_CORRECTED); - nvgpu_err(g, "sm_l1_tag_ecc_corrected. " + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - } else { + } + + if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_TAG_ECC_UNCORRECTED); - nvgpu_err(g, "sm_l1_tag_ecc_uncorrected. " + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED); + nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + } + + if (ecc_status->err_id[i] == GPU_SM_RAMS_URF_ECC_UNCORRECTED) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_RAMS_URF_ECC_UNCORRECTED); + nvgpu_err(g, "sm_rams_urf_ecc_uncorrected. 
" "gpc_id(%d), tpc_id(%d)", gpc, tpc); } } @@ -765,9 +781,7 @@ static void ga10b_gr_intr_handle_tpc_sm_rams_ecc_exception(struct gk20a *g, u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); u32 offset; u32 rams_ecc_status; - u32 rams_corrected_err_count_delta = 0U; u32 rams_uncorrected_err_count_delta = 0U; - bool is_rams_ecc_corrected_total_err_overflow = false; bool is_rams_ecc_uncorrected_total_err_overflow = false; struct nvgpu_gr_sm_ecc_status ecc_status; @@ -787,41 +801,14 @@ static void ga10b_gr_intr_handle_tpc_sm_rams_ecc_exception(struct gk20a *g, return; } - rams_corrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_total_v( - nvgpu_readl(g, nvgpu_safe_add_u32( - gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_r(), - offset))); rams_uncorrected_err_count_delta = gr_pri_gpc0_tpc0_sm_rams_ecc_uncorrected_err_count_total_v( nvgpu_readl(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_rams_ecc_uncorrected_err_count_r(), offset))); - is_rams_ecc_corrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_rams_ecc_status_corrected_err_total_counter_overflow_v(rams_ecc_status) != 0U; is_rams_ecc_uncorrected_total_err_overflow = gr_pri_gpc0_tpc0_sm_rams_ecc_status_uncorrected_err_total_counter_overflow_v(rams_ecc_status) != 0U; - if ((rams_corrected_err_count_delta > 0U) || is_rams_ecc_corrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "corrected error (SBE) detected in SM RAMS! 
err_mask [%08x] is_overf [%d]", - ecc_status.corrected_err_status, is_rams_ecc_corrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_rams_ecc_corrected_total_err_overflow) { - rams_corrected_err_count_delta = - nvgpu_safe_add_u32(rams_corrected_err_count_delta, - BIT32(gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_total_s())); - } - g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter = - nvgpu_safe_add_u32( - g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter, - rams_corrected_err_count_delta); - nvgpu_writel(g, nvgpu_safe_add_u32( - gr_pri_gpc0_tpc0_sm_rams_ecc_corrected_err_count_r(), offset), - 0U); - } - if ((rams_uncorrected_err_count_delta > 0U) || is_rams_ecc_uncorrected_total_err_overflow) { nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "Uncorrected error (DBE) detected in SM RAMS! err_mask [%08x] is_overf [%d]", diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c index 7a7d1f028..a3327301d 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c @@ -1572,13 +1572,6 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g, nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED); } - - if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED) { - nvgpu_err(g, "sm_icache_l1_predecode_ecc_uncorrected. 
" - "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED); - } } } @@ -1608,13 +1601,6 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED; ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); } - - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { - ecc_status->err_id[ecc_status->err_count] = - GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED; - ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); - } } static bool gv11b_gr_intr_sm_icache_ecc_status_errors(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index c0b2f4c30..4106a3ae6 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -89,20 +89,18 @@ struct mmu_fault_info; * Macros used to assign unique index to errors reported from the SM unit. 
* @{ */ -#define GPU_SM_L1_TAG_ECC_CORRECTED (0U) -#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U) -#define GPU_SM_CBU_ECC_UNCORRECTED (3U) -#define GPU_SM_LRF_ECC_UNCORRECTED (5U) -#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U) -#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U) -#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U) -#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U) -#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U) -#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U) -#define GPU_SM_MACHINE_CHECK_ERROR (18U) -#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U) -#define GPU_SM_RAMS_ECC_CORRECTED (21U) -#define GPU_SM_RAMS_ECC_UNCORRECTED (22U) +#define GPU_SM_L1_TAG_ECC_CORRECTED (0x0U) +#define GPU_SM_L1_TAG_ECC_UNCORRECTED (0x1U) +#define GPU_SM_CBU_ECC_UNCORRECTED (0x2U) +#define GPU_SM_LRF_ECC_UNCORRECTED (0x3U) +#define GPU_SM_L1_DATA_ECC_UNCORRECTED (0x4U) +#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (0x5U) +#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (0x6U) +#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (0x7U) +#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (0x8U) +#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (0x9U) +#define GPU_SM_MACHINE_CHECK_ERROR (0xAU) +#define GPU_SM_RAMS_URF_ECC_UNCORRECTED (0xBU) /** * @}