gpu: nvgpu: add missing error reporting for GV11B HALs

Error reporting was removed from the following functions in GV11B:

1. gp10b_priv_ring_decode_error_code
2. gv11b_gr_intr_report_gpcmmu_ecc_err
3. gv11b_gr_intr_report_icache_uncorrected_err -> Duplicate
4. gv11b_gr_intr_report_l1_tag_corrected_err -> Duplicate
5. gv11b_gr_intr_report_l1_tag_uncorrected_err -> Duplicate
6. gv11b_ltc_intr_handle_dstg_ecc_interrupts
7. gv11b_ltc_intr_handle_ecc_sec_ded_interrupts
8. gv11b_ltc_intr_handle_tstg_ecc_interrupts
9. gv11b_pbdma_handle_intr_1

Only the functions marked "Duplicate" are used for both gv11b and ga10b.
The others are invoked only for gv11b.

a) For gv11b_gr_intr_report_l1_tag_corrected_err and
gv11b_gr_intr_report_l1_tag_uncorrected_err, the error reporting is moved
into the gv11b_gr_intr_set_l1_tag_corrected_err and
gv11b_gr_intr_set_l1_tag_uncorrected_err functions, respectively. These
functions are invoked only from GV11B.

b) For gv11b_gr_intr_report_icache_uncorrected_err, the errors are
reported from gv11b_set_icache_ecc_status_uncorrected_errors, which is
specific to gv11b.

Bug 200588528

Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Change-Id: I581bdfec8f996643d6af63b2b80a135e7d715b89
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2770836
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
This commit is contained in:
Debarshi Dutta
2022-09-02 16:29:19 +05:30
committed by mobile promotions
parent 1274f25dda
commit 00f4dbf9aa
4 changed files with 27 additions and 0 deletions

View File

@@ -195,6 +195,9 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
recover = true; recover = true;
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
GPU_HOST_PBDMA_HCE_ERROR);
if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",
pbdma_id); pbdma_id);

View File

@@ -376,6 +376,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
0U) { 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc); nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc);
} }
if ((ecc_status & if ((ecc_status &
@@ -390,6 +392,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
0U) { 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc); nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc);
} }
} }
@@ -978,6 +982,8 @@ static void gv11b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
if ((l1_tag_ecc_status & if ((l1_tag_ecc_status &
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_ECC_UNCORRECTED);
ecc_status->err_id[ecc_status->err_count] = ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_TAG_ECC_UNCORRECTED; GPU_SM_L1_TAG_ECC_UNCORRECTED;
ecc_status->err_count = ecc_status->err_count =
@@ -1009,6 +1015,8 @@ static void gv11b_gr_intr_set_l1_tag_corrected_err(struct gk20a *g,
if ((l1_tag_ecc_status & if ((l1_tag_ecc_status &
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_ECC_CORRECTED);
ecc_status->err_id[ecc_status->err_count] = ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_TAG_ECC_CORRECTED; GPU_SM_L1_TAG_ECC_CORRECTED;
ecc_status->err_count = ecc_status->err_count =
@@ -1592,6 +1600,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
ecc_status->err_id[ecc_status->err_count] = ecc_status->err_id[ecc_status->err_count] =
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED; GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED;
ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
} }
if ((icache_ecc_status & if ((icache_ecc_status &
@@ -1599,6 +1609,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
ecc_status->err_id[ecc_status->err_count] = ecc_status->err_id[ecc_status->err_count] =
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED; GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED;
ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
} }
if ((icache_ecc_status & if ((icache_ecc_status &
@@ -1606,6 +1618,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
ecc_status->err_id[ecc_status->err_count] = ecc_status->err_id[ecc_status->err_count] =
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED; GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED;
ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
} }
} }

View File

@@ -126,6 +126,8 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
nvgpu_err(g, "tstg ecc error uncorrected. " nvgpu_err(g, "tstg ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr); "ecc_addr(0x%x)", ecc_addr);
} }
@@ -279,6 +281,8 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val); ecc_stats_reg_val);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
nvgpu_err(g, "dstg ecc error corrected. " nvgpu_err(g, "dstg ecc error corrected. "
"ecc_addr(0x%x)", dstg_ecc_addr); "ecc_addr(0x%x)", dstg_ecc_addr);
@@ -324,6 +328,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val); ecc_stats_reg_val);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
nvgpu_err(g, "dstg ecc error uncorrected. " nvgpu_err(g, "dstg ecc error uncorrected. "
"ecc_addr(0x%x)", dstg_ecc_addr); "ecc_addr(0x%x)", dstg_ecc_addr);
} }

View File

@@ -71,6 +71,9 @@ void gp10b_priv_ring_decode_error_code(struct gk20a *g,
{ {
u32 error_type_index; u32 error_type_index;
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PRI,
GPU_PRI_ACCESS_VIOLATION);
error_type_index = (error_code & 0x00000f00U) >> 8U; error_type_index = (error_code & 0x00000f00U) >> 8U;
error_code = error_code & 0xBADFf000U; error_code = error_code & 0xBADFf000U;