From 00f4dbf9aad12b94655116f84c022bef393c6ef2 Mon Sep 17 00:00:00 2001 From: Debarshi Dutta Date: Fri, 2 Sep 2022 16:29:19 +0530 Subject: [PATCH] gpu: nvgpu: add missing error reporting for GV11B Hals Error reporting was removed from the following functions in GV11B: 1. gp10b_priv_ring_decode_error_code 2. gv11b_gr_intr_report_gpcmmu_ecc_err 3. gv11b_gr_intr_report_icache_uncorrected_err -> Duplicate 4. gv11b_gr_intr_report_l1_tag_corrected_err -> Duplicate 5. gv11b_gr_intr_report_l1_tag_uncorrected_err -> Duplicate 6. gv11b_ltc_intr_handle_dstg_ecc_interrupts 7. gv11b_ltc_intr_handle_ecc_sec_ded_interrupts 8. gv11b_ltc_intr_handle_tstg_ecc_interrupts 9. gv11b_pbdma_handle_intr_1 The ones marked "Duplicate" are the only ones which are used for both gv11b and ga10b. Others are invoked only for gv11b and not ga10b. a) For gv11b_gr_intr_report_l1_tag_corrected_err and gv11b_gr_intr_report_l1_tag_uncorrected_err, the errors are handled by moving them into gv11b_gr_intr_set_l1_tag_corrected_err and gv11b_gr_intr_set_l1_tag_uncorrected_err functions respectively. These functions are invoked only from GV11B. b) For gv11b_gr_intr_report_icache_uncorrected_err, the errors are handled by adding them in gv11b_set_icache_ecc_status_uncorrected_errors which is specific to gv11b. 
Bug 200588528 Signed-off-by: Debarshi Dutta Change-Id: I581bdfec8f996643d6af63b2b80a135e7d715b89 Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2770836 Reviewed-by: Bibek Basu GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c | 3 +++ drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c | 14 ++++++++++++++ .../gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c | 7 +++++++ .../gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c | 3 +++ 4 files changed, 27 insertions(+) diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c index 43408db27..fa406a3c1 100644 --- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c @@ -195,6 +195,9 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1, recover = true; + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST, + GPU_HOST_PBDMA_HCE_ERROR); + if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", pbdma_id); diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c index 85dbe8c08..776aea28a 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c @@ -376,6 +376,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU, + GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED); nvgpu_err(g, "uncorrected ecc sa data error. 
gpc_id(%d)", gpc); } if ((ecc_status & @@ -390,6 +392,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU, + GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED); nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc); } } @@ -978,6 +982,8 @@ static void gv11b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_TAG_ECC_UNCORRECTED); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_TAG_ECC_UNCORRECTED; ecc_status->err_count = @@ -1009,6 +1015,8 @@ static void gv11b_gr_intr_set_l1_tag_corrected_err(struct gk20a *g, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_TAG_ECC_CORRECTED); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_TAG_ECC_CORRECTED; ecc_status->err_count = @@ -1592,6 +1600,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g, ecc_status->err_id[ecc_status->err_count] = GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED; ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED); } if ((icache_ecc_status & @@ -1599,6 +1609,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g, ecc_status->err_id[ecc_status->err_count] = GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED; ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED); } if ((icache_ecc_status & 
@@ -1606,6 +1618,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g, ecc_status->err_id[ecc_status->err_count] = GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED; ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U); + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED); } } diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c index 55b4d762a..d5c3ec82a 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c @@ -126,6 +126,8 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, uncorrected_delta); + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC, + GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED); nvgpu_err(g, "tstg ecc error uncorrected. " "ecc_addr(0x%x)", ecc_addr); } @@ -279,6 +281,8 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ecc_stats_reg_val); + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC, + GPU_LTC_CACHE_DSTG_ECC_CORRECTED); nvgpu_err(g, "dstg ecc error corrected. " "ecc_addr(0x%x)", dstg_ecc_addr); @@ -324,6 +328,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ecc_stats_reg_val); + + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC, + GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED); nvgpu_err(g, "dstg ecc error uncorrected. 
" "ecc_addr(0x%x)", dstg_ecc_addr); } diff --git a/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c b/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c index d65b337c4..a11c8a9f1 100644 --- a/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c @@ -71,6 +71,9 @@ void gp10b_priv_ring_decode_error_code(struct gk20a *g, { u32 error_type_index; + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PRI, + GPU_PRI_ACCESS_VIOLATION); + error_type_index = (error_code & 0x00000f00U) >> 8U; error_code = error_code & 0xBADFf000U;