From b10960e7b7a246b65e55c091e09a42ae4dd6c3ed Mon Sep 17 00:00:00 2001 From: Antony Clince Alex Date: Wed, 21 Nov 2018 15:38:05 +0530 Subject: [PATCH] gpu: nvgpu: Enable the reporting of ECC errors Enable the reporting of ECC errors on hw modules like gr, pmu and ltc. These errors will be notified to the underlying safety service. Jira NVGPU-1366 Change-Id: Ibf0f9761d30bcab31809f92aa2b4378360066385 Signed-off-by: Antony Clince Alex Reviewed-on: https://git-master.nvidia.com/r/1955267 Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-misra Reviewed-by: svc-misra-checker GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom Reviewed-by: Raghuram Kothakota Tested-by: Rajesh Devaraj Reviewed-by: Ankur Kishore Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c | 44 +++++++ drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c | 35 ++++++ drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 146 ++++++++++++++++++++++- 3 files changed, 221 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c b/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c index 9906c3255..51380549b 100644 --- a/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c +++ b/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c @@ -24,6 +24,7 @@ #include #include +#include #include "ltc_gp10b.h" #include "ltc_gv11b.h" @@ -35,6 +36,31 @@ #include
+/* Forward one LTC ECC error to the err_ops reporting hook, if one is set. */
+static void gv11b_ltc_report_ecc_error(struct gk20a *g, u32 ltc, u32 slice,
+		u32 err_type, u64 err_addr, u64 err_cnt)
+{
+	u32 inst;
+	int ret;
+
+	if (g->ops.ltc.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	if (slice >= 256U) {
+		nvgpu_err(g, "Invalid slice id=%u", slice);
+		return;
+	}
+	/* Instance id: slice in the low 8 bits, ltc index shifted above them. */
+	inst = (ltc << 8U) | slice;
+	ret = g->ops.ltc.err_ops.report_ecc_parity_err(g,
+			NVGPU_ERR_MODULE_LTC, inst, err_type, err_addr, err_cnt);
+	if (ret != 0) {
+		nvgpu_err(g, "Failed to report LTC error: inst=%u, "
+				"err_type=%u, err_addr=%llu, err_cnt=%llu",
+				inst, err_type, err_addr, err_cnt);
+	}
+}
+
 /* * Sets
the ZBC stencil for the passed index. */ @@ -174,21 +200,39 @@ void gv11b_ltc_lts_isr(struct gk20a *g, unsigned int ltc, unsigned int slice) "ltc:%d lts: %d cache ecc interrupt intr: 0x%x", ltc, slice, ltc_intr3); if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { + gv11b_ltc_report_ecc_error(g, ltc, slice, + LTC_CACHE_RSTG_ECC_CORRECTED, ecc_addr, + g->ecc.ltc.ecc_sec_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) { + gv11b_ltc_report_ecc_error(g, ltc, slice, + LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.ecc_ded_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { + gv11b_ltc_report_ecc_error(g, ltc, slice, + LTC_CACHE_TSTG_ECC_CORRECTED, ecc_addr, + g->ecc.ltc.ecc_sec_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) { + gv11b_ltc_report_ecc_error(g, ltc, slice, + LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.ecc_ded_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) { + gv11b_ltc_report_ecc_error(g, ltc, slice, + LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr, + g->ecc.ltc.ecc_sec_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) { + gv11b_ltc_report_ecc_error(g, ltc, slice, + LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.ecc_ded_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected"); } diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c b/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c 
index 837f5ff24..a1f365216 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "pmu_gp10b.h" #include "pmu_gp106.h" @@ -115,6 +116,24 @@ static struct pg_init_sequence_list _pginitseq_gv11b[] = { {0x00020004, 0x00000000} , };
+/* Forward one PMU ECC error to the err_ops reporting hook, if one is set. */
+static void gv11b_pmu_report_ecc_error(struct gk20a *g, u32 inst,
+		u32 err_type, u64 err_addr, u64 err_cnt)
+{
+	int ret;
+
+	if (g->ops.pmu.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	ret = g->ops.pmu.err_ops.report_ecc_parity_err(g,
+			NVGPU_ERR_MODULE_PWR, inst, err_type, err_addr, err_cnt);
+	if (ret != 0) {
+		nvgpu_err(g, "Failed to report PMU error: inst=%u, "
+				"err_type=%u, err_addr=%llu, err_cnt=%llu",
+				inst, err_type, err_addr, err_cnt);
+	}
+}
+
 int gv11b_pmu_setup_elpg(struct gk20a *g) { int ret = 0; @@ -354,18 +373,34 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) "pmu ecc interrupt intr1: 0x%x", intr1); if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { + gv11b_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_IMEM_ECC_CORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { + gv11b_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { + gv11b_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_DMEM_ECC_CORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { + gv11b_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, +
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); } diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 8e912d24d..f0903305f 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "gk20a/gr_gk20a.h" #include "gk20a/regops_gk20a.h" @@ -71,6 +72,33 @@ */ #define GR_TPCS_INFO_FOR_MAPREGISTER 6U
+/*
+ * Forward one ECC error of a GR sub-unit to the err_ops hook, if one is set.
+ */
+static void gv11b_gr_report_ecc_error(struct gk20a *g, u32 hw_module,
+		u32 gpc, u32 tpc, u32 err_type, u64 err_addr, u64 err_cnt)
+{
+	u32 inst;
+	int ret;
+
+	if (g->ops.gr.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	if (tpc >= 256U) {
+		nvgpu_err(g, "Invalid tpc id=%u", tpc);
+		return;
+	}
+	/* Instance id: tpc in the low 8 bits, gpc index shifted above them. */
+	inst = (gpc << 8U) | tpc;
+	ret = g->ops.gr.err_ops.report_ecc_parity_err(g,
+			hw_module, inst, err_type, err_addr, err_cnt);
+	if (ret != 0) {
+		nvgpu_err(g, "Failed to report GR error: hw_module=%u, "
+			"inst=%u, err_type=%u, err_addr=%llu, err_cnt=%llu",
+			hw_module, inst, err_type, err_addr, err_cnt);
+	}
+}
+
 bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num) { bool valid = false; @@ -222,6 +250,9 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += l1_tag_corrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_L1_TAG_ECC_CORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, 0); @@ -238,6 +269,9 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += l1_tag_uncorrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, +
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, 0); @@ -317,6 +351,9 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += lrf_corrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_LRF_ECC_CORRECTED, 0, + g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset, 0); @@ -333,6 +370,9 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += lrf_uncorrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_LRF_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, 0); @@ -479,6 +519,9 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += cbu_corrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_CBU_ECC_CORRECTED, + 0, g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset, 0); @@ -495,6 +538,9 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += cbu_uncorrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_CBU_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, 0); @@ -562,6 +608,9 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } 
g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += l1_data_corrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_L1_DATA_ECC_CORRECTED, + 0, g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset, 0); @@ -578,11 +627,13 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += l1_data_uncorrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_L1_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, 0); } - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset, gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f()); @@ -652,6 +703,30 @@ static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset, 0); + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + 
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } } if ((icache_uncorrected_err_count_delta > 0U) || is_icache_ecc_uncorrected_total_err_overflow) { nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, @@ -668,6 +743,30 @@ static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset, 0); + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } } gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset, @@ -756,6 +855,9 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter += gcc_l15_corrected_err_count_delta; + 
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc, + GPU_GCC_L15_ECC_CORRECTED, + 0, g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter); gk20a_writel(g, gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset, 0); @@ -772,6 +874,9 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter += gcc_l15_uncorrected_err_count_delta; + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc, + GPU_GCC_L15_ECC_UNCORRECTED, + 0, g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter); gk20a_writel(g, gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset, 0); @@ -844,7 +949,6 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); } - g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter += corrected_delta; g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter += @@ -854,18 +958,30 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED, + 0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); } if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); } if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED, + 0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, 
"corrected ecc fa data error"); } if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); } if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { @@ -941,21 +1057,32 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, uncorrected_delta; nvgpu_log(g, gpu_dbg_intr, "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); - if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, + ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED, + ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); } if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { @@ -2516,22 +2643,33 @@ static void 
gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) nvgpu_log(g, gpu_dbg_intr, "fecs ecc interrupt intr: 0x%x", intr); - if ((ecc_status & gr_fecs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + GPU_FECS_FALCON_IMEM_ECC_CORRECTED, + ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if ((ecc_status & gr_fecs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + GPU_FECS_FALCON_DMEM_ECC_CORRECTED, + ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { + gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); }