gpu: nvgpu: Enable the reporting of ECC errors

Enable the reporting of ECC errors in hardware modules
such as GR, PMU and LTC. These errors are reported
to the underlying safety service.

Jira NVGPU-1366

Change-Id: Ibf0f9761d30bcab31809f92aa2b4378360066385
Signed-off-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1955267
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-misra-checker <svc-misra-checker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: Raghuram Kothakota <rkothakota@nvidia.com>
Tested-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Antony Clince Alex
2018-11-21 15:38:05 +05:30
committed by mobile promotions
parent 16dd642366
commit b10960e7b7
3 changed files with 221 additions and 4 deletions

View File

@@ -24,6 +24,7 @@
#include <nvgpu/io.h> #include <nvgpu/io.h>
#include <nvgpu/gk20a.h> #include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include "ltc_gp10b.h" #include "ltc_gp10b.h"
#include "ltc_gv11b.h" #include "ltc_gv11b.h"
@@ -35,6 +36,31 @@
#include <nvgpu/utils.h> #include <nvgpu/utils.h>
/*
 * Report an LTC ECC/parity error to the safety service via the
 * report_ecc_parity_err HAL hook.
 *
 * The error instance id encodes the unit as (ltc << 8) | slice, so the
 * slice id must fit in the low 8 bits; out-of-range slices are rejected.
 * Reporting is best-effort: a missing hook is silently ignored and a
 * failed report is only logged.
 */
static void gv11b_ltc_report_ecc_error(struct gk20a *g, u32 ltc, u32 slice,
		u32 err_type, u64 err_addr, u64 err_cnt)
{
	int ret = 0;
	u32 inst = 0U;

	/* Reporting hook is optional; nothing to do if it is not wired up. */
	if (g->ops.ltc.err_ops.report_ecc_parity_err == NULL) {
		return;
	}
	if (slice < 256U) {
		inst = (ltc << 8U) | slice;
	} else {
		nvgpu_err(g, "Invalid slice id=%u", slice);
		return;
	}
	ret = g->ops.ltc.err_ops.report_ecc_parity_err(g,
			NVGPU_ERR_MODULE_LTC, inst, err_type, err_addr,
			err_cnt);
	if (ret != 0) {
		/*
		 * Adjacent string literals are concatenated by the compiler;
		 * a backslash line continuation here would embed the leading
		 * indentation whitespace into the log message.
		 */
		nvgpu_err(g, "Failed to report LTC error: inst=%u, "
				"err_type=%u, err_addr=%llu, err_cnt=%llu",
				inst, err_type, err_addr, err_cnt);
	}
}
/* /*
* Sets the ZBC stencil for the passed index. * Sets the ZBC stencil for the passed index.
*/ */
@@ -174,21 +200,39 @@ void gv11b_ltc_lts_isr(struct gk20a *g, unsigned int ltc, unsigned int slice)
"ltc:%d lts: %d cache ecc interrupt intr: 0x%x", ltc, slice, ltc_intr3); "ltc:%d lts: %d cache ecc interrupt intr: 0x%x", ltc, slice, ltc_intr3);
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {
gv11b_ltc_report_ecc_error(g, ltc, slice,
LTC_CACHE_RSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected");
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) {
gv11b_ltc_report_ecc_error(g, ltc, slice,
LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected"); nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected");
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) {
gv11b_ltc_report_ecc_error(g, ltc, slice,
LTC_CACHE_TSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected");
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) {
gv11b_ltc_report_ecc_error(g, ltc, slice,
LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) {
gv11b_ltc_report_ecc_error(g, ltc, slice,
LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected");
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) {
gv11b_ltc_report_ecc_error(g, ltc, slice,
LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected"); nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected");
} }

View File

@@ -29,6 +29,7 @@
#include <nvgpu/io.h> #include <nvgpu/io.h>
#include <nvgpu/utils.h> #include <nvgpu/utils.h>
#include <nvgpu/gk20a.h> #include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include "pmu_gp10b.h" #include "pmu_gp10b.h"
#include "pmu_gp106.h" #include "pmu_gp106.h"
@@ -115,6 +116,24 @@ static struct pg_init_sequence_list _pginitseq_gv11b[] = {
{0x00020004, 0x00000000} , {0x00020004, 0x00000000} ,
}; };
/*
 * Report a PMU falcon ECC error to the safety service via the
 * report_ecc_parity_err HAL hook.
 *
 * @inst is the PMU instance id (0 on gv11b). Reporting is best-effort:
 * a missing hook is silently ignored and a failed report is only logged.
 */
static void gv11b_pmu_report_ecc_error(struct gk20a *g, u32 inst,
		u32 err_type, u64 err_addr, u64 err_cnt)
{
	int ret = 0;

	/* Reporting hook is optional; nothing to do if it is not wired up. */
	if (g->ops.pmu.err_ops.report_ecc_parity_err == NULL) {
		return;
	}
	ret = g->ops.pmu.err_ops.report_ecc_parity_err(g,
			NVGPU_ERR_MODULE_PWR, inst, err_type, err_addr,
			err_cnt);
	if (ret != 0) {
		/*
		 * Adjacent string literals are concatenated by the compiler;
		 * a backslash line continuation here would embed the leading
		 * indentation whitespace into the log message.
		 */
		nvgpu_err(g, "Failed to report PMU error: inst=%u, "
				"err_type=%u, err_addr=%llu, err_cnt=%llu",
				inst, err_type, err_addr, err_cnt);
	}
}
int gv11b_pmu_setup_elpg(struct gk20a *g) int gv11b_pmu_setup_elpg(struct gk20a *g)
{ {
int ret = 0; int ret = 0;
@@ -354,18 +373,34 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
"pmu ecc interrupt intr1: 0x%x", intr1); "pmu ecc interrupt intr1: 0x%x", intr1);
if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
gv11b_pmu_report_ecc_error(g, 0,
GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"imem ecc error corrected"); "imem ecc error corrected");
} }
if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
gv11b_pmu_report_ecc_error(g, 0,
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"imem ecc error uncorrected"); "imem ecc error uncorrected");
} }
if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
gv11b_pmu_report_ecc_error(g, 0,
GPU_PMU_FALCON_DMEM_ECC_CORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"dmem ecc error corrected"); "dmem ecc error corrected");
} }
if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
gv11b_pmu_report_ecc_error(g, 0,
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"dmem ecc error uncorrected"); "dmem ecc error uncorrected");
} }

View File

@@ -38,6 +38,7 @@
#include <nvgpu/bitops.h> #include <nvgpu/bitops.h>
#include <nvgpu/gk20a.h> #include <nvgpu/gk20a.h>
#include <nvgpu/channel.h> #include <nvgpu/channel.h>
#include <nvgpu/nvgpu_err.h>
#include "gk20a/gr_gk20a.h" #include "gk20a/gr_gk20a.h"
#include "gk20a/regops_gk20a.h" #include "gk20a/regops_gk20a.h"
@@ -71,6 +72,33 @@
*/ */
#define GR_TPCS_INFO_FOR_MAPREGISTER 6U #define GR_TPCS_INFO_FOR_MAPREGISTER 6U
/*
 * Report a GR-unit ECC/parity error to the safety service via the
 * report_ecc_parity_err HAL hook.
 *
 * @hw_module selects the reporting sub-unit (SM, GCC, MMU, GPCCS, FECS).
 * The error instance id encodes the unit as (gpc << 8) | tpc, so the
 * tpc id must fit in the low 8 bits; out-of-range tpcs are rejected.
 * Reporting is best-effort: a missing hook is silently ignored and a
 * failed report is only logged.
 */
static void gv11b_gr_report_ecc_error(struct gk20a *g, u32 hw_module,
		u32 gpc, u32 tpc, u32 err_type,
		u64 err_addr, u64 err_cnt)
{
	int ret = 0;
	u32 inst = 0U;

	/* Reporting hook is optional; nothing to do if it is not wired up. */
	if (g->ops.gr.err_ops.report_ecc_parity_err == NULL) {
		return;
	}
	if (tpc < 256U) {
		/* Unsigned shift amount for consistency with the LTC helper. */
		inst = (gpc << 8U) | tpc;
	} else {
		nvgpu_err(g, "Invalid tpc id=%u", tpc);
		return;
	}
	ret = g->ops.gr.err_ops.report_ecc_parity_err(g,
			hw_module, inst, err_type,
			err_addr, err_cnt);
	if (ret != 0) {
		/*
		 * Adjacent string literals are concatenated by the compiler;
		 * backslash line continuations here would embed the leading
		 * indentation whitespace into the log message.
		 */
		nvgpu_err(g, "Failed to report GR error: hw_module=%u, "
				"inst=%u, err_type=%u, err_addr=%llu, "
				"err_cnt=%llu", hw_module, inst, err_type,
				err_addr, err_cnt);
	}
}
bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num) bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num)
{ {
bool valid = false; bool valid = false;
@@ -222,6 +250,9 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
l1_tag_corrected_err_count_delta; l1_tag_corrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_L1_TAG_ECC_CORRECTED, 0,
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
0); 0);
@@ -238,6 +269,9 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
l1_tag_uncorrected_err_count_delta; l1_tag_uncorrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
0); 0);
@@ -317,6 +351,9 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_corrected_err_count_delta; lrf_corrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_LRF_ECC_CORRECTED, 0,
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset,
0); 0);
@@ -333,6 +370,9 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_uncorrected_err_count_delta; lrf_uncorrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_LRF_ECC_UNCORRECTED, 0,
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
0); 0);
@@ -479,6 +519,9 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter +=
cbu_corrected_err_count_delta; cbu_corrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_CBU_ECC_CORRECTED,
0, g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset,
0); 0);
@@ -495,6 +538,9 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
cbu_uncorrected_err_count_delta; cbu_uncorrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_CBU_ECC_UNCORRECTED,
0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
0); 0);
@@ -562,6 +608,9 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter +=
l1_data_corrected_err_count_delta; l1_data_corrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_L1_DATA_ECC_CORRECTED,
0, g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset,
0); 0);
@@ -578,11 +627,13 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
l1_data_uncorrected_err_count_delta; l1_data_uncorrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_L1_DATA_ECC_UNCORRECTED,
0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
0); 0);
} }
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset, gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset,
gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f()); gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f());
@@ -652,6 +703,30 @@ static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset,
0); 0);
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED,
0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED,
0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED,
0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED,
0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
}
} }
if ((icache_uncorrected_err_count_delta > 0U) || is_icache_ecc_uncorrected_total_err_overflow) { if ((icache_uncorrected_err_count_delta > 0U) || is_icache_ecc_uncorrected_total_err_overflow) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
@@ -668,6 +743,30 @@ static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset, gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset,
0); 0);
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc,
GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
}
} }
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset, gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset,
@@ -756,6 +855,9 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter += g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter +=
gcc_l15_corrected_err_count_delta; gcc_l15_corrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc,
GPU_GCC_L15_ECC_CORRECTED,
0, g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset, gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset,
0); 0);
@@ -772,6 +874,9 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
} }
g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter += g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter +=
gcc_l15_uncorrected_err_count_delta; gcc_l15_uncorrected_err_count_delta;
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc,
GPU_GCC_L15_ECC_UNCORRECTED,
0, g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter);
gk20a_writel(g, gk20a_writel(g,
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset, gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset,
0); 0);
@@ -844,7 +949,6 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());
} }
g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter += g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter +=
corrected_delta; corrected_delta;
g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter += g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter +=
@@ -854,18 +958,30 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED,
0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED,
0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
} }
if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
@@ -941,21 +1057,32 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
uncorrected_delta; uncorrected_delta;
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
} }
if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
@@ -2516,22 +2643,33 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"fecs ecc interrupt intr: 0x%x", intr); "fecs ecc interrupt intr: 0x%x", intr);
if ((ecc_status & if ((ecc_status &
gr_fecs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { gr_fecs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0,
GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
} }
if ((ecc_status & if ((ecc_status &
gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0,
GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"imem ecc error uncorrected"); "imem ecc error uncorrected");
} }
if ((ecc_status & if ((ecc_status &
gr_fecs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { gr_fecs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0,
GPU_FECS_FALCON_DMEM_ECC_CORRECTED,
ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
} }
if ((ecc_status & if ((ecc_status &
gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0,
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"dmem ecc error uncorrected"); "dmem ecc error uncorrected");
} }