gpu: nvgpu: Fix CERT INT30-C errors in hal.gr.intr unit

Fix CERT INT30-C errors in hal.gr.intr unit.
Unsigned integer operation may wrap. Use safe_ops macro to fix
the wrap errors.

Jira NVGPU-3585

Change-Id: If806b0e9e54c118dba6808a9c73ff107797d3ee0
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2134074
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Vinod G
2019-06-10 22:40:36 -07:00
committed by mobile promotions
parent 3f08cf8a48
commit a6b1725b04
2 changed files with 266 additions and 186 deletions

View File

@@ -280,11 +280,13 @@ static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err,
overcount for the subpartition if the opposite error counts are
zero. */
if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) {
over_count += (u32)hweight32(sed_status & ded_status);
over_count = nvgpu_safe_add_u32(over_count,
(u32)hweight32(sed_status & ded_status));
}
if (*count_to_adjust > over_count) {
*count_to_adjust -= over_count;
*count_to_adjust = nvgpu_safe_sub_u32(
*count_to_adjust, over_count);
} else {
*count_to_adjust = 0;
}
@@ -343,8 +345,10 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
lrf_ecc_ded_status,
&lrf_single_count_delta,
lrf_double_count_delta);
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_single_count_delta;
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter,
lrf_single_count_delta);
}
if (lrf_ecc_ded_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
@@ -355,8 +359,10 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
lrf_ecc_ded_status,
&lrf_double_count_delta,
lrf_single_count_delta);
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
lrf_double_count_delta);
}
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(), offset),
@@ -383,10 +389,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
offset));
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val));
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val));
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -406,8 +416,10 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
offset));
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val));
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
@@ -445,9 +457,11 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r(), offset));
g->ecc.gr.tex_ecc_total_sec_pipe0_count[gpc][tpc].counter +=
g->ecc.gr.tex_ecc_total_sec_pipe0_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_ecc_total_sec_pipe0_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_sec_v(
ecc_stats_reg_val);
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_sec_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -456,9 +470,11 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(), offset));
g->ecc.gr.tex_unique_ecc_sec_pipe0_count[gpc][tpc].counter +=
g->ecc.gr.tex_unique_ecc_sec_pipe0_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_unique_ecc_sec_pipe0_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_sec_v(
ecc_stats_reg_val);
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_sec_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -466,71 +482,6 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val);
/* Pipe 1 counters */
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_routing_r() + offset,
gr_pri_gpc0_tpc0_tex_m_routing_sel_pipe1_f());
ecc_stats_reg_val = nvgpu_readl(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r() + offset);
g->ecc.gr.tex_ecc_total_sec_pipe1_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_sec_v(
ecc_stats_reg_val);
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_sec_m();
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r() + offset,
ecc_stats_reg_val);
ecc_stats_reg_val = nvgpu_readl(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r() + offset);
g->ecc.gr.tex_unique_ecc_sec_pipe1_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_sec_v(
ecc_stats_reg_val);
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_sec_m();
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r() + offset,
ecc_stats_reg_val);
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_routing_r() + offset,
gr_pri_gpc0_tpc0_tex_m_routing_sel_default_f());
}
if ((esr & gr_gpc0_tpc0_tex_m_hww_esr_ecc_ded_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in TEX!");
/* Pipe 0 counters */
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_routing_r() + offset,
gr_pri_gpc0_tpc0_tex_m_routing_sel_pipe0_f());
ecc_stats_reg_val = nvgpu_readl(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r() + offset);
g->ecc.gr.tex_ecc_total_ded_pipe0_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_ded_v(
ecc_stats_reg_val);
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_ded_m();
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r() + offset,
ecc_stats_reg_val);
ecc_stats_reg_val = nvgpu_readl(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r() + offset);
g->ecc.gr.tex_unique_ecc_ded_pipe0_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_v(
ecc_stats_reg_val);
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_m();
nvgpu_writel(g,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r() + offset,
ecc_stats_reg_val);
/* Pipe 1 counters */
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_routing_r(), offset),
@@ -538,9 +489,52 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r(), offset));
g->ecc.gr.tex_ecc_total_ded_pipe1_count[gpc][tpc].counter +=
g->ecc.gr.tex_ecc_total_sec_pipe1_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_ecc_total_sec_pipe1_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_sec_v(
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_sec_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r(), offset),
ecc_stats_reg_val);
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(), offset));
g->ecc.gr.tex_unique_ecc_sec_pipe1_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_unique_ecc_sec_pipe1_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_sec_v(
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_sec_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(), offset),
ecc_stats_reg_val);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_routing_r(), offset),
gr_pri_gpc0_tpc0_tex_m_routing_sel_default_f());
}
if ((esr & gr_gpc0_tpc0_tex_m_hww_esr_ecc_ded_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in TEX!");
/* Pipe 0 counters */
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_routing_r(), offset),
gr_pri_gpc0_tpc0_tex_m_routing_sel_pipe0_f());
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r(), offset));
g->ecc.gr.tex_ecc_total_ded_pipe0_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_ecc_total_ded_pipe0_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_ded_v(
ecc_stats_reg_val);
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_ded_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -549,9 +543,43 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(), offset));
g->ecc.gr.tex_unique_ecc_ded_pipe1_count[gpc][tpc].counter +=
g->ecc.gr.tex_unique_ecc_ded_pipe0_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_unique_ecc_ded_pipe0_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_v(
ecc_stats_reg_val);
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(), offset),
ecc_stats_reg_val);
/* Pipe 1 counters */
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_routing_r(), offset),
gr_pri_gpc0_tpc0_tex_m_routing_sel_pipe1_f());
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r(), offset));
g->ecc.gr.tex_ecc_total_ded_pipe1_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_ecc_total_ded_pipe1_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_ded_v(
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_ded_m();
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_total_r(), offset),
ecc_stats_reg_val);
ecc_stats_reg_val = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(), offset));
g->ecc.gr.tex_unique_ecc_ded_pipe1_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.tex_unique_ecc_ded_pipe1_count[gpc][tpc].counter,
gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_v(
ecc_stats_reg_val));
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_m();
nvgpu_writel(g, nvgpu_safe_add_u32(

View File

@@ -44,10 +44,14 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g)
g->ops.gr.falcon.handle_fecs_ecc_error(g, &fecs_ecc_status);
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter +=
fecs_ecc_status.corrected_delta;
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter +=
fecs_ecc_status.uncorrected_delta;
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter =
nvgpu_safe_add_u32(
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter,
fecs_ecc_status.corrected_delta);
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter =
nvgpu_safe_add_u32(
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter,
fecs_ecc_status.uncorrected_delta);
if (fecs_ecc_status.imem_corrected_err) {
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
@@ -291,14 +295,14 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
gcc_l15_corrected_err_count_delta =
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_v(
nvgpu_readl(g,
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() +
offset));
nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r(),
offset)));
gcc_l15_uncorrected_err_count_delta =
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_v(
nvgpu_readl(g,
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() +
offset));
nvgpu_readl(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(),
offset)));
is_gcc_l15_ecc_corrected_total_err_overflow =
gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_total_counter_overflow_v(
gcc_l15_ecc_status) != 0U;
@@ -316,17 +320,21 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
/* HW uses 16-bits counter */
if (is_gcc_l15_ecc_corrected_total_err_overflow) {
gcc_l15_corrected_err_count_delta +=
gcc_l15_corrected_err_count_delta =
nvgpu_safe_add_u32(
gcc_l15_corrected_err_count_delta,
BIT32(
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_s()
);
));
}
*corrected_err += gcc_l15_corrected_err_count_delta;
*corrected_err = nvgpu_safe_add_u32(
*corrected_err,
gcc_l15_corrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc,
GPU_GCC_L15_ECC_CORRECTED,
0, *corrected_err);
nvgpu_writel(g,
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset, 0);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r(), offset), 0);
}
if ((gcc_l15_uncorrected_err_count_delta > 0U) ||
is_gcc_l15_ecc_uncorrected_total_err_overflow) {
@@ -338,21 +346,25 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
/* HW uses 16-bits counter */
if (is_gcc_l15_ecc_uncorrected_total_err_overflow) {
gcc_l15_uncorrected_err_count_delta +=
gcc_l15_uncorrected_err_count_delta =
nvgpu_safe_add_u32(
gcc_l15_uncorrected_err_count_delta,
BIT32(
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_s()
);
));
}
*uncorrected_err += gcc_l15_uncorrected_err_count_delta;
*uncorrected_err = nvgpu_safe_add_u32(*uncorrected_err,
gcc_l15_uncorrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc,
GPU_GCC_L15_ECC_UNCORRECTED,
0, *uncorrected_err);
nvgpu_writel(g,
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset,
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(), offset),
0);
}
nvgpu_writel(g, gr_pri_gpc0_gcc_l15_ecc_status_r() + offset,
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_status_r(), offset),
gr_pri_gpc0_gcc_l15_ecc_status_reset_task_f());
}
@@ -378,14 +390,14 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
return;
}
ecc_status = nvgpu_readl(g,
gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
ecc_addr = nvgpu_readl(g,
gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
corrected_cnt = nvgpu_readl(g,
gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
uncorrected_cnt = nvgpu_readl(g,
gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);
ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_status_r(), offset));
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_address_r(), offset));
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r(), offset));
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r(), offset));
corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(
corrected_cnt);
@@ -400,31 +412,33 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
/* clear the interrupt */
if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
nvgpu_writel(g,
gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
offset, 0);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r(),
offset), 0);
}
if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
nvgpu_writel(g,
gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
offset, 0);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r(),
offset), 0);
}
nvgpu_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset,
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_gpc0_mmu_l1tlb_ecc_status_r(), offset),
gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());
/* Handle overflow */
if (corrected_overflow != 0U) {
corrected_delta +=
BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s());
corrected_delta = nvgpu_safe_add_u32(corrected_delta,
BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()));
}
if (uncorrected_overflow != 0U) {
uncorrected_delta +=
BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());
uncorrected_delta = nvgpu_safe_add_u32(uncorrected_delta,
BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()));
}
*corrected_err += corrected_delta;
*uncorrected_err += uncorrected_delta;
*corrected_err = nvgpu_safe_add_u32(*corrected_err, corrected_delta);
*uncorrected_err = nvgpu_safe_add_u32(
*uncorrected_err, uncorrected_delta);
nvgpu_log(g, gpu_dbg_intr,
"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
@@ -494,14 +508,14 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
return;
}
ecc_status = nvgpu_readl(g,
gr_gpc0_gpccs_falcon_ecc_status_r() + offset);
ecc_addr = nvgpu_readl(g,
gr_gpc0_gpccs_falcon_ecc_address_r() + offset);
corrected_cnt = nvgpu_readl(g,
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset);
uncorrected_cnt = nvgpu_readl(g,
gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset);
ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_status_r(), offset));
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_address_r(), offset));
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r(), offset));
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r(), offset));
corrected_delta =
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(
@@ -518,21 +532,23 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
/* clear the interrupt */
if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
nvgpu_writel(g,
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() +
offset, 0);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r(),
offset), 0);
}
if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
nvgpu_writel(g,
gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() +
offset, 0);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r(),
offset), 0);
}
nvgpu_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset,
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_gpc0_gpccs_falcon_ecc_status_r(), offset),
gr_gpc0_gpccs_falcon_ecc_status_reset_task_f());
*corrected_err += corrected_delta;
*corrected_err += uncorrected_delta;
*corrected_err = nvgpu_safe_add_u32(*corrected_err, corrected_delta);
*uncorrected_err = nvgpu_safe_add_u32(
*uncorrected_err, uncorrected_delta);
nvgpu_log(g, gpu_dbg_intr,
"gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
@@ -788,11 +804,15 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
/* HW uses 16-bits counter */
if (is_l1_tag_ecc_corrected_total_err_overflow) {
l1_tag_corrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s());
l1_tag_corrected_err_count_delta =
nvgpu_safe_add_u32(
l1_tag_corrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s()));
}
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
l1_tag_corrected_err_count_delta;
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter,
l1_tag_corrected_err_count_delta);
if ((l1_tag_ecc_status &
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) {
@@ -826,11 +846,15 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
/* HW uses 16-bits counter */
if (is_l1_tag_ecc_uncorrected_total_err_overflow) {
l1_tag_uncorrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s());
l1_tag_uncorrected_err_count_delta =
nvgpu_safe_add_u32(
l1_tag_uncorrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s()));
}
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
l1_tag_uncorrected_err_count_delta;
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter,
l1_tag_uncorrected_err_count_delta);
if ((l1_tag_ecc_status &
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) {
@@ -930,17 +954,21 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
/* HW uses 16-bits counter */
if (is_lrf_ecc_corrected_total_err_overflow) {
lrf_corrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s());
lrf_corrected_err_count_delta =
nvgpu_safe_add_u32(
lrf_corrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s()));
}
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_corrected_err_count_delta;
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter,
lrf_corrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_LRF_ECC_CORRECTED, 0,
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter);
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset,
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r(), offset),
0);
}
if ((lrf_uncorrected_err_count_delta > 0U) || is_lrf_ecc_uncorrected_total_err_overflow) {
@@ -950,11 +978,15 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
/* HW uses 16-bits counter */
if (is_lrf_ecc_uncorrected_total_err_overflow) {
lrf_uncorrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s());
lrf_uncorrected_err_count_delta =
nvgpu_safe_add_u32(
lrf_uncorrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s()));
}
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_uncorrected_err_count_delta;
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
lrf_uncorrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_LRF_ECC_UNCORRECTED, 0,
@@ -1027,11 +1059,14 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
/* HW uses 16-bits counter */
if (is_cbu_ecc_corrected_total_err_overflow) {
cbu_corrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s());
cbu_corrected_err_count_delta =
nvgpu_safe_add_u32(cbu_corrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s()));
}
g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter +=
cbu_corrected_err_count_delta;
g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter,
cbu_corrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_CBU_ECC_CORRECTED,
@@ -1047,11 +1082,14 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
/* HW uses 16-bits counter */
if (is_cbu_ecc_uncorrected_total_err_overflow) {
cbu_uncorrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s());
cbu_uncorrected_err_count_delta =
nvgpu_safe_add_u32(cbu_uncorrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s()));
}
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
cbu_uncorrected_err_count_delta;
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter,
cbu_uncorrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_CBU_ECC_UNCORRECTED,
@@ -1120,11 +1158,15 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
/* HW uses 16-bits counter */
if (is_l1_data_ecc_corrected_total_err_overflow) {
l1_data_corrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s());
l1_data_corrected_err_count_delta =
nvgpu_safe_add_u32(
l1_data_corrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s()));
}
g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter +=
l1_data_corrected_err_count_delta;
g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter,
l1_data_corrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_DATA_ECC_CORRECTED,
@@ -1140,11 +1182,14 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
/* HW uses 16-bits counter */
if (is_l1_data_ecc_uncorrected_total_err_overflow) {
l1_data_uncorrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s());
l1_data_uncorrected_err_count_delta =
nvgpu_safe_add_u32(l1_data_uncorrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s()));
}
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
l1_data_uncorrected_err_count_delta;
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter,
l1_data_uncorrected_err_count_delta);
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_DATA_ECC_UNCORRECTED,
@@ -1216,11 +1261,14 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
/* HW uses 16-bits counter */
if (is_icache_ecc_corrected_total_err_overflow) {
icache_corrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s());
icache_corrected_err_count_delta =
nvgpu_safe_add_u32(icache_corrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s()));
}
g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter +=
icache_corrected_err_count_delta;
g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter,
icache_corrected_err_count_delta);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(), offset),
0);
@@ -1260,11 +1308,15 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
/* HW uses 16-bits counter */
if (is_icache_ecc_uncorrected_total_err_overflow) {
icache_uncorrected_err_count_delta +=
BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s());
icache_uncorrected_err_count_delta =
nvgpu_safe_add_u32(
icache_uncorrected_err_count_delta,
BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s()));
}
g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter +=
icache_uncorrected_err_count_delta;
g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter =
nvgpu_safe_add_u32(
g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter,
icache_uncorrected_err_count_delta);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(), offset),
0);