gpu: nvgpu: Fix CERT INT30-C errors in gr intr unit

Fix CERT INT30-C errors in the gr interrupt units.

cert_violation: Unsigned integer operation may wrap.

Use the nvgpu_safe_ops macros (nvgpu_safe_add_u32, nvgpu_safe_sub_u32)
for the flagged additions and subtractions.

Jira NVGPU-3412

Change-Id: Id2d936e77959005616faf069aff6701789342456
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2122474
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Seshendra Gadagottu <sgadagottu@nvidia.com>
Tested-by: Seshendra Gadagottu <sgadagottu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:     Vinod G
Date:       2019-05-20 16:16:17 -07:00
Committed:  mobile promotions
Parent:     d652c16fa3
Commit:     cd02e4d70f

5 changed files with 94 additions and 56 deletions
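
For context, the nvgpu_safe_ops helpers pulled in below via <nvgpu/safe_ops.h> replace plain u32 arithmetic with checked arithmetic that traps instead of wrapping, which is what satisfies INT30-C. A minimal sketch of the two helpers this change uses (a sketch only, assuming they BUG() on wrap; not the verbatim nvgpu implementation):

#include <limits.h>     /* UINT_MAX */
#include <nvgpu/bug.h>  /* BUG(); assumed available, as elsewhere in nvgpu */

/* Checked u32 add: trap instead of wrapping past UINT_MAX. */
static inline u32 nvgpu_safe_add_u32(u32 ui_a, u32 ui_b)
{
	if (UINT_MAX - ui_a < ui_b) {
		BUG();
	}
	return ui_a + ui_b;
}

/* Checked u32 subtract: trap instead of wrapping below zero. */
static inline u32 nvgpu_safe_sub_u32(u32 ui_a, u32 ui_b)
{
	if (ui_a < ui_b) {
		BUG();
	}
	return ui_a - ui_b;
}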


@@ -24,6 +24,7 @@
#include <nvgpu/io.h>
#include <nvgpu/channel.h>
#include <nvgpu/rc.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/power_features/pg.h>
#if defined(CONFIG_GK20A_CYCLE_STATS)
@@ -68,7 +69,8 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
{
int tmp_ret, ret = 0;
struct nvgpu_gr_tpc_exception pending_tpc;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
u32 tpc_exception = g->ops.gr.intr.get_tpc_exception(g, offset,
&pending_tpc);
u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
@@ -298,8 +300,8 @@ struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
intr->chid_tlb[intr->channel_tlb_flush_index].tsgid = tsgid;
intr->channel_tlb_flush_index =
-(intr->channel_tlb_flush_index + 1U) &
-(GR_CHANNEL_MAP_TLB_SIZE - 1U);
+(nvgpu_safe_add_u32(intr->channel_tlb_flush_index, 1U)) &
+(nvgpu_safe_sub_u32(GR_CHANNEL_MAP_TLB_SIZE, 1U));
unlock:
nvgpu_spinlock_release(&intr->ch_tlb_lock);
@@ -374,7 +376,8 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
int ret = 0;
bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
bool disable_sm_exceptions = true;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
bool sm_debugger_attached;
u32 global_esr, warp_esr, global_mask;
u64 hww_warp_esr_pc = 0;
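
The TLB flush-index update in the hunk above relies on GR_CHANNEL_MAP_TLB_SIZE being a power of two, so ANDing with (size - 1U) wraps the incremented index back to zero; the safe ops only guard the arithmetic, not that ring invariant. A standalone sketch of the pattern (the size value and checked-add helper are illustrative, not nvgpu's):

#include <stdint.h>
#include <stdio.h>

#define TLB_SIZE 256U	/* stand-in for GR_CHANNEL_MAP_TLB_SIZE; power of two */

/* Illustrative checked add mirroring nvgpu_safe_add_u32's contract. */
static uint32_t checked_add_u32(uint32_t a, uint32_t b)
{
	if (UINT32_MAX - a < b) {
		__builtin_trap();	/* nvgpu would BUG() here */
	}
	return a + b;
}

int main(void)
{
	uint32_t idx = TLB_SIZE - 1U;	/* last slot in the ring */

	/* (idx + 1) & (TLB_SIZE - 1) wraps 255 back to 0. */
	idx = checked_add_u32(idx, 1U) & (TLB_SIZE - 1U);
	printf("next index: %u\n", idx);	/* prints 0 */
	return 0;
}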


@@ -23,6 +23,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/class.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
@@ -345,7 +346,10 @@ u32 gm20b_gr_intr_read_gpc_exception(struct gk20a *g, u32 gpc)
{
u32 gpc_offset = nvgpu_gr_gpc_offset(g, gpc);
-return nvgpu_readl(g, gr_gpc0_gpccs_gpc_exception_r() + gpc_offset);
+return nvgpu_readl(g,
+nvgpu_safe_add_u32(
+gr_gpc0_gpccs_gpc_exception_r(),
+gpc_offset));
}
u32 gm20b_gr_intr_read_exception1(struct gk20a *g)
@@ -397,18 +401,21 @@ u32 gm20b_gr_intr_get_tpc_exception(struct gk20a *g, u32 offset,
void gm20b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(
+nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
u32 esr;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
esr = nvgpu_readl(g,
-gr_gpc0_tpc0_tex_m_hww_esr_r() + offset);
+nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tex_m_hww_esr_r(), offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr);
nvgpu_writel(g,
-gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
-esr);
+nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tex_m_hww_esr_r(), offset), esr);
}
void gm20b_gr_intr_enable_hww_exceptions(struct gk20a *g)
@@ -455,21 +462,22 @@ void gm20b_gr_intr_enable_gpc_exceptions(struct gk20a *g,
tpc_mask_calc = (u32)BIT32(
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config));
-tpc_mask = gr_gpcs_gpccs_gpc_exception_en_tpc_f(tpc_mask_calc - 1U);
+tpc_mask = gr_gpcs_gpccs_gpc_exception_en_tpc_f(
+nvgpu_safe_sub_u32(tpc_mask_calc, 1U));
nvgpu_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
}
void gm20ab_gr_intr_tpc_exception_sm_disable(struct gk20a *g, u32 offset)
{
-u32 tpc_exception_en = nvgpu_readl(g,
-gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
-offset);
+u32 tpc_exception_en = nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
+offset));
tpc_exception_en &=
~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
-nvgpu_writel(g,
-gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), offset),
tpc_exception_en);
}
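
In the hunks above, the TPC exception mask is built as BIT32(n) - 1U, i.e. an n-bit all-ones mask with one bit per TPC; nvgpu_safe_sub_u32 guards the decrement, which could only wrap if the shift produced zero. A toy illustration (BIT32 and the count are stand-ins, not nvgpu's definitions):

#include <stdint.h>
#include <stdio.h>

#define BIT32(i) ((uint32_t)1U << (i))	/* stand-in for nvgpu's BIT32() */

int main(void)
{
	uint32_t max_tpc_per_gpc = 4U;			/* illustrative count */
	uint32_t mask = BIT32(max_tpc_per_gpc) - 1U;	/* 0b1111 */

	printf("tpc exception mask: 0x%x\n", mask);	/* prints 0xf */
	return 0;
}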


@@ -24,6 +24,7 @@
#include <nvgpu/io.h>
#include <nvgpu/class.h>
#include <nvgpu/channel.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
@@ -290,7 +291,8 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
u32 *hww_global_esr)
{
int ret = 0;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
u32 lrf_single_count_delta, lrf_double_count_delta;
u32 shm_ecc_status;
@@ -300,7 +302,9 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
/* Check for LRF ECC errors. */
lrf_ecc_status = nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
+nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(),
+offset));
lrf_ecc_sed_status =
lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
@@ -314,17 +318,17 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
lrf_single_count_delta =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
-offset);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r(),
+offset));
lrf_double_count_delta =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
-offset);
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset, 0);
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset, 0);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r(),
+offset));
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r(), offset), 0);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r(), offset), 0);
if (lrf_ecc_sed_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM LRF!");
@@ -349,12 +353,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
}
-nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(), offset),
lrf_ecc_status);
/* Check for SHM ECC errors. */
-shm_ecc_status = nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset);
+shm_ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_status_r(),
+offset));
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
@@ -369,17 +375,18 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
"Single bit error detected in SM SHM!");
ecc_stats_reg_val =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset));
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
-ecc_stats_reg_val);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset), ecc_stats_reg_val);
}
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U ||
@@ -391,17 +398,19 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
"Double bit error detected in SM SHM!");
ecc_stats_reg_val =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset));
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
-ecc_stats_reg_val);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset), ecc_stats_reg_val);
}
-nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset,
-shm_ecc_status);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_status_r(),
+offset), shm_ecc_status);
return ret;
@@ -539,8 +548,9 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val);
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_m();
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(),
+offset),
ecc_stats_reg_val);
@@ -549,8 +559,8 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
gr_pri_gpc0_tpc0_tex_m_routing_sel_default_f());
}
-nvgpu_writel(g,
-gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tex_m_hww_esr_r(), offset),
esr | gr_gpc0_tpc0_tex_m_hww_esr_reset_active_f());
}
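
The hunks above all follow one pattern for the ECC counters: read the hardware delta, accumulate it into the driver's running counter, then write zero so the next interrupt reports a fresh delta; the safe add only hardens the register-address computation. A toy model of the flow (the register file and names are stand-ins for nvgpu_readl/nvgpu_writel and the gr_pri_* accessors):

#include <stdint.h>
#include <stdio.h>

static uint32_t regs[16];	/* toy register file */

static uint32_t read_reg(uint32_t addr)            { return regs[addr]; }
static void write_reg(uint32_t addr, uint32_t val) { regs[addr] = val; }

#define ECC_SINGLE_ERR_COUNT_R 3U	/* illustrative register index */

int main(void)
{
	uint32_t counter = 0U;	/* running SW counter, as in g->ecc.gr.* */

	regs[ECC_SINGLE_ERR_COUNT_R] = 5U;	/* pretend HW counted 5 errors */

	/* Read delta, accumulate, reset the HW counter. */
	counter += read_reg(ECC_SINGLE_ERR_COUNT_R);
	write_reg(ECC_SINGLE_ERR_COUNT_R, 0U);

	printf("accumulated: %u\n", counter);	/* prints 5 */
	return 0;
}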


@@ -23,6 +23,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/class.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
@@ -271,7 +272,8 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
/* Check for gcc l15 ECC errors. */
gcc_l15_ecc_status = nvgpu_readl(g,
-gr_pri_gpc0_gcc_l15_ecc_status_r() + offset);
+nvgpu_safe_add_u32(
+gr_pri_gpc0_gcc_l15_ecc_status_r(), offset));
gcc_l15_ecc_corrected_err_status = gcc_l15_ecc_status &
(gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank0_m() |
gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank1_m());
@@ -364,7 +366,9 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
return;
}
-hww_esr = nvgpu_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);
+hww_esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_mmu_gpcmmu_global_esr_r(),
+offset));
if ((hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())) == 0U) {
@@ -478,7 +482,9 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
return;
}
-hww_esr = nvgpu_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
+hww_esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_gpccs_hww_esr_r(),
+offset));
if ((hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() |
gr_gpc0_gpccs_hww_esr_ecc_corrected_m())) == 0U) {
@@ -572,21 +578,28 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
u32 esr;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
-esr = nvgpu_readl(g, gr_gpc0_tpc0_mpc_hww_esr_r() + offset);
+esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_r(),
+offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "mpc hww esr 0x%08x", esr);
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
GPU_PGRAPH_MPC_EXCEPTION,
esr);
-esr = nvgpu_readl(g, gr_gpc0_tpc0_mpc_hww_esr_info_r() + offset);
+esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),
+offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"mpc hww esr info: veid 0x%08x",
gr_gpc0_tpc0_mpc_hww_esr_info_veid_v(esr));
-nvgpu_writel(g, gr_gpc0_tpc0_mpc_hww_esr_r() + offset,
+nvgpu_writel(g,
+nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_r(),
+offset),
gr_gpc0_tpc0_mpc_hww_esr_reset_trigger_f());
}
@@ -644,7 +657,8 @@ void gv11b_gr_intr_enable_exceptions(struct gk20a *g,
nvgpu_writel(g, gr_exception2_en_r(), 0x0U); /* BE not enabled */
reg_val = (u32)BIT32(nvgpu_gr_config_get_gpc_count(gr_config));
-nvgpu_writel(g, gr_exception1_en_r(), (reg_val - 1U));
+nvgpu_writel(g, gr_exception1_en_r(),
+nvgpu_safe_sub_u32(reg_val, 1U));
reg_val = gr_exception_en_fe_enabled_f() |
gr_exception_en_memfmt_enabled_f() |
@@ -673,7 +687,8 @@ void gv11b_gr_intr_enable_gpc_exceptions(struct gk20a *g,
tpc_mask_calc = (u32)BIT32(
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config));
tpc_mask =
-gr_gpcs_gpccs_gpc_exception_en_tpc_f(tpc_mask_calc - 1U);
+gr_gpcs_gpccs_gpc_exception_en_tpc_f(
+nvgpu_safe_sub_u32(tpc_mask_calc, 1U));
nvgpu_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
(tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1U) |


@@ -23,6 +23,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/class.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
@@ -146,7 +147,8 @@ void tu104_gr_intr_enable_gpc_exceptions(struct gk20a *g,
tpc_mask_calc = (u32)BIT32(
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config));
tpc_mask =
-gr_gpcs_gpccs_gpc_exception_en_tpc_f(tpc_mask_calc - 1U);
+gr_gpcs_gpccs_gpc_exception_en_tpc_f(
+nvgpu_safe_sub_u32(tpc_mask_calc, 1U));
nvgpu_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
(tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1U) |