gpu: nvgpu: Fix CERT INT30-C errors in gr intr unit

Fix CERT INT30-C errors in the gr interrupt units.

cert_violation: Unsigned integer operation may wrap.

Use the nvgpu_safe_ops macros (nvgpu_safe_add_u32, nvgpu_safe_sub_u32)
for the flagged additions and subtractions.

Jira NVGPU-3412

Change-Id: Id2d936e77959005616faf069aff6701789342456
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2122474
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Seshendra Gadagottu <sgadagottu@nvidia.com>
Tested-by: Seshendra Gadagottu <sgadagottu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:     Vinod G
Date:       2019-05-20 16:16:17 -07:00
Committed:  mobile promotions
Parent:     d652c16fa3
Commit:     cd02e4d70f

5 changed files with 94 additions and 56 deletions
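
For context, the nvgpu_safe_ops helpers pulled in below via <nvgpu/safe_ops.h> replace plain u32 arithmetic with checked arithmetic that traps instead of wrapping, which is what satisfies INT30-C. A minimal sketch of the two helpers this change uses (a sketch only, assuming they BUG() on wrap; not the verbatim nvgpu implementation):

#include <limits.h>     /* UINT_MAX */
#include <nvgpu/bug.h>  /* BUG(); assumed available, as elsewhere in nvgpu */

/* Checked u32 add: trap instead of wrapping past UINT_MAX. */
static inline u32 nvgpu_safe_add_u32(u32 ui_a, u32 ui_b)
{
	if (UINT_MAX - ui_a < ui_b) {
		BUG();
	}
	return ui_a + ui_b;
}

/* Checked u32 subtract: trap instead of wrapping below zero. */
static inline u32 nvgpu_safe_sub_u32(u32 ui_a, u32 ui_b)
{
	if (ui_a < ui_b) {
		BUG();
	}
	return ui_a - ui_b;
}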


@@ -24,6 +24,7 @@
#include <nvgpu/io.h>
#include <nvgpu/channel.h>
#include <nvgpu/rc.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/power_features/pg.h>
#if defined(CONFIG_GK20A_CYCLE_STATS)
@@ -68,7 +69,8 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
{
int tmp_ret, ret = 0;
struct nvgpu_gr_tpc_exception pending_tpc;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
u32 tpc_exception = g->ops.gr.intr.get_tpc_exception(g, offset,
&pending_tpc);
u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
@@ -298,8 +300,8 @@ struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
intr->chid_tlb[intr->channel_tlb_flush_index].tsgid = tsgid;
intr->channel_tlb_flush_index =
-(intr->channel_tlb_flush_index + 1U) &
-(GR_CHANNEL_MAP_TLB_SIZE - 1U);
+(nvgpu_safe_add_u32(intr->channel_tlb_flush_index, 1U)) &
+(nvgpu_safe_sub_u32(GR_CHANNEL_MAP_TLB_SIZE, 1U));
unlock:
nvgpu_spinlock_release(&intr->ch_tlb_lock);
@@ -374,7 +376,8 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
int ret = 0;
bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
bool disable_sm_exceptions = true;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
bool sm_debugger_attached;
u32 global_esr, warp_esr, global_mask;
u64 hww_warp_esr_pc = 0;
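
The TLB flush-index update in the hunk above relies on GR_CHANNEL_MAP_TLB_SIZE being a power of two, so ANDing with (size - 1U) wraps the incremented index back to zero; the safe ops only guard the arithmetic, not that ring invariant. A standalone sketch of the pattern (the size value and checked-add helper are illustrative, not nvgpu's):

#include <stdint.h>
#include <stdio.h>

#define TLB_SIZE 256U	/* stand-in for GR_CHANNEL_MAP_TLB_SIZE; power of two */

/* Illustrative checked add mirroring nvgpu_safe_add_u32's contract. */
static uint32_t checked_add_u32(uint32_t a, uint32_t b)
{
	if (UINT32_MAX - a < b) {
		__builtin_trap();	/* nvgpu would BUG() here */
	}
	return a + b;
}

int main(void)
{
	uint32_t idx = TLB_SIZE - 1U;	/* last slot in the ring */

	/* (idx + 1) & (TLB_SIZE - 1) wraps 255 back to 0. */
	idx = checked_add_u32(idx, 1U) & (TLB_SIZE - 1U);
	printf("next index: %u\n", idx);	/* prints 0 */
	return 0;
}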


@@ -23,6 +23,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/class.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
@@ -345,7 +346,10 @@ u32 gm20b_gr_intr_read_gpc_exception(struct gk20a *g, u32 gpc)
{
u32 gpc_offset = nvgpu_gr_gpc_offset(g, gpc);
-return nvgpu_readl(g, gr_gpc0_gpccs_gpc_exception_r() + gpc_offset);
+return nvgpu_readl(g,
+nvgpu_safe_add_u32(
+gr_gpc0_gpccs_gpc_exception_r(),
+gpc_offset));
}
u32 gm20b_gr_intr_read_exception1(struct gk20a *g)
@@ -397,18 +401,21 @@ u32 gm20b_gr_intr_get_tpc_exception(struct gk20a *g, u32 offset,
void gm20b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(
+nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
u32 esr;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
esr = nvgpu_readl(g,
-gr_gpc0_tpc0_tex_m_hww_esr_r() + offset);
+nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tex_m_hww_esr_r(), offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr);
nvgpu_writel(g,
-gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
-esr);
+nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tex_m_hww_esr_r(), offset), esr);
}
void gm20b_gr_intr_enable_hww_exceptions(struct gk20a *g)
@@ -455,21 +462,22 @@ void gm20b_gr_intr_enable_gpc_exceptions(struct gk20a *g,
tpc_mask_calc = (u32)BIT32(
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config));
-tpc_mask = gr_gpcs_gpccs_gpc_exception_en_tpc_f(tpc_mask_calc - 1U);
+tpc_mask = gr_gpcs_gpccs_gpc_exception_en_tpc_f(
+nvgpu_safe_sub_u32(tpc_mask_calc, 1U));
nvgpu_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
}
void gm20ab_gr_intr_tpc_exception_sm_disable(struct gk20a *g, u32 offset)
{
-u32 tpc_exception_en = nvgpu_readl(g,
-gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
-offset);
+u32 tpc_exception_en = nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
+offset));
tpc_exception_en &=
~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
-nvgpu_writel(g,
-gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), offset),
tpc_exception_en);
}
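
In the hunks above, the TPC exception mask is built as BIT32(n) - 1U, i.e. an n-bit all-ones mask with one bit per TPC; nvgpu_safe_sub_u32 guards the decrement, which could only wrap if the shift produced zero. A toy illustration (BIT32 and the count are stand-ins, not nvgpu's definitions):

#include <stdint.h>
#include <stdio.h>

#define BIT32(i) ((uint32_t)1U << (i))	/* stand-in for nvgpu's BIT32() */

int main(void)
{
	uint32_t max_tpc_per_gpc = 4U;			/* illustrative count */
	uint32_t mask = BIT32(max_tpc_per_gpc) - 1U;	/* 0b1111 */

	printf("tpc exception mask: 0x%x\n", mask);	/* prints 0xf */
	return 0;
}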


@@ -24,6 +24,7 @@
#include <nvgpu/io.h>
#include <nvgpu/class.h>
#include <nvgpu/channel.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
@@ -290,7 +291,8 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
u32 *hww_global_esr)
{
int ret = 0;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
u32 lrf_single_count_delta, lrf_double_count_delta;
u32 shm_ecc_status;
@@ -300,7 +302,9 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
/* Check for LRF ECC errors. */
lrf_ecc_status = nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
+nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(),
+offset));
lrf_ecc_sed_status =
lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
@@ -314,17 +318,17 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
lrf_single_count_delta =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
-offset);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r(),
+offset));
lrf_double_count_delta =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
-offset);
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset, 0);
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset, 0);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r(),
+offset));
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r(), offset), 0);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r(), offset), 0);
if (lrf_ecc_sed_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM LRF!");
@@ -349,12 +353,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
}
-nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(), offset),
lrf_ecc_status);
/* Check for SHM ECC errors. */
-shm_ecc_status = nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset);
+shm_ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_status_r(),
+offset));
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
@@ -369,17 +375,18 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
"Single bit error detected in SM SHM!");
ecc_stats_reg_val =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset));
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
-ecc_stats_reg_val);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset), ecc_stats_reg_val);
}
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U ||
@@ -391,17 +398,19 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
"Double bit error detected in SM SHM!");
ecc_stats_reg_val =
-nvgpu_readl(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
+nvgpu_readl(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset));
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
-ecc_stats_reg_val);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r(),
+offset), ecc_stats_reg_val);
}
-nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset,
-shm_ecc_status);
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_sm_shm_ecc_status_r(),
+offset), shm_ecc_status);
return ret;
@@ -539,8 +548,9 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
ecc_stats_reg_val);
ecc_stats_reg_val &=
~gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_ded_m();
-nvgpu_writel(g,
-gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_pri_gpc0_tpc0_tex_m_ecc_cnt_unique_r(),
+offset),
ecc_stats_reg_val);
@@ -549,8 +559,8 @@ void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
gr_pri_gpc0_tpc0_tex_m_routing_sel_default_f());
}
-nvgpu_writel(g,
-gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
+nvgpu_writel(g, nvgpu_safe_add_u32(
+gr_gpc0_tpc0_tex_m_hww_esr_r(), offset),
esr | gr_gpc0_tpc0_tex_m_hww_esr_reset_active_f());
}
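
The hunks above all follow one pattern for the ECC counters: read the hardware delta, accumulate it into the driver's running counter, then write zero so the next interrupt reports a fresh delta; the safe add only hardens the register-address computation. A toy model of the flow (the register file and names are stand-ins for nvgpu_readl/nvgpu_writel and the gr_pri_* accessors):

#include <stdint.h>
#include <stdio.h>

static uint32_t regs[16];	/* toy register file */

static uint32_t read_reg(uint32_t addr)            { return regs[addr]; }
static void write_reg(uint32_t addr, uint32_t val) { regs[addr] = val; }

#define ECC_SINGLE_ERR_COUNT_R 3U	/* illustrative register index */

int main(void)
{
	uint32_t counter = 0U;	/* running SW counter, as in g->ecc.gr.* */

	regs[ECC_SINGLE_ERR_COUNT_R] = 5U;	/* pretend HW counted 5 errors */

	/* Read delta, accumulate, reset the HW counter. */
	counter += read_reg(ECC_SINGLE_ERR_COUNT_R);
	write_reg(ECC_SINGLE_ERR_COUNT_R, 0U);

	printf("accumulated: %u\n", counter);	/* prints 5 */
	return 0;
}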


@@ -23,6 +23,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/class.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
@@ -271,7 +272,8 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
/* Check for gcc l15 ECC errors. */
gcc_l15_ecc_status = nvgpu_readl(g,
-gr_pri_gpc0_gcc_l15_ecc_status_r() + offset);
+nvgpu_safe_add_u32(
+gr_pri_gpc0_gcc_l15_ecc_status_r(), offset));
gcc_l15_ecc_corrected_err_status = gcc_l15_ecc_status &
(gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank0_m() |
gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank1_m());
@@ -364,7 +366,9 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
return;
}
-hww_esr = nvgpu_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);
+hww_esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_mmu_gpcmmu_global_esr_r(),
+offset));
if ((hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())) == 0U) {
@@ -478,7 +482,9 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
return;
}
-hww_esr = nvgpu_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
+hww_esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_gpccs_hww_esr_r(),
+offset));
if ((hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() |
gr_gpc0_gpccs_hww_esr_ecc_corrected_m())) == 0U) {
@@ -572,21 +578,28 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
u32 esr;
-u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
+u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+nvgpu_gr_tpc_offset(g, tpc));
-esr = nvgpu_readl(g, gr_gpc0_tpc0_mpc_hww_esr_r() + offset);
+esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_r(),
+offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "mpc hww esr 0x%08x", esr);
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
GPU_PGRAPH_MPC_EXCEPTION,
esr);
-esr = nvgpu_readl(g, gr_gpc0_tpc0_mpc_hww_esr_info_r() + offset);
+esr = nvgpu_readl(g,
+nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),
+offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"mpc hww esr info: veid 0x%08x",
gr_gpc0_tpc0_mpc_hww_esr_info_veid_v(esr));
-nvgpu_writel(g, gr_gpc0_tpc0_mpc_hww_esr_r() + offset,
+nvgpu_writel(g,
+nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_r(),
+offset),
gr_gpc0_tpc0_mpc_hww_esr_reset_trigger_f());
}
@@ -644,7 +657,8 @@ void gv11b_gr_intr_enable_exceptions(struct gk20a *g,
nvgpu_writel(g, gr_exception2_en_r(), 0x0U); /* BE not enabled */
reg_val = (u32)BIT32(nvgpu_gr_config_get_gpc_count(gr_config));
-nvgpu_writel(g, gr_exception1_en_r(), (reg_val - 1U));
+nvgpu_writel(g, gr_exception1_en_r(),
+nvgpu_safe_sub_u32(reg_val, 1U));
reg_val = gr_exception_en_fe_enabled_f() |
gr_exception_en_memfmt_enabled_f() |
@@ -673,7 +687,8 @@ void gv11b_gr_intr_enable_gpc_exceptions(struct gk20a *g,
tpc_mask_calc = (u32)BIT32(
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config));
tpc_mask =
-gr_gpcs_gpccs_gpc_exception_en_tpc_f(tpc_mask_calc - 1U);
+gr_gpcs_gpccs_gpc_exception_en_tpc_f(
+nvgpu_safe_sub_u32(tpc_mask_calc, 1U));
nvgpu_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
(tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1U) |


@@ -23,6 +23,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/class.h>
+#include <nvgpu/safe_ops.h>
#include <nvgpu/gr/config.h>
@@ -146,7 +147,8 @@ void tu104_gr_intr_enable_gpc_exceptions(struct gk20a *g,
tpc_mask_calc = (u32)BIT32(
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config));
tpc_mask =
-gr_gpcs_gpccs_gpc_exception_en_tpc_f(tpc_mask_calc - 1U);
+gr_gpcs_gpccs_gpc_exception_en_tpc_f(
+nvgpu_safe_sub_u32(tpc_mask_calc, 1U));
nvgpu_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
(tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1U) |