gpu: nvgpu: move handle_sm_exception to gr.intr

Move gr_gp10b_handle_sm_exception from gr_gp10b to the hal.gr.intr unit
and rename it gp10b_gr_intr_handle_sm_exception

Move gr_gk20a_handle_sm_exception from gr_gk20a to the common.gr.intr unit
and rename it nvgpu_gr_intr_handle_sm_exception

Move nvgpu_report_gr_sm_exception to common.gr.intr as the static helper
gr_intr_report_sm_exception, and switch callers to the new
g->ops.gr.intr.handle_sm_exception HAL op

JIRA NVGPU-3016

Change-Id: I545ddca052122f87685f35f515831841a246dab3
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2103736
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Vinod G <vinodg@nvidia.com> (2019-04-23 14:37:02 -07:00)
Committed by: mobile promotions
Commit: 490ea365d2, parent: 3bd35af767
16 changed files with 331 additions and 332 deletions
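At its core the change is a HAL relocation rather than new behavior: the handle_sm_exception function pointer moves out of the flat g->ops.gr table into the g->ops.gr.intr sub-struct, and callers such as the TPC exception path are updated to match. The stand-alone sketch below uses simplified stand-in types, not the driver's real headers, and common_handle_sm_exception is a hypothetical stand-in for nvgpu_gr_intr_handle_sm_exception; it only illustrates the new call path through gr.intr.

/*
 * Simplified sketch of the relocated HAL op.  The struct layout below is a
 * pared-down stand-in for the real gpu_ops; only the member touched by this
 * commit is modeled.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct gk20a;
struct channel_gk20a;

struct gpu_ops {
	struct {
		struct {
			/* op now lives in gr.intr (was directly in gr) */
			int (*handle_sm_exception)(struct gk20a *g,
				uint32_t gpc, uint32_t tpc, uint32_t sm,
				bool *post_event,
				struct channel_gk20a *fault_ch,
				uint32_t *hww_global_esr);
		} intr;
	} gr;
};

struct gk20a {
	struct gpu_ops ops;
};

/* hypothetical stand-in for nvgpu_gr_intr_handle_sm_exception() */
static int common_handle_sm_exception(struct gk20a *g,
		uint32_t gpc, uint32_t tpc, uint32_t sm,
		bool *post_event, struct channel_gk20a *fault_ch,
		uint32_t *hww_global_esr)
{
	(void)g; (void)fault_ch;
	*hww_global_esr = 0U;
	*post_event = true;	/* let the debugger layer see the event */
	printf("SM exception: GPC%u TPC%u SM%u\n",
		(unsigned)gpc, (unsigned)tpc, (unsigned)sm);
	return 0;
}

int main(void)
{
	struct gk20a g;
	bool post_event = false;
	uint32_t global_esr = 0U;

	/* chip HAL init wires a handler into the intr sub-struct, as the
	 * per-chip ops tables do in this commit */
	g.ops.gr.intr.handle_sm_exception = common_handle_sm_exception;

	/* the TPC exception path now dereferences g->ops.gr.intr */
	return g.ops.gr.intr.handle_sm_exception(&g, 0U, 0U, 0U,
			&post_event, NULL, &global_esr);
}

In the real ops tables this assignment points either at the common handler directly or at a chip-specific wrapper such as gp10b_gr_intr_handle_sm_exception.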

View File

@@ -26,6 +26,7 @@
#include <nvgpu/regops.h>
#include <nvgpu/rc.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/gr/gr.h>
#include <nvgpu/gr/gr_intr.h>
@@ -71,7 +72,7 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
"GPC%d TPC%d: SM%d exception pending",
gpc, tpc, sm);
tmp_ret = g->ops.gr.handle_sm_exception(g,
tmp_ret = g->ops.gr.intr.handle_sm_exception(g,
gpc, tpc, sm, post_event, fault_ch,
hww_global_esr);
ret = (ret != 0) ? ret : tmp_ret;
@@ -153,6 +154,48 @@ static int gr_intr_handle_class_error(struct gk20a *g,
return -EINVAL;
}
static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
int ret;
struct gr_sm_mcerr_info err_info;
struct channel_gk20a *ch;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx, inst = 0;
if (g->ops.gr.err_ops.report_gr_err == NULL) {
return;
}
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
if (ch != NULL) {
gk20a_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.hww_warp_esr_pc = hww_warp_esr_pc;
err_info.hww_warp_esr_status = hww_warp_esr_status;
err_info.gpc = gpc;
err_info.tpc = tpc;
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
ret = g->ops.gr.err_ops.report_gr_err(g,
NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR,
&info);
if (ret != 0) {
nvgpu_err(g, "failed to report SM_EXCEPTION "
"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
gpc, tpc, sm, hww_warp_esr_status);
}
}
/* Used by sw interrupt thread to translate current ctx to chid.
* Also used by regops to translate current ctx to chid and tsgid.
* For performance, we don't want to go through 128 channels every time.
@@ -295,6 +338,112 @@ void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
}
}
int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
bool disable_sm_exceptions = true;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
bool sm_debugger_attached;
u32 global_esr, warp_esr, global_mask;
u64 hww_warp_esr_pc = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
*hww_global_esr = global_esr;
warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
if (!sm_debugger_attached) {
nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
global_esr, warp_esr);
return -EFAULT;
}
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
/*
* Check and report any fatal warp errors.
*/
if ((global_esr & ~global_mask) != 0U) {
if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) {
hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g,
offset);
}
gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr,
hww_warp_esr_pc);
}
nvgpu_pg_elpg_protected_call(g,
g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
if (g->ops.gr.pre_process_sm_exception != NULL) {
ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
global_esr, warp_esr,
sm_debugger_attached,
fault_ch,
&early_exit,
&ignore_debugger);
if (ret != 0) {
nvgpu_err(g, "could not pre-process sm error!");
return ret;
}
}
if (early_exit) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"returning early");
return ret;
}
/*
* Disable forwarding of tpc exceptions,
* the debugger will reenable exceptions after servicing them.
*
* Do not disable exceptions if the only SM exception is BPT_INT
*/
if ((g->ops.gr.esr_bpt_pending_events(global_esr,
NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) {
disable_sm_exceptions = false;
}
if (!ignore_debugger && disable_sm_exceptions) {
g->ops.gr.intr.tpc_exception_sm_disable(g, offset);
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"SM Exceptions disabled");
}
/* if a debugger is present and an error has occurred, do a warp sync */
if (!ignore_debugger &&
((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) {
nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
do_warp_sync = true;
}
if (do_warp_sync) {
ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
global_mask, true);
if (ret != 0) {
nvgpu_err(g, "sm did not lock down!");
return ret;
}
}
if (ignore_debugger) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"ignore_debugger set, skipping event posting");
} else {
*post_event = true;
}
return ret;
}
int nvgpu_gr_intr_handle_gpc_exception(struct gk20a *g, bool *post_event,
struct nvgpu_gr_config *gr_config, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)

View File

@@ -139,7 +139,6 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.set_sm_debug_mode = vgpu_gr_set_sm_debug_mode,
.bpt_reg_info = NULL,
.handle_fecs_error = NULL,
.handle_sm_exception = NULL,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode,

View File

@@ -165,7 +165,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.set_sm_debug_mode = vgpu_gr_set_sm_debug_mode,
.bpt_reg_info = NULL,
.handle_fecs_error = NULL,
.handle_sm_exception = NULL,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode,

View File

@@ -59,7 +59,6 @@
#include <nvgpu/engine_status.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/power_features/cg.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/preempt.h>
#include "gr_gk20a.h"
@@ -67,51 +66,8 @@
#include "common/gr/gr_priv.h"
#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
static void nvgpu_report_gr_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
int ret;
struct gr_sm_mcerr_info err_info;
struct channel_gk20a *ch;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx, inst = 0;
if (g->ops.gr.err_ops.report_gr_err == NULL) {
return;
}
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
if (ch != NULL) {
gk20a_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.hww_warp_esr_pc = hww_warp_esr_pc;
err_info.hww_warp_esr_status = hww_warp_esr_status;
err_info.gpc = gpc;
err_info.tpc = tpc;
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
ret = g->ops.gr.err_ops.report_gr_err(g,
NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR,
&info);
if (ret != 0) {
nvgpu_err(g, "failed to report SM_EXCEPTION "
"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
gpc, tpc, sm, hww_warp_esr_status);
}
}
static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value)
{
@@ -373,112 +329,6 @@ bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
return false;
}
int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
bool disable_sm_exceptions = true;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
bool sm_debugger_attached;
u32 global_esr, warp_esr, global_mask;
u64 hww_warp_esr_pc = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
*hww_global_esr = global_esr;
warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
if (!sm_debugger_attached) {
nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
global_esr, warp_esr);
return -EFAULT;
}
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
/*
* Check and report any fatal warp errors.
*/
if ((global_esr & ~global_mask) != 0U) {
if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) {
hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g,
offset);
}
nvgpu_report_gr_sm_exception(g, gpc, tpc, sm, warp_esr,
hww_warp_esr_pc);
}
nvgpu_pg_elpg_protected_call(g,
g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
if (g->ops.gr.pre_process_sm_exception != NULL) {
ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
global_esr, warp_esr,
sm_debugger_attached,
fault_ch,
&early_exit,
&ignore_debugger);
if (ret != 0) {
nvgpu_err(g, "could not pre-process sm error!");
return ret;
}
}
if (early_exit) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"returning early");
return ret;
}
/*
* Disable forwarding of tpc exceptions,
* the debugger will reenable exceptions after servicing them.
*
* Do not disable exceptions if the only SM exception is BPT_INT
*/
if ((g->ops.gr.esr_bpt_pending_events(global_esr,
NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) {
disable_sm_exceptions = false;
}
if (!ignore_debugger && disable_sm_exceptions) {
g->ops.gr.intr.tpc_exception_sm_disable(g, offset);
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"SM Exceptions disabled");
}
/* if a debugger is present and an error has occurred, do a warp sync */
if (!ignore_debugger &&
((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) {
nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
do_warp_sync = true;
}
if (do_warp_sync) {
ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
global_mask, true);
if (ret != 0) {
nvgpu_err(g, "sm did not lock down!");
return ret;
}
}
if (ignore_debugger) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"ignore_debugger set, skipping event posting");
} else {
*post_event = true;
}
return ret;
}
void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
u32 *esr_sm_sel)
{

View File

@@ -96,9 +96,6 @@ void gk20a_gr_suspend_all_sms(struct gk20a *g,
int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
struct channel_gk20a *ch, u64 sms, bool enable);
bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch);
int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
#if defined(CONFIG_GK20A_CYCLE_STATS)
int gr_gk20a_css_attach(struct channel_gk20a *ch, /* in - main hw structure */

View File

@@ -57,164 +57,6 @@
#include <nvgpu/hw/gp10b/hw_gr_gp10b.h>
#include <nvgpu/hw/gp10b/hw_fifo_gp10b.h>
static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err,
u32 sed_status,
u32 ded_status,
u32 *count_to_adjust,
u32 opposite_count)
{
u32 over_count = 0;
sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b();
ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b();
/* One overcount for each partition on which a SBE occurred but not a
DBE (or vice-versa) */
if (single_err) {
over_count = (u32)hweight32(sed_status & ~ded_status);
} else {
over_count = (u32)hweight32(ded_status & ~sed_status);
}
/* If both a SBE and a DBE occur on the same partition, then we have an
overcount for the subpartition if the opposite error counts are
zero. */
if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) {
over_count += (u32)hweight32(sed_status & ded_status);
}
if (*count_to_adjust > over_count) {
*count_to_adjust -= over_count;
} else {
*count_to_adjust = 0;
}
}
int gr_gp10b_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
u32 lrf_single_count_delta, lrf_double_count_delta;
u32 shm_ecc_status;
ret = gr_gk20a_handle_sm_exception(g,
gpc, tpc, sm, post_event, fault_ch, hww_global_esr);
/* Check for LRF ECC errors. */
lrf_ecc_status = gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
lrf_ecc_sed_status = lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f());
lrf_ecc_ded_status = lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
lrf_single_count_delta =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
offset);
lrf_double_count_delta =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
offset);
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset,
0);
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset,
0);
if (lrf_ecc_sed_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(true,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_single_count_delta,
lrf_double_count_delta);
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_single_count_delta;
}
if (lrf_ecc_ded_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(false,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_double_count_delta,
lrf_single_count_delta);
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
}
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
lrf_ecc_status);
/* Check for SHM ECC errors. */
shm_ecc_status = gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset);
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) != 0U ) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM SHM!");
ecc_stats_reg_val =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) != 0U) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM SHM!");
ecc_stats_reg_val =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset,
shm_ecc_status);
return ret;
}
void gr_gp10b_set_bes_crop_debug3(struct gk20a *g, u32 data)
{
u32 val;

View File

@@ -43,11 +43,6 @@ int gr_gp10b_handle_fecs_error(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data);
int gr_gp10b_set_cilp_preempt_pending(struct gk20a *g,
struct channel_gk20a *fault_ch);
int gr_gp10b_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int gr_gp10b_commit_global_cb_manager(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, bool patch);
void gr_gp10b_set_bes_crop_debug3(struct gk20a *g, u32 data);

View File

@@ -25,6 +25,8 @@
#include <nvgpu/class.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
#include <nvgpu/gr/gr_intr.h>
#include "gr_intr_gp10b.h"
@@ -103,12 +105,165 @@ fail:
return -EINVAL;
}
static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err,
u32 sed_status,
u32 ded_status,
u32 *count_to_adjust,
u32 opposite_count)
{
u32 over_count = 0;
sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b();
ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b();
/* One overcount for each partition on which a SBE occurred but not a
DBE (or vice-versa) */
if (single_err) {
over_count = (u32)hweight32(sed_status & ~ded_status);
} else {
over_count = (u32)hweight32(ded_status & ~sed_status);
}
/* If both a SBE and a DBE occur on the same partition, then we have an
overcount for the subpartition if the opposite error counts are
zero. */
if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) {
over_count += (u32)hweight32(sed_status & ded_status);
}
if (*count_to_adjust > over_count) {
*count_to_adjust -= over_count;
} else {
*count_to_adjust = 0;
}
}
int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
u32 lrf_single_count_delta, lrf_double_count_delta;
u32 shm_ecc_status;
ret = nvgpu_gr_intr_handle_sm_exception(g,
gpc, tpc, sm, post_event, fault_ch, hww_global_esr);
/* Check for LRF ECC errors. */
lrf_ecc_status = nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
lrf_ecc_sed_status =
lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f());
lrf_ecc_ded_status =
lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
lrf_single_count_delta =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
offset);
lrf_double_count_delta =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
offset);
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset, 0);
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset, 0);
if (lrf_ecc_sed_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(true,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_single_count_delta,
lrf_double_count_delta);
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_single_count_delta;
}
if (lrf_ecc_ded_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(false,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_double_count_delta,
lrf_single_count_delta);
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
}
nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
lrf_ecc_status);
/* Check for SHM ECC errors. */
shm_ecc_status = nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset);
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) != 0U ) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM SHM!");
ecc_stats_reg_val =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) != 0U) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM SHM!");
ecc_stats_reg_val =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset,
shm_ecc_status);
return ret;
}
void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_TPC_IN_GPC_STRIDE);
u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
u32 esr;
u32 ecc_stats_reg_val;

View File

@@ -26,6 +26,7 @@
#include <nvgpu/types.h>
struct gk20a;
struct channel_gk20a;
#define NVC097_SET_GO_IDLE_TIMEOUT 0x022cU
#define NVC097_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dcU
@@ -43,4 +44,8 @@ void gp10b_gr_intr_set_go_idle_timeout(struct gk20a *g, u32 data);
void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc);
int gp10b_gr_intr_handle_sw_method(struct gk20a *g, u32 addr,
u32 class_num, u32 offset, u32 data);
int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
#endif /* NVGPU_GR_INTR_GP10B_H */
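The header above declares the gp10b-specific op; the ops tables in the files that follow show the two wiring patterns this commit uses. gm20b, gv100, gv11b and tu104 point intr.handle_sm_exception straight at the common nvgpu_gr_intr_handle_sm_exception, while gp10b installs gp10b_gr_intr_handle_sm_exception, which calls the common handler first and then harvests the per-SM LRF/SHM ECC counters. A compact sketch of that layering follows, with hypothetical names (chip_sm_exception, chip_collect_sm_ecc) and a simplified signature; it is not the driver code itself.

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct gk20a;
struct channel_gk20a;

/* simplified stand-in for the common nvgpu_gr_intr_handle_sm_exception() */
static int common_sm_exception(struct gk20a *g, uint32_t gpc, uint32_t tpc,
			       uint32_t sm, bool *post_event,
			       struct channel_gk20a *fault_ch,
			       uint32_t *hww_global_esr)
{
	(void)g; (void)gpc; (void)tpc; (void)sm; (void)fault_ch;
	*hww_global_esr = 0U;
	*post_event = true;
	return 0;
}

/* hypothetical stand-in for the chip-specific ECC harvesting, i.e. the
 * LRF/SHM status reads and counter updates that
 * gp10b_gr_intr_handle_sm_exception performs after the common path */
static void chip_collect_sm_ecc(struct gk20a *g, uint32_t gpc, uint32_t tpc)
{
	(void)g; (void)gpc; (void)tpc;
}

/* gp10b-style op: run the common handler first, then do chip ECC work */
static int chip_sm_exception(struct gk20a *g, uint32_t gpc, uint32_t tpc,
			     uint32_t sm, bool *post_event,
			     struct channel_gk20a *fault_ch,
			     uint32_t *hww_global_esr)
{
	int ret = common_sm_exception(g, gpc, tpc, sm, post_event,
				      fault_ch, hww_global_esr);

	chip_collect_sm_ecc(g, gpc, tpc);
	return ret;
}

int main(void)
{
	bool post_event = false;
	uint32_t esr = 0U;

	return chip_sm_exception(NULL, 0U, 0U, 0U, &post_event, NULL, &esr);
}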

View File

@@ -265,7 +265,6 @@ static const struct gpu_ops gm20b_ops = {
.set_sm_debug_mode = gr_gk20a_set_sm_debug_mode,
.bpt_reg_info = gr_gm20b_bpt_reg_info,
.handle_fecs_error = gk20a_gr_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -532,6 +531,8 @@ static const struct gpu_ops gm20b_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -296,7 +296,6 @@ static const struct gpu_ops gp10b_ops = {
.set_sm_debug_mode = gr_gk20a_set_sm_debug_mode,
.bpt_reg_info = gr_gm20b_bpt_reg_info,
.handle_fecs_error = gr_gp10b_handle_fecs_error,
.handle_sm_exception = gr_gp10b_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -597,6 +596,8 @@ static const struct gpu_ops gp10b_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
gp10b_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -398,7 +398,6 @@ static const struct gpu_ops gv100_ops = {
.set_sm_debug_mode = gv11b_gr_set_sm_debug_mode,
.bpt_reg_info = gv11b_gr_bpt_reg_info,
.handle_fecs_error = gr_gv11b_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
@@ -739,6 +738,8 @@ static const struct gpu_ops gv100_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -369,7 +369,6 @@ static const struct gpu_ops gv11b_ops = {
.set_sm_debug_mode = gv11b_gr_set_sm_debug_mode,
.bpt_reg_info = gv11b_gr_bpt_reg_info,
.handle_fecs_error = gr_gv11b_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
@@ -716,6 +715,8 @@ static const struct gpu_ops gv11b_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -418,7 +418,6 @@ static const struct gpu_ops tu104_ops = {
.set_sm_debug_mode = gv11b_gr_set_sm_debug_mode,
.bpt_reg_info = gv11b_gr_bpt_reg_info,
.handle_fecs_error = gr_gv11b_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
@@ -769,6 +768,8 @@ static const struct gpu_ops tu104_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -365,10 +365,6 @@ struct gpu_ops {
u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int (*handle_sm_exception)(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, struct channel_gk20a *fault_ch);
@@ -845,6 +841,10 @@ struct gpu_ops {
void (*tpc_exception_sm_disable)(struct gk20a *g,
u32 offset);
void (*tpc_exception_sm_enable)(struct gk20a *g);
int (*handle_sm_exception)(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int (*stall_isr)(struct gk20a *g);
} intr;

View File

@@ -69,5 +69,8 @@ struct channel_gk20a *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
u32 curr_ctx, u32 *curr_tsgid);
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data, u32 error_notifier);
int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int nvgpu_gr_intr_stall_isr(struct gk20a *g);
#endif /* NVGPU_GR_INTR_H */