diff --git a/drivers/gpu/nvgpu/common/gr/gr_intr.c b/drivers/gpu/nvgpu/common/gr/gr_intr.c index b1e39df2e..f046df770 100644 --- a/drivers/gpu/nvgpu/common/gr/gr_intr.c +++ b/drivers/gpu/nvgpu/common/gr/gr_intr.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,7 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc, "GPC%d TPC%d: SM%d exception pending", gpc, tpc, sm); - tmp_ret = g->ops.gr.handle_sm_exception(g, + tmp_ret = g->ops.gr.intr.handle_sm_exception(g, gpc, tpc, sm, post_event, fault_ch, hww_global_esr); ret = (ret != 0) ? ret : tmp_ret; @@ -153,6 +154,48 @@ static int gr_intr_handle_class_error(struct gk20a *g, return -EINVAL; } +static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, + u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc) +{ + int ret; + struct gr_sm_mcerr_info err_info; + struct channel_gk20a *ch; + struct gr_err_info info; + u32 tsgid, chid, curr_ctx, inst = 0; + + if (g->ops.gr.err_ops.report_gr_err == NULL) { + return; + } + + tsgid = NVGPU_INVALID_TSG_ID; + curr_ctx = g->ops.gr.falcon.get_current_ctx(g); + ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid); + chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID; + if (ch != NULL) { + gk20a_channel_put(ch); + } + + (void) memset(&err_info, 0, sizeof(err_info)); + (void) memset(&info, 0, sizeof(info)); + err_info.curr_ctx = curr_ctx; + err_info.chid = chid; + err_info.tsgid = tsgid; + err_info.hww_warp_esr_pc = hww_warp_esr_pc; + err_info.hww_warp_esr_status = hww_warp_esr_status; + err_info.gpc = gpc; + err_info.tpc = tpc; + err_info.sm = sm; + info.sm_mcerr_info = &err_info; + ret = g->ops.gr.err_ops.report_gr_err(g, + NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR, + &info); + if (ret != 0) { + nvgpu_err(g, "failed to report SM_EXCEPTION " + "gpc=%u, tpc=%u, sm=%u, esr_status=%x", + gpc, tpc, sm, hww_warp_esr_status); + } +} + /* Used by sw interrupt thread to translate current ctx to chid. * Also used by regops to translate current ctx to chid and tsgid. * For performance, we don't want to go through 128 channels every time. @@ -295,6 +338,112 @@ void nvgpu_gr_intr_set_error_notifier(struct gk20a *g, } } +int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr) +{ + int ret = 0; + bool do_warp_sync = false, early_exit = false, ignore_debugger = false; + bool disable_sm_exceptions = true; + u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); + bool sm_debugger_attached; + u32 global_esr, warp_esr, global_mask; + u64 hww_warp_esr_pc = 0; + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); + + sm_debugger_attached = g->ops.gr.sm_debugger_attached(g); + + global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); + *hww_global_esr = global_esr; + warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); + global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g); + + if (!sm_debugger_attached) { + nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x", + global_esr, warp_esr); + return -EFAULT; + } + + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr); + + /* + * Check and report any fatal wrap errors. + */ + if ((global_esr & ~global_mask) != 0U) { + if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) { + hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g, + offset); + } + gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr, + hww_warp_esr_pc); + } + nvgpu_pg_elpg_protected_call(g, + g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch)); + + if (g->ops.gr.pre_process_sm_exception != NULL) { + ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm, + global_esr, warp_esr, + sm_debugger_attached, + fault_ch, + &early_exit, + &ignore_debugger); + if (ret != 0) { + nvgpu_err(g, "could not pre-process sm error!"); + return ret; + } + } + + if (early_exit) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "returning early"); + return ret; + } + + /* + * Disable forwarding of tpc exceptions, + * the debugger will reenable exceptions after servicing them. + * + * Do not disable exceptions if the only SM exception is BPT_INT + */ + if ((g->ops.gr.esr_bpt_pending_events(global_esr, + NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) { + disable_sm_exceptions = false; + } + + if (!ignore_debugger && disable_sm_exceptions) { + g->ops.gr.intr.tpc_exception_sm_disable(g, offset); + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "SM Exceptions disabled"); + } + + /* if a debugger is present and an error has occurred, do a warp sync */ + if (!ignore_debugger && + ((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) { + nvgpu_log(g, gpu_dbg_intr, "warp sync needed"); + do_warp_sync = true; + } + + if (do_warp_sync) { + ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm, + global_mask, true); + if (ret != 0) { + nvgpu_err(g, "sm did not lock down!"); + return ret; + } + } + + if (ignore_debugger) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "ignore_debugger set, skipping event posting"); + } else { + *post_event = true; + } + + return ret; +} + int nvgpu_gr_intr_handle_gpc_exception(struct gk20a *g, bool *post_event, struct nvgpu_gr_config *gr_config, struct channel_gk20a *fault_ch, u32 *hww_global_esr) diff --git a/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c index 2b80d1ed3..f09ce4313 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c @@ -139,7 +139,6 @@ static const struct gpu_ops vgpu_gp10b_ops = { .set_sm_debug_mode = vgpu_gr_set_sm_debug_mode, .bpt_reg_info = NULL, .handle_fecs_error = NULL, - .handle_sm_exception = NULL, .get_lrf_tex_ltc_dram_override = NULL, .update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode, diff --git a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c index cc39c94f0..391078283 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c @@ -165,7 +165,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .set_sm_debug_mode = vgpu_gr_set_sm_debug_mode, .bpt_reg_info = NULL, .handle_fecs_error = NULL, - .handle_sm_exception = NULL, .get_lrf_tex_ltc_dram_override = NULL, .update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode, diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 9b2121e6b..f5910b380 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include "gr_gk20a.h" @@ -67,51 +66,8 @@ #include "common/gr/gr_priv.h" -#include #include -static void nvgpu_report_gr_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, - u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc) -{ - int ret; - struct gr_sm_mcerr_info err_info; - struct channel_gk20a *ch; - struct gr_err_info info; - u32 tsgid, chid, curr_ctx, inst = 0; - - if (g->ops.gr.err_ops.report_gr_err == NULL) { - return; - } - - tsgid = NVGPU_INVALID_TSG_ID; - curr_ctx = g->ops.gr.falcon.get_current_ctx(g); - ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid); - chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID; - if (ch != NULL) { - gk20a_channel_put(ch); - } - - (void) memset(&err_info, 0, sizeof(err_info)); - (void) memset(&info, 0, sizeof(info)); - err_info.curr_ctx = curr_ctx; - err_info.chid = chid; - err_info.tsgid = tsgid; - err_info.hww_warp_esr_pc = hww_warp_esr_pc; - err_info.hww_warp_esr_status = hww_warp_esr_status; - err_info.gpc = gpc; - err_info.tpc = tpc; - err_info.sm = sm; - info.sm_mcerr_info = &err_info; - ret = g->ops.gr.err_ops.report_gr_err(g, - NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR, - &info); - if (ret != 0) { - nvgpu_err(g, "failed to report SM_EXCEPTION " - "gpc=%u, tpc=%u, sm=%u, esr_status=%x", - gpc, tpc, sm, hww_warp_esr_status); - } -} - static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid, u32 mailbox_value) { @@ -373,112 +329,6 @@ bool gk20a_gr_sm_debugger_attached(struct gk20a *g) return false; } -int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - bool *post_event, struct channel_gk20a *fault_ch, - u32 *hww_global_esr) -{ - int ret = 0; - bool do_warp_sync = false, early_exit = false, ignore_debugger = false; - bool disable_sm_exceptions = true; - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); - bool sm_debugger_attached; - u32 global_esr, warp_esr, global_mask; - u64 hww_warp_esr_pc = 0; - - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); - - sm_debugger_attached = g->ops.gr.sm_debugger_attached(g); - - global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); - *hww_global_esr = global_esr; - warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); - global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g); - - if (!sm_debugger_attached) { - nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x", - global_esr, warp_esr); - return -EFAULT; - } - - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr); - - /* - * Check and report any fatal wrap errors. - */ - if ((global_esr & ~global_mask) != 0U) { - if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) { - hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g, - offset); - } - nvgpu_report_gr_sm_exception(g, gpc, tpc, sm, warp_esr, - hww_warp_esr_pc); - } - nvgpu_pg_elpg_protected_call(g, - g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch)); - - if (g->ops.gr.pre_process_sm_exception != NULL) { - ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm, - global_esr, warp_esr, - sm_debugger_attached, - fault_ch, - &early_exit, - &ignore_debugger); - if (ret != 0) { - nvgpu_err(g, "could not pre-process sm error!"); - return ret; - } - } - - if (early_exit) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "returning early"); - return ret; - } - - /* - * Disable forwarding of tpc exceptions, - * the debugger will reenable exceptions after servicing them. - * - * Do not disable exceptions if the only SM exception is BPT_INT - */ - if ((g->ops.gr.esr_bpt_pending_events(global_esr, - NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) { - disable_sm_exceptions = false; - } - - if (!ignore_debugger && disable_sm_exceptions) { - g->ops.gr.intr.tpc_exception_sm_disable(g, offset); - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "SM Exceptions disabled"); - } - - /* if a debugger is present and an error has occurred, do a warp sync */ - if (!ignore_debugger && - ((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) { - nvgpu_log(g, gpu_dbg_intr, "warp sync needed"); - do_warp_sync = true; - } - - if (do_warp_sync) { - ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm, - global_mask, true); - if (ret != 0) { - nvgpu_err(g, "sm did not lock down!"); - return ret; - } - } - - if (ignore_debugger) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "ignore_debugger set, skipping event posting"); - } else { - *post_event = true; - } - - return ret; -} - void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, u32 *esr_sm_sel) { diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 7dd3488b3..bd63c65bd 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -96,9 +96,6 @@ void gk20a_gr_suspend_all_sms(struct gk20a *g, int gr_gk20a_set_sm_debug_mode(struct gk20a *g, struct channel_gk20a *ch, u64 sms, bool enable); bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch); -int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - bool *post_event, struct channel_gk20a *fault_ch, - u32 *hww_global_esr); #if defined(CONFIG_GK20A_CYCLE_STATS) int gr_gk20a_css_attach(struct channel_gk20a *ch, /* in - main hw structure */ diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index 0950ac523..da8205d13 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -57,164 +57,6 @@ #include #include -static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err, - u32 sed_status, - u32 ded_status, - u32 *count_to_adjust, - u32 opposite_count) -{ - u32 over_count = 0; - - sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b(); - ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b(); - - /* One overcount for each partition on which a SBE occurred but not a - DBE (or vice-versa) */ - if (single_err) { - over_count = (u32)hweight32(sed_status & ~ded_status); - } else { - over_count = (u32)hweight32(ded_status & ~sed_status); - } - - /* If both a SBE and a DBE occur on the same partition, then we have an - overcount for the subpartition if the opposite error counts are - zero. */ - if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) { - over_count += (u32)hweight32(sed_status & ded_status); - } - - if (*count_to_adjust > over_count) { - *count_to_adjust -= over_count; - } else { - *count_to_adjust = 0; - } -} - -int gr_gp10b_handle_sm_exception(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm, - bool *post_event, struct channel_gk20a *fault_ch, - u32 *hww_global_esr) -{ - int ret = 0; - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status; - u32 lrf_single_count_delta, lrf_double_count_delta; - u32 shm_ecc_status; - - ret = gr_gk20a_handle_sm_exception(g, - gpc, tpc, sm, post_event, fault_ch, hww_global_esr); - - /* Check for LRF ECC errors. */ - lrf_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); - lrf_ecc_sed_status = lrf_ecc_status & - (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f()); - lrf_ecc_ded_status = lrf_ecc_status & - (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f()); - lrf_single_count_delta = - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + - offset); - lrf_double_count_delta = - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + - offset); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset, - 0); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset, - 0); - if (lrf_ecc_sed_status != 0U) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Single bit error detected in SM LRF!"); - - gr_gp10b_sm_lrf_ecc_overcount_war(true, - lrf_ecc_sed_status, - lrf_ecc_ded_status, - &lrf_single_count_delta, - lrf_double_count_delta); - g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += - lrf_single_count_delta; - } - if (lrf_ecc_ded_status != 0U) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Double bit error detected in SM LRF!"); - - gr_gp10b_sm_lrf_ecc_overcount_war(false, - lrf_ecc_sed_status, - lrf_ecc_ded_status, - &lrf_double_count_delta, - lrf_single_count_delta); - g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += - lrf_double_count_delta; - } - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, - lrf_ecc_status); - - /* Check for SHM ECC errors. */ - shm_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset); - if ((shm_ecc_status & - gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U || - (shm_ecc_status & - gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) != 0U || - (shm_ecc_status & - gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) != 0U || - (shm_ecc_status & - gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) != 0U ) { - u32 ecc_stats_reg_val; - - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Single bit error detected in SM SHM!"); - - ecc_stats_reg_val = - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset); - g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter += - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val); - g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter += - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val); - ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() | - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m()); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset, - ecc_stats_reg_val); - } - if ((shm_ecc_status & - gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U || - (shm_ecc_status & - gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) != 0U) { - u32 ecc_stats_reg_val; - - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Double bit error detected in SM SHM!"); - - ecc_stats_reg_val = - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset); - g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter += - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val); - ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m()); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset, - ecc_stats_reg_val); - } - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset, - shm_ecc_status); - - - return ret; -} - void gr_gp10b_set_bes_crop_debug3(struct gk20a *g, u32 data) { u32 val; diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.h b/drivers/gpu/nvgpu/gp10b/gr_gp10b.h index 80bc8ec3f..fa33c8a1b 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.h +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.h @@ -43,11 +43,6 @@ int gr_gp10b_handle_fecs_error(struct gk20a *g, struct nvgpu_gr_isr_data *isr_data); int gr_gp10b_set_cilp_preempt_pending(struct gk20a *g, struct channel_gk20a *fault_ch); - -int gr_gp10b_handle_sm_exception(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm, - bool *post_event, struct channel_gk20a *fault_ch, - u32 *hww_global_esr); int gr_gp10b_commit_global_cb_manager(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, bool patch); void gr_gp10b_set_bes_crop_debug3(struct gk20a *g, u32 data); diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c index 37d36c905..03c237da1 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c @@ -25,6 +25,8 @@ #include #include +#include +#include #include "gr_intr_gp10b.h" @@ -103,12 +105,165 @@ fail: return -EINVAL; } +static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err, + u32 sed_status, + u32 ded_status, + u32 *count_to_adjust, + u32 opposite_count) +{ + u32 over_count = 0; + + sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b(); + ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b(); + + /* One overcount for each partition on which a SBE occurred but not a + DBE (or vice-versa) */ + if (single_err) { + over_count = (u32)hweight32(sed_status & ~ded_status); + } else { + over_count = (u32)hweight32(ded_status & ~sed_status); + } + + /* If both a SBE and a DBE occur on the same partition, then we have an + overcount for the subpartition if the opposite error counts are + zero. */ + if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) { + over_count += (u32)hweight32(sed_status & ded_status); + } + + if (*count_to_adjust > over_count) { + *count_to_adjust -= over_count; + } else { + *count_to_adjust = 0; + } +} + +int gp10b_gr_intr_handle_sm_exception(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr) +{ + int ret = 0; + u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); + u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status; + u32 lrf_single_count_delta, lrf_double_count_delta; + u32 shm_ecc_status; + + ret = nvgpu_gr_intr_handle_sm_exception(g, + gpc, tpc, sm, post_event, fault_ch, hww_global_esr); + + /* Check for LRF ECC errors. */ + lrf_ecc_status = nvgpu_readl(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); + lrf_ecc_sed_status = + lrf_ecc_status & + (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f()); + lrf_ecc_ded_status = + lrf_ecc_status & + (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f()); + lrf_single_count_delta = + nvgpu_readl(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + + offset); + lrf_double_count_delta = + nvgpu_readl(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + + offset); + nvgpu_writel(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset, 0); + nvgpu_writel(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset, 0); + if (lrf_ecc_sed_status != 0U) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Single bit error detected in SM LRF!"); + + gr_gp10b_sm_lrf_ecc_overcount_war(true, + lrf_ecc_sed_status, + lrf_ecc_ded_status, + &lrf_single_count_delta, + lrf_double_count_delta); + g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += + lrf_single_count_delta; + } + if (lrf_ecc_ded_status != 0U) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Double bit error detected in SM LRF!"); + + gr_gp10b_sm_lrf_ecc_overcount_war(false, + lrf_ecc_sed_status, + lrf_ecc_ded_status, + &lrf_double_count_delta, + lrf_single_count_delta); + g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += + lrf_double_count_delta; + } + nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, + lrf_ecc_status); + + /* Check for SHM ECC errors. */ + shm_ecc_status = nvgpu_readl(g, + gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset); + if ((shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) != 0U || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) != 0U || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) != 0U ) { + u32 ecc_stats_reg_val; + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Single bit error detected in SM SHM!"); + + ecc_stats_reg_val = + nvgpu_readl(g, + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset); + g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter += + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val); + g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter += + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val); + ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() | + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m()); + nvgpu_writel(g, + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset, + ecc_stats_reg_val); + } + if ((shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) != 0U) { + u32 ecc_stats_reg_val; + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Double bit error detected in SM SHM!"); + + ecc_stats_reg_val = + nvgpu_readl(g, + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset); + g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter += + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val); + ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m()); + nvgpu_writel(g, + gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset, + ecc_stats_reg_val); + } + nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset, + shm_ecc_status); + + + return ret; +} + void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc) { - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, - GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; + u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); u32 esr; u32 ecc_stats_reg_val; diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.h index c2204044c..d14276421 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.h +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.h @@ -26,6 +26,7 @@ #include struct gk20a; +struct channel_gk20a; #define NVC097_SET_GO_IDLE_TIMEOUT 0x022cU #define NVC097_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dcU @@ -43,4 +44,8 @@ void gp10b_gr_intr_set_go_idle_timeout(struct gk20a *g, u32 data); void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc); int gp10b_gr_intr_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); +int gp10b_gr_intr_handle_sm_exception(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr); #endif /* NVGPU_GR_INTR_GP10B_H */ diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 5576a22ee..bc73b07ba 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -265,7 +265,6 @@ static const struct gpu_ops gm20b_ops = { .set_sm_debug_mode = gr_gk20a_set_sm_debug_mode, .bpt_reg_info = gr_gm20b_bpt_reg_info, .handle_fecs_error = gk20a_gr_handle_fecs_error, - .handle_sm_exception = gr_gk20a_handle_sm_exception, .get_lrf_tex_ltc_dram_override = NULL, .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, @@ -532,6 +531,8 @@ static const struct gpu_ops gm20b_ops = { gm20ab_gr_intr_tpc_exception_sm_enable, .tpc_exception_sm_disable = gm20ab_gr_intr_tpc_exception_sm_disable, + .handle_sm_exception = + nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, }, .falcon = { diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index 7b1a29481..d39f4cb04 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -296,7 +296,6 @@ static const struct gpu_ops gp10b_ops = { .set_sm_debug_mode = gr_gk20a_set_sm_debug_mode, .bpt_reg_info = gr_gm20b_bpt_reg_info, .handle_fecs_error = gr_gp10b_handle_fecs_error, - .handle_sm_exception = gr_gp10b_handle_sm_exception, .get_lrf_tex_ltc_dram_override = get_ecc_override_val, .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, @@ -597,6 +596,8 @@ static const struct gpu_ops gp10b_ops = { gm20ab_gr_intr_tpc_exception_sm_enable, .tpc_exception_sm_disable = gm20ab_gr_intr_tpc_exception_sm_disable, + .handle_sm_exception = + gp10b_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, }, .falcon = { diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv100.c b/drivers/gpu/nvgpu/hal/init/hal_gv100.c index eed1038ae..7016a7001 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv100.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv100.c @@ -398,7 +398,6 @@ static const struct gpu_ops gv100_ops = { .set_sm_debug_mode = gv11b_gr_set_sm_debug_mode, .bpt_reg_info = gv11b_gr_bpt_reg_info, .handle_fecs_error = gr_gv11b_handle_fecs_error, - .handle_sm_exception = gr_gk20a_handle_sm_exception, .get_lrf_tex_ltc_dram_override = get_ecc_override_val, .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon, @@ -739,6 +738,8 @@ static const struct gpu_ops gv100_ops = { gm20ab_gr_intr_tpc_exception_sm_enable, .tpc_exception_sm_disable = gm20ab_gr_intr_tpc_exception_sm_disable, + .handle_sm_exception = + nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, }, .falcon = { diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 373dbe0dd..86c67ebfd 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -369,7 +369,6 @@ static const struct gpu_ops gv11b_ops = { .set_sm_debug_mode = gv11b_gr_set_sm_debug_mode, .bpt_reg_info = gv11b_gr_bpt_reg_info, .handle_fecs_error = gr_gv11b_handle_fecs_error, - .handle_sm_exception = gr_gk20a_handle_sm_exception, .get_lrf_tex_ltc_dram_override = get_ecc_override_val, .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon, @@ -716,6 +715,8 @@ static const struct gpu_ops gv11b_ops = { gm20ab_gr_intr_tpc_exception_sm_enable, .tpc_exception_sm_disable = gm20ab_gr_intr_tpc_exception_sm_disable, + .handle_sm_exception = + nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, }, .falcon = { diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index 2659d384e..a53ec1bfe 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -418,7 +418,6 @@ static const struct gpu_ops tu104_ops = { .set_sm_debug_mode = gv11b_gr_set_sm_debug_mode, .bpt_reg_info = gv11b_gr_bpt_reg_info, .handle_fecs_error = gr_gv11b_handle_fecs_error, - .handle_sm_exception = gr_gk20a_handle_sm_exception, .get_lrf_tex_ltc_dram_override = get_ecc_override_val, .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon, @@ -769,6 +768,8 @@ static const struct gpu_ops tu104_ops = { gm20ab_gr_intr_tpc_exception_sm_enable, .tpc_exception_sm_disable = gm20ab_gr_intr_tpc_exception_sm_disable, + .handle_sm_exception = + nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, }, .falcon = { diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 85094c3ac..84032e38b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -365,10 +365,6 @@ struct gpu_ops { u32 gpc, u32 tpc, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr); - int (*handle_sm_exception)(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm, - bool *post_event, struct channel_gk20a *fault_ch, - u32 *hww_global_esr); u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g); int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, struct channel_gk20a *fault_ch); @@ -845,6 +841,10 @@ struct gpu_ops { void (*tpc_exception_sm_disable)(struct gk20a *g, u32 offset); void (*tpc_exception_sm_enable)(struct gk20a *g); + int (*handle_sm_exception)(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr); int (*stall_isr)(struct gk20a *g); } intr; diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h b/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h index ff90c47ae..9c7679f12 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h @@ -69,5 +69,8 @@ struct channel_gk20a *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g, u32 curr_ctx, u32 *curr_tsgid); void nvgpu_gr_intr_set_error_notifier(struct gk20a *g, struct nvgpu_gr_isr_data *isr_data, u32 error_notifier); +int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr); int nvgpu_gr_intr_stall_isr(struct gk20a *g); #endif /* NVGPU_GR_INTR_H */