From 0908547ad212adfbba806037c1278c4865aa38eb Mon Sep 17 00:00:00 2001 From: Deepak Nibade Date: Tue, 28 May 2019 17:47:59 +0530 Subject: [PATCH] gpu: nvgpu: move some interrupt hals to hal.gr.intr unit Move some interrupt handling hals from hal.gr.gr unit to hal.gr.intr unit as below g->ops.gr.intr.set_hww_esr_report_mask() g->ops.gr.intr.handle_tpc_sm_ecc_exception() g->ops.gr.intr.get_esr_sm_sel() g->ops.gr.intr.clear_sm_hww() g->ops.gr.intr.handle_ssync_hww() g->ops.gr.intr.log_mme_exception() g->ops.gr.intr.record_sm_error_state() g->ops.gr.intr.get_sm_hww_global_esr() g->ops.gr.intr.get_sm_hww_warp_esr() g->ops.gr.intr.get_sm_no_lock_down_hww_global_esr_mask() g->ops.gr.intr.get_sm_hww_warp_esr_pc() g->ops.gr.intr.tpc_enabled_exceptions() g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val() Rename gv11b_gr_sm_offset() to nvgpu_gr_sm_offset() and move to common.gr.gr unit All of above functions and hals will be needed in safety build Jira NVGPU-3506 Change-Id: I278d528e4b6176b62ff44eb39ef18ef28d37c401 Signed-off-by: Deepak Nibade Reviewed-on: https://git-master.nvidia.com/r/2127753 Reviewed-by: svc-mobile-coverity GVS: Gerrit_Virtual_Submit Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/gr/gr.c | 11 +- drivers/gpu/nvgpu/common/gr/gr_intr.c | 24 +- .../nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c | 13 +- .../nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c | 16 +- drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c | 77 +- drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h | 6 - drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c | 97 -- drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.h | 5 - drivers/gpu/nvgpu/hal/gr/gr/gr_gp10b.c | 4 +- drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c | 831 +----------------- drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.h | 20 - drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.c | 75 -- drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.h | 1 - drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c | 189 +++- drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.h | 15 + drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c | 819 ++++++++++++++++- drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h | 22 + drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c | 75 ++ drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.h | 1 + drivers/gpu/nvgpu/hal/init/hal_gm20b.c | 25 +- drivers/gpu/nvgpu/hal/init/hal_gp10b.c | 25 +- drivers/gpu/nvgpu/hal/init/hal_gv11b.c | 37 +- drivers/gpu/nvgpu/hal/init/hal_tu104.c | 34 +- drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 45 +- drivers/gpu/nvgpu/include/nvgpu/gr/gr.h | 1 + drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | 2 +- 26 files changed, 1261 insertions(+), 1209 deletions(-) diff --git a/drivers/gpu/nvgpu/common/gr/gr.c b/drivers/gpu/nvgpu/common/gr/gr.c index bf1235ba6..5f3ec6f7a 100644 --- a/drivers/gpu/nvgpu/common/gr/gr.c +++ b/drivers/gpu/nvgpu/common/gr/gr.c @@ -135,6 +135,15 @@ u32 nvgpu_gr_tpc_offset(struct gk20a *g, u32 tpc) return tpc_offset; } +u32 nvgpu_gr_sm_offset(struct gk20a *g, u32 sm) +{ + + u32 sm_pri_stride = nvgpu_get_litter_value(g, GPU_LIT_SM_PRI_STRIDE); + u32 sm_offset = nvgpu_safe_mult_u32(sm_pri_stride, sm); + + return sm_offset; +} + void nvgpu_gr_init(struct gk20a *g) { (void)nvgpu_cond_init(&g->gr->init_wq); @@ -206,7 +215,7 @@ static int gr_init_setup_hw(struct gk20a *g) g->ops.gr.falcon.fecs_host_int_enable(g); g->ops.gr.intr.enable_hww_exceptions(g); - g->ops.gr.set_hww_esr_report_mask(g); + g->ops.gr.intr.set_hww_esr_report_mask(g); /* enable TPC exceptions per GPC */ if (g->ops.gr.intr.enable_gpc_exceptions != NULL) { diff --git a/drivers/gpu/nvgpu/common/gr/gr_intr.c 
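Note: the relocated nvgpu_gr_sm_offset() helper added above composes with the existing GPC/TPC offset helpers, exactly as the gv11b call sites further down in this patch use it. A minimal sketch, using only names already present in this patch (the wrapper function itself is hypothetical):

    /* Read one SM's HWW global ESR at its unicast priv offset. */
    static u32 sketch_read_sm_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
    {
        u32 offset = nvgpu_gr_gpc_offset(g, gpc) +
                     nvgpu_gr_tpc_offset(g, tpc) +
                     nvgpu_gr_sm_offset(g, sm);

        return nvgpu_readl(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
    }
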
b/drivers/gpu/nvgpu/common/gr/gr_intr.c index 02e6a3f9d..fd913c18f 100644 --- a/drivers/gpu/nvgpu/common/gr/gr_intr.c +++ b/drivers/gpu/nvgpu/common/gr/gr_intr.c @@ -79,12 +79,12 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc, nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d: SM exception pending", gpc, tpc); - if (g->ops.gr.handle_tpc_sm_ecc_exception != NULL) { - g->ops.gr.handle_tpc_sm_ecc_exception(g, gpc, tpc, + if (g->ops.gr.intr.handle_tpc_sm_ecc_exception != NULL) { + g->ops.gr.intr.handle_tpc_sm_ecc_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); } - g->ops.gr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel); + g->ops.gr.intr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel); for (sm = 0; sm < sm_per_tpc; sm++) { @@ -105,7 +105,7 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc, * exceptions to be cleared. Should be cleared * only if SM is locked down or empty. */ - g->ops.gr.clear_sm_hww(g, + g->ops.gr.intr.clear_sm_hww(g, gpc, tpc, sm, *hww_global_esr); } @@ -359,10 +359,10 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, sm_debugger_attached = g->ops.gr.sm_debugger_attached(g); - global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); + global_esr = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm); *hww_global_esr = global_esr; - warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); - global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g); + warp_esr = g->ops.gr.intr.get_sm_hww_warp_esr(g, gpc, tpc, sm); + global_mask = g->ops.gr.intr.get_sm_no_lock_down_hww_global_esr_mask(g); if (!sm_debugger_attached) { nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x", @@ -377,15 +377,15 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, * Check and report any fatal wrap errors. */ if ((global_esr & ~global_mask) != 0U) { - if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) { - hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g, + if (g->ops.gr.intr.get_sm_hww_warp_esr_pc != NULL) { + hww_warp_esr_pc = g->ops.gr.intr.get_sm_hww_warp_esr_pc(g, offset); } gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr, hww_warp_esr_pc); } nvgpu_pg_elpg_protected_call(g, - g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch)); + g->ops.gr.intr.record_sm_error_state(g, gpc, tpc, sm, fault_ch)); if (g->ops.gr.pre_process_sm_exception != NULL) { ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm, @@ -498,9 +498,9 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch, * The mailbox values may vary across chips hence keeping it * as a HAL. 
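nvgpu_gr_intr_handle_sm_exception() above now pulls both the global ESR and the no-lock-down mask through the gr.intr ops. The decision it makes can be condensed to the following sketch (assuming the intr ops have been populated by the chip's HAL init; variable names follow the function above):

    u32 global_mask = g->ops.gr.intr.get_sm_no_lock_down_hww_global_esr_mask(g);
    u32 global_esr  = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm);

    if ((global_esr & ~global_mask) != 0U) {
        /* fatal HWW bits set: report the SM exception and lock the SM down */
    } else {
        /*
         * only bpt_int / bpt_pause / single_step_complete style bits:
         * non-fatal, left to the attached SM debugger to service
         */
    }
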
*/ - if ((g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val != NULL) + if ((g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val != NULL) && (mailbox_value == - g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val())) { + g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val())) { gr_intr_report_ctxsw_error(g, GPU_FECS_CTXSW_CRC_MISMATCH, diff --git a/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c index 6d75a35b0..6e046d39f 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c @@ -63,6 +63,7 @@ #include "hal/gr/fecs_trace/fecs_trace_gm20b.h" #include "hal/gr/init/gr_init_gm20b.h" #include "hal/gr/init/gr_init_gp10b.h" +#include "hal/gr/intr/gr_intr_gm20b.h" #include "hal/gr/config/gr_config_gm20b.h" #include "hal/gr/ctxsw_prog/ctxsw_prog_gm20b.h" #include "hal/gr/ctxsw_prog/ctxsw_prog_gp10b.h" @@ -136,7 +137,6 @@ static const struct gpu_ops vgpu_gp10b_ops = { .set_circular_buffer_size = NULL, .get_sm_dsm_perf_regs = gr_gm20b_get_sm_dsm_perf_regs, .get_sm_dsm_perf_ctrl_regs = gr_gm20b_get_sm_dsm_perf_ctrl_regs, - .set_hww_esr_report_mask = NULL, .set_gpc_tpc_mask = NULL, .is_tpc_addr = gr_gm20b_is_tpc_addr, .get_tpc_num = gr_gm20b_get_tpc_num, @@ -149,7 +149,6 @@ static const struct gpu_ops vgpu_gp10b_ops = { .get_lrf_tex_ltc_dram_override = NULL, .update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode, - .record_sm_error_state = gm20b_gr_record_sm_error_state, .clear_sm_error_state = vgpu_gr_clear_sm_error_state, .suspend_contexts = vgpu_gr_suspend_contexts, .resume_contexts = vgpu_gr_resume_contexts, @@ -157,21 +156,13 @@ static const struct gpu_ops vgpu_gp10b_ops = { .wait_for_pause = gr_gk20a_wait_for_pause, .resume_from_pause = NULL, .clear_sm_errors = gr_gk20a_clear_sm_errors, - .tpc_enabled_exceptions = NULL, - .get_esr_sm_sel = gk20a_gr_get_esr_sm_sel, .sm_debugger_attached = NULL, .suspend_single_sm = NULL, .suspend_all_sms = NULL, .resume_single_sm = NULL, .resume_all_sms = NULL, - .get_sm_hww_warp_esr = NULL, - .get_sm_hww_global_esr = NULL, - .get_sm_hww_warp_esr_pc = NULL, - .get_sm_no_lock_down_hww_global_esr_mask = - gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask, .lock_down_sm = NULL, .wait_for_sm_lock_down = NULL, - .clear_sm_hww = NULL, .init_ovr_sm_dsm_perf = gk20a_gr_init_ovr_sm_dsm_perf, .get_ovr_perf_regs = gk20a_gr_get_ovr_perf_regs, .set_boosted_ctx = NULL, @@ -373,6 +364,8 @@ static const struct gpu_ops vgpu_gp10b_ops = { .intr = { .flush_channel_tlb = nvgpu_gr_intr_flush_channel_tlb, + .get_sm_no_lock_down_hww_global_esr_mask = + gm20b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask, }, }, .gpu_class = { diff --git a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c index ea07e374f..d98598324 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c @@ -164,7 +164,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .set_circular_buffer_size = NULL, .get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs, .get_sm_dsm_perf_ctrl_regs = gv11b_gr_get_sm_dsm_perf_ctrl_regs, - .set_hww_esr_report_mask = NULL, .set_gpc_tpc_mask = NULL, .is_tpc_addr = gr_gm20b_is_tpc_addr, .get_tpc_num = gr_gm20b_get_tpc_num, @@ -177,7 +176,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .get_lrf_tex_ltc_dram_override = NULL, .update_smpc_ctxsw_mode = 
vgpu_gr_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode, - .record_sm_error_state = gv11b_gr_record_sm_error_state, .clear_sm_error_state = vgpu_gr_clear_sm_error_state, .suspend_contexts = vgpu_gr_suspend_contexts, .resume_contexts = vgpu_gr_resume_contexts, @@ -185,21 +183,13 @@ static const struct gpu_ops vgpu_gv11b_ops = { .wait_for_pause = gr_gk20a_wait_for_pause, .resume_from_pause = NULL, .clear_sm_errors = gr_gk20a_clear_sm_errors, - .tpc_enabled_exceptions = vgpu_gr_gk20a_tpc_enabled_exceptions, - .get_esr_sm_sel = gv11b_gr_get_esr_sm_sel, .sm_debugger_attached = NULL, .suspend_single_sm = NULL, .suspend_all_sms = NULL, .resume_single_sm = NULL, .resume_all_sms = NULL, - .get_sm_hww_warp_esr = NULL, - .get_sm_hww_global_esr = NULL, - .get_sm_hww_warp_esr_pc = NULL, - .get_sm_no_lock_down_hww_global_esr_mask = - gv11b_gr_get_sm_no_lock_down_hww_global_esr_mask, .lock_down_sm = NULL, .wait_for_sm_lock_down = NULL, - .clear_sm_hww = NULL, .init_ovr_sm_dsm_perf = gv11b_gr_init_ovr_sm_dsm_perf, .get_ovr_perf_regs = gv11b_gr_get_ovr_perf_regs, .set_boosted_ctx = NULL, @@ -212,8 +202,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, - .handle_tpc_sm_ecc_exception = - gr_gv11b_handle_tpc_sm_ecc_exception, .decode_egpc_addr = gv11b_gr_decode_egpc_addr, .decode_priv_addr = gr_gv11b_decode_priv_addr, .create_priv_addr_table = gr_gv11b_create_priv_addr_table, @@ -438,6 +426,10 @@ static const struct gpu_ops vgpu_gv11b_ops = { gv11b_gr_intr_handle_tpc_mpc_exception, .handle_tex_exception = NULL, .flush_channel_tlb = nvgpu_gr_intr_flush_channel_tlb, + .get_sm_no_lock_down_hww_global_esr_mask = + gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask, + .tpc_enabled_exceptions = + vgpu_gr_gk20a_tpc_enabled_exceptions, }, }, .gpu_class = { diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c index 640e0cda4..ce773a85d 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.c @@ -248,12 +248,6 @@ bool gk20a_gr_sm_debugger_attached(struct gk20a *g) return false; } -void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, - u32 *esr_sm_sel) -{ - *esr_sm_sel = 1; -} - static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 addr, bool is_quad, u32 quad, @@ -1741,12 +1735,12 @@ int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, /* wait for the sm to lock down */ do { - u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g, + u32 global_esr = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm); dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r() + offset); - warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); + warp_esr = g->ops.gr.intr.get_sm_hww_warp_esr(g, gpc, tpc, sm); locked_down = (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) == @@ -2167,7 +2161,7 @@ int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state) * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp * masks. 
*/ - global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g); + global_mask = g->ops.gr.intr.get_sm_no_lock_down_hww_global_esr_mask(g); /* Lock down all SMs */ for (sm_id = 0; sm_id < no_of_sm; sm_id++) { @@ -2225,13 +2219,13 @@ int gr_gk20a_clear_sm_errors(struct gk20a *g) tpc++) { for (sm = 0; sm < sm_per_tpc; sm++) { - global_esr = g->ops.gr.get_sm_hww_global_esr(g, + global_esr = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm); /* clearing hwws, also causes tpc and gpc * exceptions to be cleared */ - g->ops.gr.clear_sm_hww(g, + g->ops.gr.intr.clear_sm_hww(g, gpc, tpc, sm, global_esr); } } @@ -2240,64 +2234,3 @@ int gr_gk20a_clear_sm_errors(struct gk20a *g) return ret; } -u64 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g) -{ - u32 sm_id; - u64 tpc_exception_en = 0; - u32 offset, regval, tpc_offset, gpc_offset; - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g); - - for (sm_id = 0; sm_id < no_of_sm; sm_id++) { - struct nvgpu_sm_info *sm_info = - nvgpu_gr_config_get_sm_info(g->gr->config, sm_id); - tpc_offset = tpc_in_gpc_stride * - nvgpu_gr_config_get_sm_info_tpc_index(sm_info); - gpc_offset = gpc_stride * - nvgpu_gr_config_get_sm_info_gpc_index(sm_info); - offset = tpc_offset + gpc_offset; - - regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + - offset); - /* Each bit represents corresponding enablement state, bit 0 corrsponds to SM0 */ - tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id; - } - - return tpc_exception_en; -} - -u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm) -{ - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); - u32 hww_warp_esr = gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset); - return hww_warp_esr; -} - -u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm) -{ - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); - - u32 hww_global_esr = gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_global_esr_r() + offset); - - return hww_global_esr; -} - -u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g) -{ - /* - * These three interrupts don't require locking down the SM. They can - * be handled by usermode clients as they aren't fatal. Additionally, - * usermode clients may wish to allow some warps to execute while others - * are at breakpoints, as opposed to fatal errors where all warps should - * halt. 
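The gr_gk20a_tpc_enabled_exceptions() implementation removed above (re-homed under gr.intr by this patch) returns a bitmap with one bit per SM, bit 0 corresponding to SM0. A consumer, e.g. the ctrl ioctl touched in the diffstat, can test a single SM as sketched below (sketch only; the ioctl plumbing is not part of this hunk):

    u64 tpc_exception_en = g->ops.gr.intr.tpc_enabled_exceptions(g);

    if ((tpc_exception_en & (1ULL << sm_id)) != 0ULL) {
        /* TPC exceptions are enabled for the SM indexed by sm_id */
    }
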
- */ - u32 global_esr_mask = - gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() | - gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() | - gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f(); - - return global_esr_mask; -} diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h index 16b3ff011..9849cc446 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gk20a.h @@ -36,7 +36,6 @@ enum ctxsw_addr_type; /* sm */ bool gk20a_gr_sm_debugger_attached(struct gk20a *g); -u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g); int gr_gk20a_exec_ctx_ops(struct nvgpu_channel *ch, struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops, u32 num_ctx_wr_ops, u32 num_ctx_rd_ops, @@ -71,8 +70,6 @@ int gk20a_gr_lock_down_sm(struct gk20a *g, bool check_errors); int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors); -u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm); -u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm); bool gr_gk20a_suspend_context(struct nvgpu_channel *ch); bool gr_gk20a_resume_context(struct nvgpu_channel *ch); int gr_gk20a_suspend_contexts(struct gk20a *g, @@ -85,9 +82,6 @@ int gr_gk20a_trigger_suspend(struct gk20a *g); int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state); int gr_gk20a_resume_from_pause(struct gk20a *g); int gr_gk20a_clear_sm_errors(struct gk20a *g); -u64 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g); -void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, - u32 *esr_sm_sel); void gk20a_gr_init_ovr_sm_dsm_perf(void); void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs, u32 **ovr_perf_regs); diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c index 4a46a5f14..080956774 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.c @@ -171,40 +171,6 @@ void gr_gm20b_set_circular_buffer_size(struct gk20a *g, u32 data) } } -void gr_gm20b_set_hww_esr_report_mask(struct gk20a *g) -{ - /* setup sm warp esr report masks */ - gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() | - 
gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_mmu_fault_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_overflow_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() | - gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f()); - - /* setup sm global esr report mask */ - gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), - gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() | - gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f()); -} - - /* Following are the blocks of registers that the ucode stores in the extended region.*/ /* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */ @@ -562,57 +528,6 @@ void gr_gm20b_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state) } } -static void gm20b_gr_read_sm_error_state(struct gk20a *g, - u32 offset, - struct nvgpu_tsg_sm_error_state *sm_error_states) -{ - sm_error_states->hww_global_esr = gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_global_esr_r() + offset); - sm_error_states->hww_warp_esr = gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset); - sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset)); - sm_error_states->hww_global_esr_report_mask = gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset); - sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g, - gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset); - -} - -int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - struct nvgpu_channel *fault_ch) -{ - int sm_id; - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, - GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - struct nvgpu_tsg_sm_error_state *sm_error_states = NULL; - struct nvgpu_tsg *tsg = NULL; - - nvgpu_mutex_acquire(&g->dbg_sessions_lock); - - sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v(gk20a_readl(g, - gr_gpc0_tpc0_sm_cfg_r() + offset)); - - if (fault_ch != NULL) { - tsg = nvgpu_tsg_from_ch(fault_ch); - } - - if (tsg == NULL) { - nvgpu_err(g, "no valid tsg"); - goto record_fail; - } - - sm_error_states = tsg->sm_error_states + sm_id; - gm20b_gr_read_sm_error_state(g, offset, sm_error_states); - -record_fail: - nvgpu_mutex_release(&g->dbg_sessions_lock); - - return sm_id; -} - int gm20b_gr_clear_sm_error_state(struct gk20a *g, struct nvgpu_channel *ch, u32 sm_id) { @@ -662,18 +577,6 @@ fail: return err; } -void gm20b_gr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - u32 global_esr) -{ - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); - - gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset, - global_esr); - - /* clear the warp hww */ - gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset, 0); -} - int gm20b_gr_set_mmu_debug_mode(struct gk20a *g, struct nvgpu_channel *ch, bool enable) { diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.h b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.h index 63355f706..f85237451 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.h +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gm20b.h @@ -37,7 +37,6 @@ int gr_gm20b_commit_global_cb_manager(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, bool patch); void gr_gm20b_set_alpha_circular_buffer_size(struct gk20a *g, u32 data); void gr_gm20b_set_circular_buffer_size(struct gk20a *g, u32 data); -void gr_gm20b_set_hww_esr_report_mask(struct 
gk20a *g); void gr_gm20b_init_sm_dsm_reg_info(void); void gr_gm20b_get_sm_dsm_perf_regs(struct gk20a *g, u32 *num_sm_dsm_perf_regs, @@ -56,12 +55,8 @@ int gr_gm20b_update_pc_sampling(struct nvgpu_channel *c, bool enable); void gr_gm20b_init_cyclestats(struct gk20a *g); void gr_gm20b_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state); -int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, - u32 tpc, u32 sm, struct nvgpu_channel *fault_ch); int gm20b_gr_clear_sm_error_state(struct gk20a *g, struct nvgpu_channel *ch, u32 sm_id); -void gm20b_gr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - u32 global_esr); u32 gr_gm20b_get_pmm_per_chiplet_offset(void); void gm20b_gr_set_debug_mode(struct gk20a *g, bool enable); int gm20b_gr_set_mmu_debug_mode(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gp10b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gp10b.c index 628b58add..c3c958f6e 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gp10b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gp10b.c @@ -554,9 +554,9 @@ int gr_gp10b_pre_process_sm_exception(struct gk20a *g, } /* reset the HWW errors after locking down */ - global_esr_copy = g->ops.gr.get_sm_hww_global_esr(g, + global_esr_copy = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm); - g->ops.gr.clear_sm_hww(g, + g->ops.gr.intr.clear_sm_hww(g, gpc, tpc, sm, global_esr_copy); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "CILP: HWWs cleared for gpc %d tpc %d\n", diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c index 6be068e2e..4878b6520 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c @@ -61,11 +61,6 @@ #define PRI_BROADCAST_FLAGS_SMPC BIT32(17) -u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void) -{ - return gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(); -} - void gr_gv11b_powergate_tpc(struct gk20a *g) { u32 tpc_pg_status = g->ops.fuse.fuse_status_opt_tpc_gpc(g, 0); @@ -83,603 +78,6 @@ void gr_gv11b_powergate_tpc(struct gk20a *g) return; } -u32 gv11b_gr_sm_offset(struct gk20a *g, u32 sm) -{ - - u32 sm_pri_stride = nvgpu_get_litter_value(g, GPU_LIT_SM_PRI_STRIDE); - u32 sm_offset = sm_pri_stride * sm; - - return sm_offset; -} - -static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr) -{ - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - u32 l1_tag_ecc_status, l1_tag_ecc_corrected_err_status = 0; - u32 l1_tag_ecc_uncorrected_err_status = 0; - u32 l1_tag_corrected_err_count_delta = 0; - u32 l1_tag_uncorrected_err_count_delta = 0; - bool is_l1_tag_ecc_corrected_total_err_overflow = false; - bool is_l1_tag_ecc_uncorrected_total_err_overflow = false; - - /* Check for L1 tag ECC errors. 
*/ - l1_tag_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset); - l1_tag_ecc_corrected_err_status = l1_tag_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()); - l1_tag_ecc_uncorrected_err_status = l1_tag_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()); - - if ((l1_tag_ecc_corrected_err_status == 0U) && (l1_tag_ecc_uncorrected_err_status == 0U)) { - return; - } - - l1_tag_corrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + - offset)); - l1_tag_uncorrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + - offset)); - is_l1_tag_ecc_corrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(l1_tag_ecc_status) != 0U; - is_l1_tag_ecc_uncorrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(l1_tag_ecc_status) != 0U; - - if ((l1_tag_corrected_err_count_delta > 0U) || is_l1_tag_ecc_corrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "corrected error (SBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]", - l1_tag_ecc_corrected_err_status, is_l1_tag_ecc_corrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_l1_tag_ecc_corrected_total_err_overflow) { - l1_tag_corrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s()); - } - g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += - l1_tag_corrected_err_count_delta; - if ((l1_tag_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_TAG_ECC_CORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); - } - if ((l1_tag_ecc_status & - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); - } - if ((l1_tag_ecc_status & - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); - } - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, - 0); - } - if ((l1_tag_uncorrected_err_count_delta > 0U) || is_l1_tag_ecc_uncorrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Uncorrected error (DBE) detected in SM L1 tag! 
err_mask [%08x] is_overf [%d]", - l1_tag_ecc_uncorrected_err_status, is_l1_tag_ecc_uncorrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_l1_tag_ecc_uncorrected_total_err_overflow) { - l1_tag_uncorrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s()); - } - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += - l1_tag_uncorrected_err_count_delta; - if ((l1_tag_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); - } - if ((l1_tag_ecc_status & - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); - } - if ((l1_tag_ecc_status & - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); - } - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, - 0); - } - - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset, - gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f()); -} - -static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr) -{ - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - u32 lrf_ecc_status, lrf_ecc_corrected_err_status = 0; - u32 lrf_ecc_uncorrected_err_status = 0; - u32 lrf_corrected_err_count_delta = 0; - u32 lrf_uncorrected_err_count_delta = 0; - bool is_lrf_ecc_corrected_total_err_overflow = false; - bool is_lrf_ecc_uncorrected_total_err_overflow = false; - - /* Check for LRF ECC errors. 
*/ - lrf_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); - lrf_ecc_corrected_err_status = lrf_ecc_status & - (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp4_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp5_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp6_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp7_m()); - lrf_ecc_uncorrected_err_status = lrf_ecc_status & - (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp4_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp5_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp6_m() | - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp7_m()); - - if ((lrf_ecc_corrected_err_status == 0U) && (lrf_ecc_uncorrected_err_status == 0U)) { - return; - } - - lrf_corrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + - offset)); - lrf_uncorrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + - offset)); - is_lrf_ecc_corrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_total_counter_overflow_v(lrf_ecc_status) != 0U; - is_lrf_ecc_uncorrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_total_counter_overflow_v(lrf_ecc_status) != 0U; - - if ((lrf_corrected_err_count_delta > 0U) || is_lrf_ecc_corrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "corrected error (SBE) detected in SM LRF! err_mask [%08x] is_overf [%d]", - lrf_ecc_corrected_err_status, is_lrf_ecc_corrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_lrf_ecc_corrected_total_err_overflow) { - lrf_corrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s()); - } - g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += - lrf_corrected_err_count_delta; - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_LRF_ECC_CORRECTED, 0, - g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset, - 0); - } - if ((lrf_uncorrected_err_count_delta > 0U) || is_lrf_ecc_uncorrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Uncorrected error (DBE) detected in SM LRF! 
err_mask [%08x] is_overf [%d]", - lrf_ecc_uncorrected_err_status, is_lrf_ecc_uncorrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_lrf_ecc_uncorrected_total_err_overflow) { - lrf_uncorrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s()); - } - g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += - lrf_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_LRF_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, - 0); - } - - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, - gr_pri_gpc0_tpc0_sm_lrf_ecc_status_reset_task_f()); -} - -static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr) -{ - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - u32 cbu_ecc_status, cbu_ecc_corrected_err_status = 0; - u32 cbu_ecc_uncorrected_err_status = 0; - u32 cbu_corrected_err_count_delta = 0; - u32 cbu_uncorrected_err_count_delta = 0; - bool is_cbu_ecc_corrected_total_err_overflow = false; - bool is_cbu_ecc_uncorrected_total_err_overflow = false; - - /* Check for CBU ECC errors. */ - cbu_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + offset); - cbu_ecc_corrected_err_status = cbu_ecc_status & - (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm0_m() | - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm1_m() | - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm0_m() | - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm1_m()); - cbu_ecc_uncorrected_err_status = cbu_ecc_status & - (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm0_m() | - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm1_m() | - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm0_m() | - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm1_m()); - - if ((cbu_ecc_corrected_err_status == 0U) && (cbu_ecc_uncorrected_err_status == 0U)) { - return; - } - - cbu_corrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + - offset)); - cbu_uncorrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + - offset)); - is_cbu_ecc_corrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_total_counter_overflow_v(cbu_ecc_status) != 0U; - is_cbu_ecc_uncorrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_total_counter_overflow_v(cbu_ecc_status) != 0U; - - if ((cbu_corrected_err_count_delta > 0U) || is_cbu_ecc_corrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "corrected error (SBE) detected in SM CBU! 
err_mask [%08x] is_overf [%d]", - cbu_ecc_corrected_err_status, is_cbu_ecc_corrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_cbu_ecc_corrected_total_err_overflow) { - cbu_corrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s()); - } - g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += - cbu_corrected_err_count_delta; - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_CBU_ECC_CORRECTED, - 0, g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset, - 0); - } - if ((cbu_uncorrected_err_count_delta > 0U) || is_cbu_ecc_uncorrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Uncorrected error (DBE) detected in SM CBU! err_mask [%08x] is_overf [%d]", - cbu_ecc_uncorrected_err_status, is_cbu_ecc_uncorrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_cbu_ecc_uncorrected_total_err_overflow) { - cbu_uncorrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s()); - } - g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += - cbu_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_CBU_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, - 0); - } - - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + offset, - gr_pri_gpc0_tpc0_sm_cbu_ecc_status_reset_task_f()); -} - -static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr) -{ - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - u32 l1_data_ecc_status, l1_data_ecc_corrected_err_status = 0; - u32 l1_data_ecc_uncorrected_err_status = 0; - u32 l1_data_corrected_err_count_delta = 0; - u32 l1_data_uncorrected_err_count_delta = 0; - bool is_l1_data_ecc_corrected_total_err_overflow = false; - bool is_l1_data_ecc_uncorrected_total_err_overflow = false; - - /* Check for L1 data ECC errors. 
*/ - l1_data_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset); - l1_data_ecc_corrected_err_status = l1_data_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_1_m()); - l1_data_ecc_uncorrected_err_status = l1_data_ecc_status & - (gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_0_m() | - gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_1_m()); - - if ((l1_data_ecc_corrected_err_status == 0U) && (l1_data_ecc_uncorrected_err_status == 0U)) { - return; - } - - l1_data_corrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + - offset)); - l1_data_uncorrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + - offset)); - is_l1_data_ecc_corrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_total_counter_overflow_v(l1_data_ecc_status) != 0U; - is_l1_data_ecc_uncorrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_total_counter_overflow_v(l1_data_ecc_status) != 0U; - - if ((l1_data_corrected_err_count_delta > 0U) || is_l1_data_ecc_corrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "corrected error (SBE) detected in SM L1 data! err_mask [%08x] is_overf [%d]", - l1_data_ecc_corrected_err_status, is_l1_data_ecc_corrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_l1_data_ecc_corrected_total_err_overflow) { - l1_data_corrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s()); - } - g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += - l1_data_corrected_err_count_delta; - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_DATA_ECC_CORRECTED, - 0, g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset, - 0); - } - if ((l1_data_uncorrected_err_count_delta > 0U) || is_l1_data_ecc_uncorrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Uncorrected error (DBE) detected in SM L1 data! 
err_mask [%08x] is_overf [%d]", - l1_data_ecc_uncorrected_err_status, is_l1_data_ecc_uncorrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_l1_data_ecc_uncorrected_total_err_overflow) { - l1_data_uncorrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s()); - } - g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += - l1_data_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_L1_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, - 0); - } - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset, - gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f()); -} - -static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr) -{ - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; - u32 icache_ecc_status, icache_ecc_corrected_err_status = 0; - u32 icache_ecc_uncorrected_err_status = 0; - u32 icache_corrected_err_count_delta = 0; - u32 icache_uncorrected_err_count_delta = 0; - bool is_icache_ecc_corrected_total_err_overflow = false; - bool is_icache_ecc_uncorrected_total_err_overflow = false; - - /* Check for L0 && L1 icache ECC errors. */ - icache_ecc_status = gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset); - icache_ecc_corrected_err_status = icache_ecc_status & - (gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m() | - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m() | - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m() | - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()); - icache_ecc_uncorrected_err_status = icache_ecc_status & - (gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m() | - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m() | - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m() | - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()); - - if ((icache_ecc_corrected_err_status == 0U) && (icache_ecc_uncorrected_err_status == 0U)) { - return; - } - - icache_corrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + - offset)); - icache_uncorrected_err_count_delta = - gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_v( - gk20a_readl(g, - gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + - offset)); - is_icache_ecc_corrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_total_counter_overflow_v(icache_ecc_status) != 0U; - is_icache_ecc_uncorrected_total_err_overflow = - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_total_counter_overflow_v(icache_ecc_status) != 0U; - - if ((icache_corrected_err_count_delta > 0U) || is_icache_ecc_corrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "corrected error (SBE) detected in SM L0 && L1 icache! 
err_mask [%08x] is_overf [%d]", - icache_ecc_corrected_err_status, is_icache_ecc_corrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_icache_ecc_corrected_total_err_overflow) { - icache_corrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s()); - } - g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter += - icache_corrected_err_count_delta; - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset, - 0); - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED, - 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); - } - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED, - 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); - } - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED, - 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); - } - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED, - 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); - } - } - if ((icache_uncorrected_err_count_delta > 0U) || is_icache_ecc_uncorrected_total_err_overflow) { - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, - "Uncorrected error (DBE) detected in SM L0 && L1 icache! 
err_mask [%08x] is_overf [%d]", - icache_ecc_uncorrected_err_status, is_icache_ecc_uncorrected_total_err_overflow); - - /* HW uses 16-bits counter */ - if (is_icache_ecc_uncorrected_total_err_overflow) { - icache_uncorrected_err_count_delta += - BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s()); - } - g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter += - icache_uncorrected_err_count_delta; - gk20a_writel(g, - gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset, - 0); - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); - } - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); - } - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); - } - if ((icache_ecc_status & - gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_SM, - (gpc << 8) | tpc, - GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); - } - } - - gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset, - gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f()); -} - -void gr_gv11b_handle_tpc_sm_ecc_exception(struct gk20a *g, - u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr) -{ - /* Check for L1 tag ECC errors. */ - gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); - - /* Check for LRF ECC errors. */ - gr_gv11b_handle_lrf_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); - - /* Check for CBU ECC errors. */ - gr_gv11b_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); - - /* Check for L1 data ECC errors. */ - gr_gv11b_handle_l1_data_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); - - /* Check for L0 && L1 icache ECC errors. 
*/ - gr_gv11b_handle_icache_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); -} - void gr_gv11b_set_alpha_circular_buffer_size(struct gk20a *g, u32 data) { struct nvgpu_gr *gr = g->gr; @@ -880,7 +278,7 @@ static void gr_gv11b_dump_gr_sm_regs(struct gk20a *g, for (sm = 0; sm < sm_per_tpc; sm++) { offset = gpc_offset + tpc_offset + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); gr_gv11b_dump_gr_per_sm_regs(g, o, gpc, tpc, sm, offset); @@ -1091,7 +489,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g, /* clear interrupt */ offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); nvgpu_writel(g, gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0); @@ -1212,7 +610,7 @@ clear_intr: /* clear interrupt */ offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); nvgpu_writel(g, gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0); @@ -1277,7 +675,7 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, u32 global_mask = 0, dbgr_control0, global_esr_copy; u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); if ((global_esr & gr_gpc0_tpc0_sm0_hww_global_esr_bpt_int_pending_f()) != 0U) { @@ -1321,9 +719,9 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, } /* reset the HWW errors after locking down */ - global_esr_copy = g->ops.gr.get_sm_hww_global_esr(g, + global_esr_copy = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm); - g->ops.gr.clear_sm_hww(g, + g->ops.gr.intr.clear_sm_hww(g, gpc, tpc, sm, global_esr_copy); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "CILP: HWWs cleared for " @@ -1367,26 +765,6 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, return 0; } -void gv11b_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, - u32 *esr_sm_sel) -{ - u32 reg_val; - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc); - - reg_val = gk20a_readl(g, gr_gpc0_tpc0_sm_tpc_esr_sm_sel_r() + offset); - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, - "sm tpc esr sm sel reg val: 0x%x", reg_val); - *esr_sm_sel = 0; - if (gr_gpc0_tpc0_sm_tpc_esr_sm_sel_sm0_error_v(reg_val) != 0U) { - *esr_sm_sel = 1; - } - if (gr_gpc0_tpc0_sm_tpc_esr_sm_sel_sm1_error_v(reg_val) != 0U) { - *esr_sm_sel |= BIT32(1); - } - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, - "esr_sm_sel bitmask: 0x%x", *esr_sm_sel); -} - int gv11b_gr_sm_trigger_suspend(struct gk20a *g) { u32 dbgr_control0; @@ -1429,7 +807,7 @@ void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state) offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); /* 64 bit read */ warps_valid = (u64)gk20a_readl(g, @@ -1515,7 +893,7 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g, reg_offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); ops[i].op = REGOP(WRITE_32); ops[i].type = REGOP(TYPE_GR_CTX); @@ -1552,110 +930,6 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g, return err; } -static void gv11b_gr_read_sm_error_state(struct gk20a *g, - u32 offset, - struct nvgpu_tsg_sm_error_state *sm_error_states) -{ - sm_error_states->hww_global_esr = nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset); - - sm_error_states->hww_warp_esr = nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset); - - 
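The gv11b get_esr_sm_sel implementation being moved in this patch reports a per-SM bitmask (bit 0 for SM0, bit 1 for SM1). The consuming loop in gr_intr.c is elided from the diff context above; a plausible sketch of how that bitmask is consumed, under the assumption that per-SM handling is skipped when the bit is clear:

    g->ops.gr.intr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel);

    for (sm = 0; sm < sm_per_tpc; sm++) {
        if ((esr_sm_sel & BIT32(sm)) == 0U) {
            continue;   /* this SM did not raise the ESR */
        }
        /* handle the SM exception for (gpc, tpc, sm) */
    }
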
sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64((nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset)), - (nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset))); - - sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset); - - sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset); -} - -u64 gv11b_gr_get_sm_hww_warp_esr_pc(struct gk20a *g, u32 offset) -{ - u64 hww_warp_esr_pc; - - hww_warp_esr_pc = hi32_lo32_to_u64((nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset)),(nvgpu_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset))); - - return hww_warp_esr_pc; -} - -int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - struct nvgpu_channel *fault_ch) -{ - int ret = 0; - u32 sm_id; - u32 offset, sm_per_tpc, tpc_id; - u32 gpc_offset, gpc_tpc_offset; - struct nvgpu_tsg_sm_error_state *sm_error_states = NULL; - struct nvgpu_tsg *tsg = NULL; - - nvgpu_mutex_acquire(&g->dbg_sessions_lock); - - sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); - gpc_offset = nvgpu_gr_gpc_offset(g, gpc); - gpc_tpc_offset = gpc_offset + nvgpu_gr_tpc_offset(g, tpc); - - tpc_id = gk20a_readl(g, gr_gpc0_gpm_pd_sm_id_r(tpc) + gpc_offset); - sm_id = tpc_id * sm_per_tpc + sm; - - offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm); - - if (fault_ch != NULL) { - tsg = nvgpu_tsg_from_ch(fault_ch); - } - - if (tsg == NULL) { - nvgpu_err(g, "no valid tsg"); - ret = -EINVAL; - goto record_fail; - } - - sm_error_states = tsg->sm_error_states + sm_id; - gv11b_gr_read_sm_error_state(g, offset, sm_error_states); - -record_fail: - nvgpu_mutex_release(&g->dbg_sessions_lock); - - return ret; -} - -void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g) -{ - - /* clear hww */ - gk20a_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_r(), 0xffffffffU); - gk20a_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_r(), 0xffffffffU); - - /* setup sm warp esr report masks */ - gk20a_writel(g, gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r(), - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_error_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_api_stack_error_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_pc_wrap_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_misaligned_pc_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_pc_overflow_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_misaligned_reg_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_illegal_instr_param_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_oor_reg_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_oor_addr_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_misaligned_addr_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_addr_space_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_fault_report_f() | - gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f()); - - /* setup sm global esr report mask. 
vat_alarm_report is not enabled */ - gk20a_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r(), - gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_multiple_warp_errors_report_f()); -} - bool gv11b_gr_sm_debugger_attached(struct gk20a *g) { u32 debugger_mode; @@ -1685,7 +959,7 @@ void gv11b_gr_suspend_single_sm(struct gk20a *g, u32 dbgr_control0; u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); /* if an SM debugger isn't attached, skip suspend */ if (!g->ops.gr.sm_debugger_attached(g)) { @@ -1779,7 +1053,7 @@ void gv11b_gr_resume_single_sm(struct gk20a *g, */ offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "resuming gpc:%d, tpc:%d, sm%d", gpc, tpc, sm); @@ -1912,48 +1186,6 @@ int gv11b_gr_resume_from_pause(struct gk20a *g) return err; } -u32 gv11b_gr_get_sm_hww_warp_esr(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm) -{ - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + - nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); - - u32 hww_warp_esr = gk20a_readl(g, - gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset); - return hww_warp_esr; -} - -u32 gv11b_gr_get_sm_hww_global_esr(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm) -{ - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + - nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); - - u32 hww_global_esr = gk20a_readl(g, - gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset); - - return hww_global_esr; -} - -u32 gv11b_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g) -{ - /* - * These three interrupts don't require locking down the SM. They can - * be handled by usermode clients as they aren't fatal. Additionally, - * usermode clients may wish to allow some warps to execute while others - * are at breakpoints, as opposed to fatal errors where all warps should - * halt. 
- */ - u32 global_esr_mask = - gr_gpc0_tpc0_sm0_hww_global_esr_bpt_int_pending_f() | - gr_gpc0_tpc0_sm0_hww_global_esr_bpt_pause_pending_f() | - gr_gpc0_tpc0_sm0_hww_global_esr_single_step_complete_pending_f(); - - return global_esr_mask; -} - static void gv11b_gr_sm_dump_warp_bpt_pause_trap_mask_regs(struct gk20a *g, u32 offset, bool timeout) { @@ -2011,7 +1243,7 @@ int gv11b_gr_wait_for_sm_lock_down(struct gk20a *g, int err; u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d: locking down SM%d", gpc, tpc, sm); @@ -2025,11 +1257,11 @@ int gv11b_gr_wait_for_sm_lock_down(struct gk20a *g, /* wait for the sm to lock down */ do { - global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); + global_esr = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm); dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm0_dbgr_status0_r() + offset); - warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); + warp_esr = g->ops.gr.intr.get_sm_hww_warp_esr(g, gpc, tpc, sm); locked_down = (gr_gpc0_tpc0_sm0_dbgr_status0_locked_down_v(dbgr_status0) == @@ -2099,7 +1331,7 @@ int gv11b_gr_lock_down_sm(struct gk20a *g, { u32 dbgr_control0; u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm); @@ -2115,26 +1347,6 @@ int gv11b_gr_lock_down_sm(struct gk20a *g, check_errors); } -void gv11b_gr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - u32 global_esr) -{ - u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); - - gk20a_writel(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset, - global_esr); - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, - "Cleared HWW global esr, current reg val: 0x%x", - gk20a_readl(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + - offset)); - - gk20a_writel(g, gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0); - nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, - "Cleared HWW warp esr, current reg val: 0x%x", - gk20a_readl(g, gr_gpc0_tpc0_sm0_hww_warp_esr_r() + - offset)); -} - static const u32 _num_ovr_perf_regs = 20; static u32 _ovr_perf_regs[20] = { 0, }; @@ -2550,19 +1762,6 @@ u32 gv11b_gr_get_egpc_base(struct gk20a *g) return EGPC_PRI_BASE; } -int gr_gv11b_handle_ssync_hww(struct gk20a *g, u32 *ssync_esr) -{ - u32 ssync = gk20a_readl(g, gr_ssync_hww_esr_r()); - - if (ssync_esr != NULL) { - *ssync_esr = ssync; - } - nvgpu_err(g, "ssync exception: esr 0x%08x", ssync); - gk20a_writel(g, gr_ssync_hww_esr_r(), - gr_ssync_hww_esr_reset_active_f()); - return -EFAULT; -} - /* * This function will decode a priv address and return the partition * type and numbers @@ -2938,7 +2137,7 @@ int gv11b_gr_clear_sm_error_state(struct gk20a *g, offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc) + - gv11b_gr_sm_offset(g, sm); + nvgpu_gr_sm_offset(g, sm); val = gk20a_readl(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset); gk20a_writel(g, gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset, diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.h b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.h index beadb68d3..ecb45fe3d 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.h @@ -29,11 +29,6 @@ struct gk20a; struct nvgpu_warpstate; struct nvgpu_debug_context; -u32 gr_gv11b_ctxsw_checksum_mismatch_mailbox_val(void); -void 
gr_gv11b_handle_tpc_sm_ecc_exception(struct gk20a *g, - u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr); void gr_gv11b_set_alpha_circular_buffer_size(struct gk20a *g, u32 data); void gr_gv11b_set_circular_buffer_size(struct gk20a *g, u32 data); int gr_gv11b_dump_gr_status_regs(struct gk20a *g, @@ -43,18 +38,12 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr, u32 warp_esr, bool sm_debugger_attached, struct nvgpu_channel *fault_ch, bool *early_exit, bool *ignore_debugger); -void gv11b_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, - u32 *esr_sm_sel); int gv11b_gr_sm_trigger_suspend(struct gk20a *g); void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state); int gv11b_gr_set_sm_debug_mode(struct gk20a *g, struct nvgpu_channel *ch, u64 sms, bool enable); -u64 gv11b_gr_get_sm_hww_warp_esr_pc(struct gk20a *g, u32 offset); -int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - struct nvgpu_channel *fault_ch); int gv11b_gr_clear_sm_error_state(struct gk20a *g, struct nvgpu_channel *ch, u32 sm_id); -void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g); bool gv11b_gr_sm_debugger_attached(struct gk20a *g); void gv11b_gr_suspend_single_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, @@ -65,19 +54,12 @@ void gv11b_gr_resume_single_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm); void gv11b_gr_resume_all_sms(struct gk20a *g); int gv11b_gr_resume_from_pause(struct gk20a *g); -u32 gv11b_gr_get_sm_hww_warp_esr(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm); -u32 gv11b_gr_get_sm_hww_global_esr(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm); -u32 gv11b_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g); int gv11b_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors); int gv11b_gr_lock_down_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors); -void gv11b_gr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - u32 global_esr); void gv11b_gr_init_ovr_sm_dsm_perf(void); void gv11b_gr_init_sm_dsm_reg_info(void); void gv11b_gr_get_sm_dsm_perf_regs(struct gk20a *g, @@ -102,8 +84,6 @@ void gv11b_gr_egpc_etpc_priv_addr_table(struct gk20a *g, u32 addr, u32 gpc_num, u32 tpc_num, u32 broadcast_flags, u32 *priv_addr_table, u32 *t); u32 gv11b_gr_get_egpc_base(struct gk20a *g); -int gr_gv11b_handle_ssync_hww(struct gk20a *g, u32 *ssync_esr); -u32 gv11b_gr_sm_offset(struct gk20a *g, u32 sm); int gr_gv11b_decode_priv_addr(struct gk20a *g, u32 addr, enum ctxsw_addr_type *addr_type, u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num, diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.c index 29d16635c..1bf04dc49 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.c @@ -109,78 +109,3 @@ void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g, *sm_dsm_perf_ctrl_regs = NULL; *ctrl_register_stride = 0; } - -void gr_tu104_log_mme_exception(struct gk20a *g) -{ - u32 mme_hww_esr = nvgpu_readl(g, gr_mme_hww_esr_r()); - u32 mme_hww_info = nvgpu_readl(g, gr_mme_hww_esr_info_r()); - - if ((mme_hww_esr & - gr_mme_hww_esr_missing_macro_data_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: MISSING_MACRO_DATA"); - } - - if ((mme_hww_esr & - gr_mme_hww_esr_illegal_mme_method_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: 
ILLEGAL_MME_METHOD"); - } - - if ((mme_hww_esr & - gr_mme_hww_esr_dma_dram_access_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: DMA_DRAM_ACCESS_OUT_OF_BOUNDS"); - } - - if ((mme_hww_esr & - gr_mme_hww_esr_dma_illegal_fifo_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: DMA_ILLEGAL_FIFO_CONFIG"); - } - - if ((mme_hww_esr & - gr_mme_hww_esr_dma_read_overflow_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: DMA_READ_FIFOED_OVERFLOW"); - } - - if ((mme_hww_esr & - gr_mme_hww_esr_dma_fifo_resized_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: DMA_FIFO_RESIZED_WHEN_NONIDLE"); - } - - if ((mme_hww_esr & gr_mme_hww_esr_illegal_opcode_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: ILLEGAL_OPCODE"); - } - - if ((mme_hww_esr & gr_mme_hww_esr_branch_in_delay_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: BRANCH_IN_DELAY_SHOT"); - } - - if ((mme_hww_esr & gr_mme_hww_esr_inst_ram_acess_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: INSTR_RAM_ACCESS_OUT_OF_BOUNDS"); - } - - if ((mme_hww_esr & gr_mme_hww_esr_data_ram_access_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: DATA_RAM_ACCESS_OUT_OF_BOUNDS"); - } - - if ((mme_hww_esr & gr_mme_hww_esr_dma_read_pb_pending_f()) != 0U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: DMA_READ_FIFOED_FROM_PB"); - } - - if (gr_mme_hww_esr_info_pc_valid_v(mme_hww_info) == 0x1U) { - nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, - "GR MME EXCEPTION: INFO2 0x%x, INFO3 0x%x, INFO4 0x%x", - nvgpu_readl(g, gr_mme_hww_esr_info2_r()), - nvgpu_readl(g, gr_mme_hww_esr_info3_r()), - nvgpu_readl(g, gr_mme_hww_esr_info4_r())); - } -} diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.h b/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.h index 3242baa2c..89cb645a0 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.h +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_tu104.h @@ -37,5 +37,4 @@ void gr_tu104_init_sm_dsm_reg_info(void); void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g, u32 *num_sm_dsm_perf_ctrl_regs, u32 **sm_dsm_perf_ctrl_regs, u32 *ctrl_register_stride); -void gr_tu104_log_mme_exception(struct gk20a *g); #endif /* NVGPU_GR_TU104_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c index 61d0c5703..59e5c88ba 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "common/gr/gr_intr_priv.h" @@ -287,8 +288,8 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception) if ((exception & gr_exception_ssync_m()) != 0U) { u32 ssync_esr = 0; - if (g->ops.gr.handle_ssync_hww != NULL) { - if (g->ops.gr.handle_ssync_hww(g, &ssync_esr) + if (g->ops.gr.intr.handle_ssync_hww != NULL) { + if (g->ops.gr.intr.handle_ssync_hww(g, &ssync_esr) != 0) { gpc_reset = true; } @@ -309,8 +310,8 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception) mme); nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x", mme, info); - if (g->ops.gr.log_mme_exception != NULL) { - g->ops.gr.log_mme_exception(g); + if (g->ops.gr.intr.log_mme_exception != NULL) { + g->ops.gr.intr.log_mme_exception(g); } nvgpu_writel(g, gr_mme_hww_esr_r(), @@ -510,3 +511,183 
@@ u32 gm20b_gr_intr_nonstall_isr(struct gk20a *g) } return ops; } + +void gm20b_gr_intr_set_hww_esr_report_mask(struct gk20a *g) +{ + /* setup sm warp esr report masks */ + gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_mmu_fault_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_overflow_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() | + gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f()); + + /* setup sm global esr report mask */ + gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), + gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() | + gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f()); +} + +void gm20b_gr_intr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, + u32 *esr_sm_sel) +{ + *esr_sm_sel = 1; +} + +void gm20b_gr_intr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + u32 global_esr) +{ + u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_gr_tpc_offset(g, tpc)); + + gk20a_writel(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_global_esr_r(), offset), + global_esr); + + /* clear the warp hww */ + gk20a_writel(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_warp_esr_r(), offset), + 0); +} + +static void gm20b_gr_intr_read_sm_error_state(struct gk20a *g, + u32 offset, + struct nvgpu_tsg_sm_error_state *sm_error_states) +{ + sm_error_states->hww_global_esr = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_global_esr_r(), offset)); + sm_error_states->hww_warp_esr = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_warp_esr_r(), offset)); + sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_warp_esr_pc_r(), offset))); + sm_error_states->hww_global_esr_report_mask = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r(), offset)); + sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r(), offset)); + +} + +u32 gm20b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + struct nvgpu_channel *fault_ch) +{ + u32 sm_id; + u32 
gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, + GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset; + struct nvgpu_tsg_sm_error_state *sm_error_states = NULL; + struct nvgpu_tsg *tsg = NULL; + + offset = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(gpc_stride, gpc), + nvgpu_safe_mult_u32(tpc_in_gpc_stride, tpc)); + + nvgpu_mutex_acquire(&g->dbg_sessions_lock); + + sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v( + gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_cfg_r(), offset))); + + if (fault_ch != NULL) { + tsg = nvgpu_tsg_from_ch(fault_ch); + } + + if (tsg == NULL) { + nvgpu_err(g, "no valid tsg"); + goto record_fail; + } + + sm_error_states = tsg->sm_error_states + sm_id; + gm20b_gr_intr_read_sm_error_state(g, offset, sm_error_states); + +record_fail: + nvgpu_mutex_release(&g->dbg_sessions_lock); + + return sm_id; +} + +u32 gm20b_gr_intr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, + u32 sm) +{ + u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_gr_tpc_offset(g, tpc)); + + u32 hww_global_esr = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_global_esr_r(), offset)); + + return hww_global_esr; +} + +u32 gm20b_gr_intr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm) +{ + u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_gr_tpc_offset(g, tpc)); + u32 hww_warp_esr = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_hww_warp_esr_r(), offset)); + return hww_warp_esr; +} + +u32 gm20b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g) +{ + /* + * These three interrupts don't require locking down the SM. They can + * be handled by usermode clients as they aren't fatal. Additionally, + * usermode clients may wish to allow some warps to execute while others + * are at breakpoints, as opposed to fatal errors where all warps should + * halt. 
+ */ + u32 global_esr_mask = + gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() | + gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() | + gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f(); + + return global_esr_mask; +} + +u64 gm20b_gr_intr_tpc_enabled_exceptions(struct gk20a *g) +{ + u32 sm_id; + u64 tpc_exception_en = 0; + u32 offset, regval, tpc_offset, gpc_offset; + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g); + struct nvgpu_gr_config *config = nvgpu_gr_get_config_ptr(g); + + for (sm_id = 0; sm_id < no_of_sm; sm_id++) { + struct nvgpu_sm_info *sm_info = + nvgpu_gr_config_get_sm_info(config, sm_id); + tpc_offset = tpc_in_gpc_stride * + nvgpu_gr_config_get_sm_info_tpc_index(sm_info); + gpc_offset = gpc_stride * + nvgpu_gr_config_get_sm_info_gpc_index(sm_info); + offset = nvgpu_safe_add_u32(tpc_offset, gpc_offset); + + regval = gk20a_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), offset)); + /* Each bit represents the corresponding enablement state, bit 0 corresponds to SM0 */ + tpc_exception_en |= + (u64)gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << + (u64)sm_id; + } + + return tpc_exception_en; +} + diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.h index cd129ad6f..605697d4e 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.h +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.h @@ -67,4 +67,19 @@ void gm20b_gr_intr_enable_gpc_exceptions(struct gk20a *g, u32 gm20b_gr_intr_nonstall_isr(struct gk20a *g); void gm20ab_gr_intr_tpc_exception_sm_disable(struct gk20a *g, u32 offset); void gm20ab_gr_intr_tpc_exception_sm_enable(struct gk20a *g); + +void gm20b_gr_intr_set_hww_esr_report_mask(struct gk20a *g); +void gm20b_gr_intr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, + u32 *esr_sm_sel); +void gm20b_gr_intr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + u32 global_esr); +u32 gm20b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + struct nvgpu_channel *fault_ch); + +u32 gm20b_gr_intr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, + u32 sm); +u32 gm20b_gr_intr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm); +u32 gm20b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g); +u64 gm20b_gr_intr_tpc_enabled_exceptions(struct gk20a *g); + #endif /* NVGPU_GR_INTR_GM20B_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c index 8d4f1f4a3..93dafc9be 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c @@ -251,7 +251,7 @@ void gv11b_gr_intr_set_shader_exceptions(struct gk20a *g, u32 data) nvgpu_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r(), 0); } else { - g->ops.gr.set_hww_esr_report_mask(g); + g->ops.gr.intr.set_hww_esr_report_mask(g); } } @@ -696,3 +696,820 @@ void gv11b_gr_intr_enable_gpc_exceptions(struct gk20a *g, gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1U) | gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1U))); } + +void gv11b_gr_intr_set_hww_esr_report_mask(struct gk20a *g) +{ + + /* clear hww */ + nvgpu_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_r(), 0xffffffffU); + nvgpu_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_r(), 0xffffffffU); + + /* setup sm warp esr report masks */ + nvgpu_writel(g, gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r(), +
gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_error_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_api_stack_error_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_pc_wrap_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_misaligned_pc_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_pc_overflow_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_misaligned_reg_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_illegal_instr_param_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_oor_reg_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_oor_addr_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_misaligned_addr_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_addr_space_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_fault_report_f() | + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f()); + + /* setup sm global esr report mask. vat_alarm_report is not enabled */ + nvgpu_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r(), + gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_multiple_warp_errors_report_f()); +} + +static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset; + u32 l1_tag_ecc_status, l1_tag_ecc_corrected_err_status = 0; + u32 l1_tag_ecc_uncorrected_err_status = 0; + u32 l1_tag_corrected_err_count_delta = 0; + u32 l1_tag_uncorrected_err_count_delta = 0; + bool is_l1_tag_ecc_corrected_total_err_overflow = false; + bool is_l1_tag_ecc_uncorrected_total_err_overflow = false; + + offset = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(gpc_stride, gpc), + nvgpu_safe_mult_u32(tpc_in_gpc_stride, tpc)); + + /* Check for L1 tag ECC errors. 
*/ + l1_tag_ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(), offset)); + l1_tag_ecc_corrected_err_status = l1_tag_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()); + l1_tag_ecc_uncorrected_err_status = l1_tag_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()); + + if ((l1_tag_ecc_corrected_err_status == 0U) && (l1_tag_ecc_uncorrected_err_status == 0U)) { + return; + } + + l1_tag_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(), + offset))); + l1_tag_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(), + offset))); + is_l1_tag_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(l1_tag_ecc_status) != 0U; + is_l1_tag_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(l1_tag_ecc_status) != 0U; + + if ((l1_tag_corrected_err_count_delta > 0U) || is_l1_tag_ecc_corrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]", + l1_tag_ecc_corrected_err_status, is_l1_tag_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_l1_tag_ecc_corrected_total_err_overflow) { + l1_tag_corrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s()); + } + g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += + l1_tag_corrected_err_count_delta; + if ((l1_tag_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_TAG_ECC_CORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((l1_tag_ecc_status & + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((l1_tag_ecc_status & + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); + } + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(), offset), + 0); + } + if ((l1_tag_uncorrected_err_count_delta > 0U) || is_l1_tag_ecc_uncorrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM L1 tag! 
err_mask [%08x] is_overf [%d]", + l1_tag_ecc_uncorrected_err_status, is_l1_tag_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_l1_tag_ecc_uncorrected_total_err_overflow) { + l1_tag_uncorrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s()); + } + g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += + l1_tag_uncorrected_err_count_delta; + if ((l1_tag_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((l1_tag_ecc_status & + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((l1_tag_ecc_status & + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + } + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(), offset), + 0); + } + + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(), offset), + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f()); +} + +static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset; + u32 lrf_ecc_status, lrf_ecc_corrected_err_status = 0; + u32 lrf_ecc_uncorrected_err_status = 0; + u32 lrf_corrected_err_count_delta = 0; + u32 lrf_uncorrected_err_count_delta = 0; + bool is_lrf_ecc_corrected_total_err_overflow = false; + bool is_lrf_ecc_uncorrected_total_err_overflow = false; + + offset = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(gpc_stride, gpc), + nvgpu_safe_mult_u32(tpc_in_gpc_stride, tpc)); + + /* Check for LRF ECC errors. 
*/ + lrf_ecc_status = nvgpu_readl(g, + nvgpu_safe_add_u32(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(), + offset)); + lrf_ecc_corrected_err_status = lrf_ecc_status & + (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp4_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp5_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp6_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp7_m()); + lrf_ecc_uncorrected_err_status = lrf_ecc_status & + (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp4_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp5_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp6_m() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp7_m()); + + if ((lrf_ecc_corrected_err_status == 0U) && (lrf_ecc_uncorrected_err_status == 0U)) { + return; + } + + lrf_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r(), + offset))); + lrf_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), + offset))); + is_lrf_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_total_counter_overflow_v(lrf_ecc_status) != 0U; + is_lrf_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_total_counter_overflow_v(lrf_ecc_status) != 0U; + + if ((lrf_corrected_err_count_delta > 0U) || is_lrf_ecc_corrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM LRF! err_mask [%08x] is_overf [%d]", + lrf_ecc_corrected_err_status, is_lrf_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_lrf_ecc_corrected_total_err_overflow) { + lrf_corrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s()); + } + g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += + lrf_corrected_err_count_delta; + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_LRF_ECC_CORRECTED, 0, + g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter); + nvgpu_writel(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset, + 0); + } + if ((lrf_uncorrected_err_count_delta > 0U) || is_lrf_ecc_uncorrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM LRF! 
err_mask [%08x] is_overf [%d]", + lrf_ecc_uncorrected_err_status, is_lrf_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_lrf_ecc_uncorrected_total_err_overflow) { + lrf_uncorrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s()); + } + g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += + lrf_uncorrected_err_count_delta; + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_LRF_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset), + 0); + } + + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(), offset), + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_reset_task_f()); +} + +static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset; + u32 cbu_ecc_status, cbu_ecc_corrected_err_status = 0; + u32 cbu_ecc_uncorrected_err_status = 0; + u32 cbu_corrected_err_count_delta = 0; + u32 cbu_uncorrected_err_count_delta = 0; + bool is_cbu_ecc_corrected_total_err_overflow = false; + bool is_cbu_ecc_uncorrected_total_err_overflow = false; + + offset = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(gpc_stride, gpc), + nvgpu_safe_mult_u32(tpc_in_gpc_stride, tpc)); + + /* Check for CBU ECC errors. */ + cbu_ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r(), offset)); + cbu_ecc_corrected_err_status = cbu_ecc_status & + (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm0_m() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm1_m() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm0_m() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm1_m()); + cbu_ecc_uncorrected_err_status = cbu_ecc_status & + (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm0_m() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm1_m() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm0_m() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm1_m()); + + if ((cbu_ecc_corrected_err_status == 0U) && (cbu_ecc_uncorrected_err_status == 0U)) { + return; + } + + cbu_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r(), + offset))); + cbu_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), + offset))); + is_cbu_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_total_counter_overflow_v(cbu_ecc_status) != 0U; + is_cbu_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_total_counter_overflow_v(cbu_ecc_status) != 0U; + + if ((cbu_corrected_err_count_delta > 0U) || is_cbu_ecc_corrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM CBU! 
err_mask [%08x] is_overf [%d]", + cbu_ecc_corrected_err_status, is_cbu_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_cbu_ecc_corrected_total_err_overflow) { + cbu_corrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s()); + } + g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += + cbu_corrected_err_count_delta; + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_CBU_ECC_CORRECTED, + 0, g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter); + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r(), offset), + 0); + } + if ((cbu_uncorrected_err_count_delta > 0U) || is_cbu_ecc_uncorrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM CBU! err_mask [%08x] is_overf [%d]", + cbu_ecc_uncorrected_err_status, is_cbu_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_cbu_ecc_uncorrected_total_err_overflow) { + cbu_uncorrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s()); + } + g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += + cbu_uncorrected_err_count_delta; + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_CBU_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset), + 0); + } + + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r(), offset), + gr_pri_gpc0_tpc0_sm_cbu_ecc_status_reset_task_f()); +} + +static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset; + u32 l1_data_ecc_status, l1_data_ecc_corrected_err_status = 0; + u32 l1_data_ecc_uncorrected_err_status = 0; + u32 l1_data_corrected_err_count_delta = 0; + u32 l1_data_uncorrected_err_count_delta = 0; + bool is_l1_data_ecc_corrected_total_err_overflow = false; + bool is_l1_data_ecc_uncorrected_total_err_overflow = false; + + offset = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(gpc_stride, gpc), + nvgpu_safe_mult_u32(tpc_in_gpc_stride, tpc)); + + /* Check for L1 data ECC errors. 
*/ + l1_data_ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r(), offset)); + l1_data_ecc_corrected_err_status = l1_data_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_1_m()); + l1_data_ecc_uncorrected_err_status = l1_data_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_1_m()); + + if ((l1_data_ecc_corrected_err_status == 0U) && (l1_data_ecc_uncorrected_err_status == 0U)) { + return; + } + + l1_data_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r(), + offset))); + l1_data_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), + offset))); + is_l1_data_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_total_counter_overflow_v(l1_data_ecc_status) != 0U; + is_l1_data_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_total_counter_overflow_v(l1_data_ecc_status) != 0U; + + if ((l1_data_corrected_err_count_delta > 0U) || is_l1_data_ecc_corrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM L1 data! err_mask [%08x] is_overf [%d]", + l1_data_ecc_corrected_err_status, is_l1_data_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_l1_data_ecc_corrected_total_err_overflow) { + l1_data_corrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s()); + } + g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += + l1_data_corrected_err_count_delta; + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_DATA_ECC_CORRECTED, + 0, g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter); + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r(), offset), + 0); + } + if ((l1_data_uncorrected_err_count_delta > 0U) || is_l1_data_ecc_uncorrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM L1 data! 
err_mask [%08x] is_overf [%d]", + l1_data_ecc_uncorrected_err_status, is_l1_data_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_l1_data_ecc_uncorrected_total_err_overflow) { + l1_data_uncorrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s()); + } + g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += + l1_data_uncorrected_err_count_delta; + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_L1_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset), + 0); + } + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r(), offset), + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f()); +} + +static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset; + u32 icache_ecc_status, icache_ecc_corrected_err_status = 0; + u32 icache_ecc_uncorrected_err_status = 0; + u32 icache_corrected_err_count_delta = 0; + u32 icache_uncorrected_err_count_delta = 0; + bool is_icache_ecc_corrected_total_err_overflow = false; + bool is_icache_ecc_uncorrected_total_err_overflow = false; + + offset = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(gpc_stride, gpc), + nvgpu_safe_mult_u32(tpc_in_gpc_stride, tpc)); + + /* Check for L0 && L1 icache ECC errors. */ + icache_ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_icache_ecc_status_r(), offset)); + icache_ecc_corrected_err_status = icache_ecc_status & + (gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()); + icache_ecc_uncorrected_err_status = icache_ecc_status & + (gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()); + + if ((icache_ecc_corrected_err_status == 0U) && (icache_ecc_uncorrected_err_status == 0U)) { + return; + } + + icache_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(), + offset))); + icache_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_v( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(), + offset))); + is_icache_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_total_counter_overflow_v(icache_ecc_status) != 0U; + is_icache_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_total_counter_overflow_v(icache_ecc_status) != 0U; + + if ((icache_corrected_err_count_delta > 0U) || is_icache_ecc_corrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM L0 && L1 icache! 
err_mask [%08x] is_overf [%d]", + icache_ecc_corrected_err_status, is_icache_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_icache_ecc_corrected_total_err_overflow) { + icache_corrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s()); + } + g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter += + icache_corrected_err_count_delta; + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(), offset), + 0); + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED, + 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); + } + } + if ((icache_uncorrected_err_count_delta > 0U) || is_icache_ecc_uncorrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM L0 && L1 icache! 
err_mask [%08x] is_overf [%d]", + icache_ecc_uncorrected_err_status, is_icache_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + if (is_icache_ecc_uncorrected_total_err_overflow) { + icache_uncorrected_err_count_delta += + BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s()); + } + g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter += + icache_uncorrected_err_count_delta; + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(), offset), + 0); + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + if ((icache_ecc_status & + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << 8) | tpc, + GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + } + } + + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_pri_gpc0_tpc0_sm_icache_ecc_status_r(), offset), + gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f()); +} + +void gv11b_gr_intr_handle_tpc_sm_ecc_exception(struct gk20a *g, + u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr) +{ + /* Check for L1 tag ECC errors. */ + gv11b_gr_intr_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + + /* Check for LRF ECC errors. */ + gv11b_gr_intr_handle_lrf_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + + /* Check for CBU ECC errors. */ + gv11b_gr_intr_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + + /* Check for L1 data ECC errors. */ + gv11b_gr_intr_handle_l1_data_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + + /* Check for L0 && L1 icache ECC errors. 
*/ + gv11b_gr_intr_handle_icache_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); +} + +void gv11b_gr_intr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, + u32 *esr_sm_sel) +{ + u32 reg_val; + u32 offset; + + offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_gr_tpc_offset(g, tpc)); + + reg_val = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm_tpc_esr_sm_sel_r(), offset)); + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "sm tpc esr sm sel reg val: 0x%x", reg_val); + *esr_sm_sel = 0; + if (gr_gpc0_tpc0_sm_tpc_esr_sm_sel_sm0_error_v(reg_val) != 0U) { + *esr_sm_sel = 1; + } + if (gr_gpc0_tpc0_sm_tpc_esr_sm_sel_sm1_error_v(reg_val) != 0U) { + *esr_sm_sel |= BIT32(1); + } + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "esr_sm_sel bitmask: 0x%x", *esr_sm_sel); +} + +void gv11b_gr_intr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + u32 global_esr) +{ + u32 offset; + + offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_safe_add_u32(nvgpu_gr_tpc_offset(g, tpc), + nvgpu_gr_sm_offset(g, sm))); + + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_global_esr_r(), offset), + global_esr); + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "Cleared HWW global esr, current reg val: 0x%x", + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_global_esr_r(), offset))); + + nvgpu_writel(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_r(), offset), 0); + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "Cleared HWW warp esr, current reg val: 0x%x", + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_r(), offset))); +} + +int gv11b_gr_intr_handle_ssync_hww(struct gk20a *g, u32 *ssync_esr) +{ + u32 ssync = nvgpu_readl(g, gr_ssync_hww_esr_r()); + + if (ssync_esr != NULL) { + *ssync_esr = ssync; + } + nvgpu_err(g, "ssync exception: esr 0x%08x", ssync); + nvgpu_writel(g, gr_ssync_hww_esr_r(), + gr_ssync_hww_esr_reset_active_f()); + return -EFAULT; +} + +static void gv11b_gr_intr_read_sm_error_state(struct gk20a *g, + u32 offset, + struct nvgpu_tsg_sm_error_state *sm_error_states) +{ + sm_error_states->hww_global_esr = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_global_esr_r(), offset)); + + sm_error_states->hww_warp_esr = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_r(), offset)); + + sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r(), offset)), + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r(), offset))); + + sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g, + nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r(), + offset)); + + sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g, + nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r(), + offset)); +} + +u32 gv11b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + struct nvgpu_channel *fault_ch) +{ + u32 sm_id; + u32 offset, sm_per_tpc, tpc_id; + u32 gpc_offset, gpc_tpc_offset; + struct nvgpu_tsg_sm_error_state *sm_error_states = NULL; + struct nvgpu_tsg *tsg = NULL; + + nvgpu_mutex_acquire(&g->dbg_sessions_lock); + + sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); + gpc_offset = nvgpu_gr_gpc_offset(g, gpc); + gpc_tpc_offset = nvgpu_safe_add_u32(gpc_offset, + nvgpu_gr_tpc_offset(g, tpc)); + + tpc_id = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_gpm_pd_sm_id_r(tpc), gpc_offset)); + sm_id = nvgpu_safe_add_u32( + nvgpu_safe_mult_u32(tpc_id, 
sm_per_tpc), + sm); + + offset = nvgpu_safe_add_u32(gpc_tpc_offset, + nvgpu_gr_sm_offset(g, sm)); + + if (fault_ch != NULL) { + tsg = nvgpu_tsg_from_ch(fault_ch); + } + + if (tsg == NULL) { + nvgpu_err(g, "no valid tsg"); + goto record_fail; + } + + sm_error_states = tsg->sm_error_states + sm_id; + gv11b_gr_intr_read_sm_error_state(g, offset, sm_error_states); + +record_fail: + nvgpu_mutex_release(&g->dbg_sessions_lock); + + return sm_id; +} + +u32 gv11b_gr_intr_get_sm_hww_warp_esr(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm) +{ + u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_safe_add_u32(nvgpu_gr_tpc_offset(g, tpc), + nvgpu_gr_sm_offset(g, sm))); + + u32 hww_warp_esr = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_r(), offset)); + return hww_warp_esr; +} + +u32 gv11b_gr_intr_get_sm_hww_global_esr(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm) +{ + u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc), + nvgpu_safe_add_u32(nvgpu_gr_tpc_offset(g, tpc), + nvgpu_gr_sm_offset(g, sm))); + + u32 hww_global_esr = nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_global_esr_r(), offset)); + + return hww_global_esr; +} + +u32 gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g) +{ + /* + * These three interrupts don't require locking down the SM. They can + * be handled by usermode clients as they aren't fatal. Additionally, + * usermode clients may wish to allow some warps to execute while others + * are at breakpoints, as opposed to fatal errors where all warps should + * halt. + */ + u32 global_esr_mask = + gr_gpc0_tpc0_sm0_hww_global_esr_bpt_int_pending_f() | + gr_gpc0_tpc0_sm0_hww_global_esr_bpt_pause_pending_f() | + gr_gpc0_tpc0_sm0_hww_global_esr_single_step_complete_pending_f(); + + return global_esr_mask; +} + +u64 gv11b_gr_intr_get_sm_hww_warp_esr_pc(struct gk20a *g, u32 offset) +{ + u64 hww_warp_esr_pc; + + hww_warp_esr_pc = hi32_lo32_to_u64( + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r(), offset)), + nvgpu_readl(g, nvgpu_safe_add_u32( + gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r(), offset))); + + return hww_warp_esr_pc; +} + +u32 gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val(void) +{ + return gr_fecs_ctxsw_mailbox_value_ctxsw_checksum_mismatch_v(); +} diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h index cc2ce8874..8518434d4 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h @@ -81,4 +81,26 @@ void gv11b_gr_intr_enable_exceptions(struct gk20a *g, void gv11b_gr_intr_enable_gpc_exceptions(struct gk20a *g, struct nvgpu_gr_config *gr_config); +void gv11b_gr_intr_set_hww_esr_report_mask(struct gk20a *g); +void gv11b_gr_intr_handle_tpc_sm_ecc_exception(struct gk20a *g, + u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr); +void gv11b_gr_intr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, + u32 *esr_sm_sel); +void gv11b_gr_intr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + u32 global_esr); +int gv11b_gr_intr_handle_ssync_hww(struct gk20a *g, u32 *ssync_esr); +u32 gv11b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, + struct nvgpu_channel *fault_ch); + +u32 gv11b_gr_intr_get_sm_hww_warp_esr(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm); +u32 gv11b_gr_intr_get_sm_hww_global_esr(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm); +u32 gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g); 
+u64 gv11b_gr_intr_get_sm_hww_warp_esr_pc(struct gk20a *g, u32 offset); + +u32 gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val(void); + #endif /* NVGPU_GR_INTR_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c index 7b3f4b496..e4caa53d4 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c @@ -155,3 +155,78 @@ void tu104_gr_intr_enable_gpc_exceptions(struct gk20a *g, gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1U) | gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1U))); } + +void tu104_gr_intr_log_mme_exception(struct gk20a *g) +{ + u32 mme_hww_esr = nvgpu_readl(g, gr_mme_hww_esr_r()); + u32 mme_hww_info = nvgpu_readl(g, gr_mme_hww_esr_info_r()); + + if ((mme_hww_esr & + gr_mme_hww_esr_missing_macro_data_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: MISSING_MACRO_DATA"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_illegal_mme_method_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: ILLEGAL_MME_METHOD"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_dram_access_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_DRAM_ACCESS_OUT_OF_BOUNDS"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_illegal_fifo_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_ILLEGAL_FIFO_CONFIG"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_read_overflow_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_READ_FIFOED_OVERFLOW"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_fifo_resized_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_FIFO_RESIZED_WHEN_NONIDLE"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_illegal_opcode_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: ILLEGAL_OPCODE"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_branch_in_delay_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: BRANCH_IN_DELAY_SHOT"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_inst_ram_acess_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: INSTR_RAM_ACCESS_OUT_OF_BOUNDS"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_data_ram_access_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DATA_RAM_ACCESS_OUT_OF_BOUNDS"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_dma_read_pb_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_READ_FIFOED_FROM_PB"); + } + + if (gr_mme_hww_esr_info_pc_valid_v(mme_hww_info) == 0x1U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: INFO2 0x%x, INFO3 0x%x, INFO4 0x%x", + nvgpu_readl(g, gr_mme_hww_esr_info2_r()), + nvgpu_readl(g, gr_mme_hww_esr_info3_r()), + nvgpu_readl(g, gr_mme_hww_esr_info4_r())); + } +} diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.h index c0380e002..718d8e926 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.h +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.h @@ -52,5 +52,6 @@ int tu104_gr_intr_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); void tu104_gr_intr_enable_gpc_exceptions(struct gk20a *g, struct nvgpu_gr_config *gr_config); +void tu104_gr_intr_log_mme_exception(struct gk20a *g); #endif /* 
NVGPU_GR_INTR_TU104_H */ diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 6d48f76ba..bb2b03cb3 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -152,7 +152,6 @@ static const struct gpu_ops gm20b_ops = { .set_circular_buffer_size = gr_gm20b_set_circular_buffer_size, .get_sm_dsm_perf_regs = gr_gm20b_get_sm_dsm_perf_regs, .get_sm_dsm_perf_ctrl_regs = gr_gm20b_get_sm_dsm_perf_ctrl_regs, - .set_hww_esr_report_mask = gr_gm20b_set_hww_esr_report_mask, .set_gpc_tpc_mask = gr_gm20b_set_gpc_tpc_mask, .is_tpc_addr = gr_gm20b_is_tpc_addr, .get_tpc_num = gr_gm20b_get_tpc_num, @@ -166,7 +165,6 @@ static const struct gpu_ops gm20b_ops = { .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, .set_mmu_debug_mode = gm20b_gr_set_mmu_debug_mode, - .record_sm_error_state = gm20b_gr_record_sm_error_state, .clear_sm_error_state = gm20b_gr_clear_sm_error_state, .suspend_contexts = gr_gk20a_suspend_contexts, .resume_contexts = gr_gk20a_resume_contexts, @@ -174,21 +172,13 @@ static const struct gpu_ops gm20b_ops = { .wait_for_pause = gr_gk20a_wait_for_pause, .resume_from_pause = gr_gk20a_resume_from_pause, .clear_sm_errors = gr_gk20a_clear_sm_errors, - .tpc_enabled_exceptions = gr_gk20a_tpc_enabled_exceptions, - .get_esr_sm_sel = gk20a_gr_get_esr_sm_sel, .sm_debugger_attached = gk20a_gr_sm_debugger_attached, .suspend_single_sm = gk20a_gr_suspend_single_sm, .suspend_all_sms = gk20a_gr_suspend_all_sms, .resume_single_sm = gk20a_gr_resume_single_sm, .resume_all_sms = gk20a_gr_resume_all_sms, - .get_sm_hww_warp_esr = gk20a_gr_get_sm_hww_warp_esr, - .get_sm_hww_global_esr = gk20a_gr_get_sm_hww_global_esr, - .get_sm_hww_warp_esr_pc = NULL, - .get_sm_no_lock_down_hww_global_esr_mask = - gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask, .lock_down_sm = gk20a_gr_lock_down_sm, .wait_for_sm_lock_down = gk20a_gr_wait_for_sm_lock_down, - .clear_sm_hww = gm20b_gr_clear_sm_hww, .init_ovr_sm_dsm_perf = gk20a_gr_init_ovr_sm_dsm_perf, .get_ovr_perf_regs = gk20a_gr_get_ovr_perf_regs, .decode_priv_addr = gr_gk20a_decode_priv_addr, @@ -197,7 +187,6 @@ static const struct gpu_ops gm20b_ops = { .get_offset_in_gpccs_segment = gr_gk20a_get_offset_in_gpccs_segment, .set_debug_mode = gm20b_gr_set_debug_mode, - .log_mme_exception = NULL, .reset = nvgpu_gr_reset, .esr_bpt_pending_events = gm20b_gr_esr_bpt_pending_events, .halt_pipe = nvgpu_gr_halt_pipe, @@ -444,6 +433,20 @@ static const struct gpu_ops gm20b_ops = { nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, .flush_channel_tlb = nvgpu_gr_intr_flush_channel_tlb, + .set_hww_esr_report_mask = + gm20b_gr_intr_set_hww_esr_report_mask, + .get_esr_sm_sel = gm20b_gr_intr_get_esr_sm_sel, + .clear_sm_hww = gm20b_gr_intr_clear_sm_hww, + .record_sm_error_state = + gm20b_gr_intr_record_sm_error_state, + .get_sm_hww_warp_esr = + gm20b_gr_intr_get_sm_hww_warp_esr, + .get_sm_hww_global_esr = + gm20b_gr_intr_get_sm_hww_global_esr, + .get_sm_no_lock_down_hww_global_esr_mask = + gm20b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask, + .tpc_enabled_exceptions = + gm20b_gr_intr_tpc_enabled_exceptions, }, .falcon = { .read_fecs_ctxsw_mailbox = diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index 3aa74c01a..aa70eab0f 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -180,7 +180,6 @@ static const struct gpu_ops gp10b_ops = { 
.set_circular_buffer_size = gr_gp10b_set_circular_buffer_size, .get_sm_dsm_perf_regs = gr_gm20b_get_sm_dsm_perf_regs, .get_sm_dsm_perf_ctrl_regs = gr_gm20b_get_sm_dsm_perf_ctrl_regs, - .set_hww_esr_report_mask = gr_gm20b_set_hww_esr_report_mask, .set_gpc_tpc_mask = gr_gp10b_set_gpc_tpc_mask, .is_tpc_addr = gr_gm20b_is_tpc_addr, .get_tpc_num = gr_gm20b_get_tpc_num, @@ -194,7 +193,6 @@ static const struct gpu_ops gp10b_ops = { .update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, .set_mmu_debug_mode = NULL, - .record_sm_error_state = gm20b_gr_record_sm_error_state, .clear_sm_error_state = gm20b_gr_clear_sm_error_state, .suspend_contexts = gr_gp10b_suspend_contexts, .resume_contexts = gr_gk20a_resume_contexts, @@ -202,21 +200,13 @@ static const struct gpu_ops gp10b_ops = { .wait_for_pause = gr_gk20a_wait_for_pause, .resume_from_pause = gr_gk20a_resume_from_pause, .clear_sm_errors = gr_gk20a_clear_sm_errors, - .tpc_enabled_exceptions = gr_gk20a_tpc_enabled_exceptions, - .get_esr_sm_sel = gk20a_gr_get_esr_sm_sel, .sm_debugger_attached = gk20a_gr_sm_debugger_attached, .suspend_single_sm = gk20a_gr_suspend_single_sm, .suspend_all_sms = gk20a_gr_suspend_all_sms, .resume_single_sm = gk20a_gr_resume_single_sm, .resume_all_sms = gk20a_gr_resume_all_sms, - .get_sm_hww_warp_esr = gp10b_gr_get_sm_hww_warp_esr, - .get_sm_hww_global_esr = gk20a_gr_get_sm_hww_global_esr, - .get_sm_hww_warp_esr_pc = NULL, - .get_sm_no_lock_down_hww_global_esr_mask = - gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask, .lock_down_sm = gk20a_gr_lock_down_sm, .wait_for_sm_lock_down = gk20a_gr_wait_for_sm_lock_down, - .clear_sm_hww = gm20b_gr_clear_sm_hww, .init_ovr_sm_dsm_perf = gk20a_gr_init_ovr_sm_dsm_perf, .get_ovr_perf_regs = gk20a_gr_get_ovr_perf_regs, #ifdef NVGPU_FEATURE_CHANNEL_TSG_SCHEDULING @@ -230,7 +220,6 @@ static const struct gpu_ops gp10b_ops = { .get_offset_in_gpccs_segment = gr_gk20a_get_offset_in_gpccs_segment, .set_debug_mode = gm20b_gr_set_debug_mode, - .log_mme_exception = NULL, .reset = nvgpu_gr_reset, .esr_bpt_pending_events = gm20b_gr_esr_bpt_pending_events, .halt_pipe = nvgpu_gr_halt_pipe, @@ -510,6 +499,20 @@ static const struct gpu_ops gp10b_ops = { gp10b_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, .flush_channel_tlb = nvgpu_gr_intr_flush_channel_tlb, + .set_hww_esr_report_mask = + gm20b_gr_intr_set_hww_esr_report_mask, + .get_esr_sm_sel = gm20b_gr_intr_get_esr_sm_sel, + .clear_sm_hww = gm20b_gr_intr_clear_sm_hww, + .record_sm_error_state = + gm20b_gr_intr_record_sm_error_state, + .get_sm_hww_warp_esr = + gm20b_gr_intr_get_sm_hww_warp_esr, + .get_sm_hww_global_esr = + gm20b_gr_intr_get_sm_hww_global_esr, + .get_sm_no_lock_down_hww_global_esr_mask = + gm20b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask, + .tpc_enabled_exceptions = + gm20b_gr_intr_tpc_enabled_exceptions, }, .falcon = { .read_fecs_ctxsw_mailbox = diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 76e3cd5df..568fdbb95 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -222,7 +222,6 @@ static const struct gpu_ops gv11b_ops = { .set_circular_buffer_size = gr_gv11b_set_circular_buffer_size, .get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs, .get_sm_dsm_perf_ctrl_regs = gv11b_gr_get_sm_dsm_perf_ctrl_regs, - .set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask, .set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask, .is_tpc_addr = gr_gm20b_is_tpc_addr, 
.get_tpc_num = gr_gm20b_get_tpc_num, @@ -239,7 +238,6 @@ static const struct gpu_ops gv11b_ops = { .set_pmm_register = gr_gv100_set_pmm_register, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, .init_hwpm_pmm_register = gr_gv100_init_hwpm_pmm_register, - .record_sm_error_state = gv11b_gr_record_sm_error_state, .clear_sm_error_state = gv11b_gr_clear_sm_error_state, .suspend_contexts = gr_gp10b_suspend_contexts, .resume_contexts = gr_gk20a_resume_contexts, @@ -247,21 +245,13 @@ static const struct gpu_ops gv11b_ops = { .wait_for_pause = gr_gk20a_wait_for_pause, .resume_from_pause = gv11b_gr_resume_from_pause, .clear_sm_errors = gr_gk20a_clear_sm_errors, - .tpc_enabled_exceptions = gr_gk20a_tpc_enabled_exceptions, - .get_esr_sm_sel = gv11b_gr_get_esr_sm_sel, .sm_debugger_attached = gv11b_gr_sm_debugger_attached, .suspend_single_sm = gv11b_gr_suspend_single_sm, .suspend_all_sms = gv11b_gr_suspend_all_sms, .resume_single_sm = gv11b_gr_resume_single_sm, .resume_all_sms = gv11b_gr_resume_all_sms, - .get_sm_hww_warp_esr = gv11b_gr_get_sm_hww_warp_esr, - .get_sm_hww_global_esr = gv11b_gr_get_sm_hww_global_esr, - .get_sm_hww_warp_esr_pc = gv11b_gr_get_sm_hww_warp_esr_pc, - .get_sm_no_lock_down_hww_global_esr_mask = - gv11b_gr_get_sm_no_lock_down_hww_global_esr_mask, .lock_down_sm = gv11b_gr_lock_down_sm, .wait_for_sm_lock_down = gv11b_gr_wait_for_sm_lock_down, - .clear_sm_hww = gv11b_gr_clear_sm_hww, .init_ovr_sm_dsm_perf = gv11b_gr_init_ovr_sm_dsm_perf, .get_ovr_perf_regs = gv11b_gr_get_ovr_perf_regs, #ifdef NVGPU_FEATURE_CHANNEL_TSG_SCHEDULING @@ -276,10 +266,7 @@ static const struct gpu_ops gv11b_ops = { .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, - .handle_tpc_sm_ecc_exception = - gr_gv11b_handle_tpc_sm_ecc_exception, .decode_egpc_addr = gv11b_gr_decode_egpc_addr, - .handle_ssync_hww = gr_gv11b_handle_ssync_hww, .decode_priv_addr = gr_gv11b_decode_priv_addr, .create_priv_addr_table = gr_gv11b_create_priv_addr_table, .split_fbpa_broadcast_addr = gr_gk20a_split_fbpa_broadcast_addr, @@ -287,9 +274,6 @@ static const struct gpu_ops gv11b_ops = { gr_gk20a_get_offset_in_gpccs_segment, .set_debug_mode = gm20b_gr_set_debug_mode, .set_mmu_debug_mode = gm20b_gr_set_mmu_debug_mode, - .log_mme_exception = NULL, - .get_ctxsw_checksum_mismatch_mailbox_val = - gr_gv11b_ctxsw_checksum_mismatch_mailbox_val, .reset = nvgpu_gr_reset, .esr_bpt_pending_events = gv11b_gr_esr_bpt_pending_events, .halt_pipe = nvgpu_gr_halt_pipe, @@ -609,6 +593,27 @@ static const struct gpu_ops gv11b_ops = { nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, .flush_channel_tlb = nvgpu_gr_intr_flush_channel_tlb, + .set_hww_esr_report_mask = + gv11b_gr_intr_set_hww_esr_report_mask, + .handle_tpc_sm_ecc_exception = + gv11b_gr_intr_handle_tpc_sm_ecc_exception, + .get_esr_sm_sel = gv11b_gr_intr_get_esr_sm_sel, + .clear_sm_hww = gv11b_gr_intr_clear_sm_hww, + .handle_ssync_hww = gv11b_gr_intr_handle_ssync_hww, + .record_sm_error_state = + gv11b_gr_intr_record_sm_error_state, + .get_sm_hww_warp_esr = + gv11b_gr_intr_get_sm_hww_warp_esr, + .get_sm_hww_warp_esr_pc = + gv11b_gr_intr_get_sm_hww_warp_esr_pc, + .get_sm_hww_global_esr = + gv11b_gr_intr_get_sm_hww_global_esr, + .get_sm_no_lock_down_hww_global_esr_mask = + gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask, + .tpc_enabled_exceptions = + gm20b_gr_intr_tpc_enabled_exceptions, + .get_ctxsw_checksum_mismatch_mailbox_val = + 
gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val, }, .falcon = { .handle_fecs_ecc_error = diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index 8d3b7271a..340faf62f 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -270,7 +270,6 @@ static const struct gpu_ops tu104_ops = { .set_circular_buffer_size = gr_gv11b_set_circular_buffer_size, .get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs, .get_sm_dsm_perf_ctrl_regs = gr_tu104_get_sm_dsm_perf_ctrl_regs, - .set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask, .set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask, .is_tpc_addr = gr_gm20b_is_tpc_addr, .get_tpc_num = gr_gm20b_get_tpc_num, @@ -287,7 +286,6 @@ static const struct gpu_ops tu104_ops = { .set_mmu_debug_mode = gm20b_gr_set_mmu_debug_mode, .update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode, .init_hwpm_pmm_register = gr_gv100_init_hwpm_pmm_register, - .record_sm_error_state = gv11b_gr_record_sm_error_state, .clear_sm_error_state = gv11b_gr_clear_sm_error_state, .suspend_contexts = gr_gp10b_suspend_contexts, .resume_contexts = gr_gk20a_resume_contexts, @@ -295,21 +293,13 @@ static const struct gpu_ops tu104_ops = { .wait_for_pause = gr_gk20a_wait_for_pause, .resume_from_pause = gv11b_gr_resume_from_pause, .clear_sm_errors = gr_gk20a_clear_sm_errors, - .tpc_enabled_exceptions = gr_gk20a_tpc_enabled_exceptions, - .get_esr_sm_sel = gv11b_gr_get_esr_sm_sel, .sm_debugger_attached = gv11b_gr_sm_debugger_attached, .suspend_single_sm = gv11b_gr_suspend_single_sm, .suspend_all_sms = gv11b_gr_suspend_all_sms, .resume_single_sm = gv11b_gr_resume_single_sm, .resume_all_sms = gv11b_gr_resume_all_sms, - .get_sm_hww_warp_esr = gv11b_gr_get_sm_hww_warp_esr, - .get_sm_hww_global_esr = gv11b_gr_get_sm_hww_global_esr, - .get_sm_hww_warp_esr_pc = gv11b_gr_get_sm_hww_warp_esr_pc, - .get_sm_no_lock_down_hww_global_esr_mask = - gv11b_gr_get_sm_no_lock_down_hww_global_esr_mask, .lock_down_sm = gv11b_gr_lock_down_sm, .wait_for_sm_lock_down = gv11b_gr_wait_for_sm_lock_down, - .clear_sm_hww = gv11b_gr_clear_sm_hww, .init_ovr_sm_dsm_perf = gv11b_gr_init_ovr_sm_dsm_perf, .get_ovr_perf_regs = gv11b_gr_get_ovr_perf_regs, #ifdef NVGPU_FEATURE_CHANNEL_TSG_SCHEDULING @@ -324,17 +314,13 @@ static const struct gpu_ops tu104_ops = { .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, - .handle_tpc_sm_ecc_exception = - gr_gv11b_handle_tpc_sm_ecc_exception, .decode_egpc_addr = gv11b_gr_decode_egpc_addr, - .handle_ssync_hww = gr_gv11b_handle_ssync_hww, .decode_priv_addr = gr_gv11b_decode_priv_addr, .create_priv_addr_table = gr_gv11b_create_priv_addr_table, .split_fbpa_broadcast_addr = gr_gv100_split_fbpa_broadcast_addr, .get_offset_in_gpccs_segment = gr_tu104_get_offset_in_gpccs_segment, .set_debug_mode = gm20b_gr_set_debug_mode, - .log_mme_exception = gr_tu104_log_mme_exception, .reset = nvgpu_gr_reset, .esr_bpt_pending_events = gv11b_gr_esr_bpt_pending_events, .halt_pipe = nvgpu_gr_halt_pipe, @@ -650,6 +636,26 @@ static const struct gpu_ops tu104_ops = { nvgpu_gr_intr_handle_sm_exception, .stall_isr = nvgpu_gr_intr_stall_isr, .flush_channel_tlb = nvgpu_gr_intr_flush_channel_tlb, + .set_hww_esr_report_mask = + gv11b_gr_intr_set_hww_esr_report_mask, + .handle_tpc_sm_ecc_exception = + gv11b_gr_intr_handle_tpc_sm_ecc_exception, + .get_esr_sm_sel = gv11b_gr_intr_get_esr_sm_sel, + .clear_sm_hww = gv11b_gr_intr_clear_sm_hww, + .handle_ssync_hww = 
gv11b_gr_intr_handle_ssync_hww, + .log_mme_exception = tu104_gr_intr_log_mme_exception, + .record_sm_error_state = + gv11b_gr_intr_record_sm_error_state, + .get_sm_hww_warp_esr = + gv11b_gr_intr_get_sm_hww_warp_esr, + .get_sm_hww_warp_esr_pc = + gv11b_gr_intr_get_sm_hww_warp_esr_pc, + .get_sm_hww_global_esr = + gv11b_gr_intr_get_sm_hww_global_esr, + .get_sm_no_lock_down_hww_global_esr_mask = + gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask, + .tpc_enabled_exceptions = + gm20b_gr_intr_tpc_enabled_exceptions, }, .falcon = { .handle_fecs_ecc_error = diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index b74727ca5..7ac9a067e 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -280,7 +280,6 @@ struct gpu_ops { void (*get_ovr_perf_regs)(struct gk20a *g, u32 *num_ovr_perf_regs, u32 **ovr_perf_regsr); - void (*set_hww_esr_report_mask)(struct gk20a *g); void (*set_gpc_tpc_mask)(struct gk20a *g, u32 gpc_index); int (*decode_egpc_addr)(struct gk20a *g, u32 addr, enum ctxsw_addr_type *addr_type, @@ -325,27 +324,11 @@ struct gpu_ops { bool sm_debugger_attached, struct nvgpu_channel *fault_ch, bool *early_exit, bool *ignore_debugger); - u32 (*get_sm_hww_warp_esr)(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm); - u32 (*get_sm_hww_global_esr)(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm); - u64 (*get_sm_hww_warp_esr_pc)(struct gk20a *g, u32 offset); - u32 (*get_sm_no_lock_down_hww_global_esr_mask)(struct gk20a *g); int (*lock_down_sm)(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors); int (*wait_for_sm_lock_down)(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors); - void (*clear_sm_hww)(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - u32 global_esr); - void (*get_esr_sm_sel)(struct gk20a *g, u32 gpc, u32 tpc, - u32 *esr_sm_sel); - void (*handle_tpc_sm_ecc_exception)(struct gk20a *g, - u32 gpc, u32 tpc, - bool *post_event, struct nvgpu_channel *fault_ch, - u32 *hww_global_esr); u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g); - int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc, - u32 sm, struct nvgpu_channel *fault_ch); int (*clear_sm_error_state)(struct gk20a *g, struct nvgpu_channel *ch, u32 sm_id); int (*suspend_contexts)(struct gk20a *g, @@ -366,7 +349,6 @@ struct gpu_ops { int (*wait_for_pause)(struct gk20a *g, struct nvgpu_warpstate *w_state); int (*resume_from_pause)(struct gk20a *g); int (*clear_sm_errors)(struct gk20a *g); - u64 (*tpc_enabled_exceptions)(struct gk20a *g); bool (*sm_debugger_attached)(struct gk20a *g); void (*suspend_single_sm)(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, @@ -376,7 +358,6 @@ struct gpu_ops { void (*resume_single_sm)(struct gk20a *g, u32 gpc, u32 tpc, u32 sm); void (*resume_all_sms)(struct gk20a *g); - int (*handle_ssync_hww)(struct gk20a *g, u32 *ssync_esr); int (*add_ctxsw_reg_pm_fbpa)(struct gk20a *g, struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, @@ -403,7 +384,6 @@ struct gpu_ops { void (*set_debug_mode)(struct gk20a *g, bool enable); int (*set_mmu_debug_mode)(struct gk20a *g, struct nvgpu_channel *ch, bool enable); - void (*log_mme_exception)(struct gk20a *g); int (*reset)(struct gk20a *g); bool (*esr_bpt_pending_events)(u32 global_esr, enum nvgpu_event_id_type bpt_event); @@ -856,9 +836,30 @@ struct gpu_ops { u32 *hww_global_esr); int (*stall_isr)(struct gk20a *g); void (*flush_channel_tlb)(struct gk20a *g); + void (*set_hww_esr_report_mask)(struct gk20a *g); 
+ void (*handle_tpc_sm_ecc_exception)(struct gk20a *g, + u32 gpc, u32 tpc, + bool *post_event, struct nvgpu_channel *fault_ch, + u32 *hww_global_esr); + void (*get_esr_sm_sel)(struct gk20a *g, u32 gpc, u32 tpc, + u32 *esr_sm_sel); + void (*clear_sm_hww)(struct gk20a *g, u32 gpc, u32 tpc, + u32 sm, u32 global_esr); + int (*handle_ssync_hww)(struct gk20a *g, u32 *ssync_esr); + void (*log_mme_exception)(struct gk20a *g); + u32 (*record_sm_error_state)(struct gk20a *g, u32 gpc, + u32 tpc, u32 sm, struct nvgpu_channel *fault_ch); + u32 (*get_sm_hww_warp_esr)(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm); + u32 (*get_sm_hww_global_esr)(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm); + u64 (*get_sm_hww_warp_esr_pc)(struct gk20a *g, + u32 offset); + u32 (*get_sm_no_lock_down_hww_global_esr_mask)( + struct gk20a *g); + u64 (*tpc_enabled_exceptions)(struct gk20a *g); + u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void); } intr; - - u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void); } gr; struct { diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h b/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h index e7b9bdb78..012b8ca1d 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h @@ -35,6 +35,7 @@ int nvgpu_gr_reset(struct gk20a *g); int nvgpu_gr_init_support(struct gk20a *g); u32 nvgpu_gr_gpc_offset(struct gk20a *g, u32 gpc); u32 nvgpu_gr_tpc_offset(struct gk20a *g, u32 tpc); +u32 nvgpu_gr_sm_offset(struct gk20a *g, u32 sm); int nvgpu_gr_suspend(struct gk20a *g); void nvgpu_gr_wait_initialized(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c index 42325ebb2..2a88c9fd9 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c @@ -807,7 +807,7 @@ static int nvgpu_gpu_ioctl_has_any_exception( u64 tpc_exception_en; nvgpu_mutex_acquire(&g->dbg_sessions_lock); - tpc_exception_en = g->ops.gr.tpc_enabled_exceptions(g); + tpc_exception_en = g->ops.gr.intr.tpc_enabled_exceptions(g); nvgpu_mutex_release(&g->dbg_sessions_lock); args->tpc_exception_en_sm_mask = tpc_exception_en;