diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c
index ad056493f..cf5fe52d9 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c
@@ -121,75 +121,10 @@ u32 gm20b_gr_intr_read_pending_interrupts(struct gk20a *g,
 	return gr_intr;
 }
 
-bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
+static u32 gr_gm20b_intr_check_gr_ssync_exception(struct gk20a *g,
+				u32 exception)
 {
-	bool gpc_reset = false;
-	u32 exception = nvgpu_readl(g, gr_exception_r());
-
-	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-				"exception %08x\n", exception);
-
-	if ((exception & gr_exception_fe_m()) != 0U) {
-		u32 fe = nvgpu_readl(g, gr_fe_hww_esr_r());
-		u32 info = nvgpu_readl(g, gr_fe_hww_esr_info_r());
-
-		nvgpu_gr_intr_report_exception(g, 0,
-				GPU_PGRAPH_FE_EXCEPTION,
-				fe, 0);
-		nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
-				fe, info);
-		nvgpu_writel(g, gr_fe_hww_esr_r(),
-				gr_fe_hww_esr_reset_active_f());
-		gpc_reset = true;
-	}
-
-	if ((exception & gr_exception_memfmt_m()) != 0U) {
-		u32 memfmt = nvgpu_readl(g, gr_memfmt_hww_esr_r());
-
-		nvgpu_gr_intr_report_exception(g, 0,
-				GPU_PGRAPH_MEMFMT_EXCEPTION,
-				memfmt, 0);
-		nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
-		nvgpu_writel(g, gr_memfmt_hww_esr_r(),
-				gr_memfmt_hww_esr_reset_active_f());
-		gpc_reset = true;
-	}
-
-	if ((exception & gr_exception_pd_m()) != 0U) {
-		u32 pd = nvgpu_readl(g, gr_pd_hww_esr_r());
-
-		nvgpu_gr_intr_report_exception(g, 0,
-				GPU_PGRAPH_PD_EXCEPTION,
-				pd, 0);
-		nvgpu_err(g, "pd exception: esr 0x%08x", pd);
-		nvgpu_writel(g, gr_pd_hww_esr_r(),
-				gr_pd_hww_esr_reset_active_f());
-		gpc_reset = true;
-	}
-
-	if ((exception & gr_exception_scc_m()) != 0U) {
-		u32 scc = nvgpu_readl(g, gr_scc_hww_esr_r());
-
-		nvgpu_gr_intr_report_exception(g, 0,
-				GPU_PGRAPH_SCC_EXCEPTION,
-				scc, 0);
-		nvgpu_err(g, "scc exception: esr 0x%08x", scc);
-		nvgpu_writel(g, gr_scc_hww_esr_r(),
-				gr_scc_hww_esr_reset_active_f());
-		gpc_reset = true;
-	}
-
-	if ((exception & gr_exception_ds_m()) != 0U) {
-		u32 ds = nvgpu_readl(g, gr_ds_hww_esr_r());
-
-		nvgpu_gr_intr_report_exception(g, 0,
-				GPU_PGRAPH_DS_EXCEPTION,
-				ds, 0);
-		nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
-		nvgpu_writel(g, gr_ds_hww_esr_r(),
-				gr_ds_hww_esr_reset_task_f());
-		gpc_reset = true;
-	}
+	u32 reset_gpc = 0U;
 
 	if ((exception & gr_exception_ssync_m()) != 0U) {
 		u32 ssync_esr = 0;
@@ -197,7 +132,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
 		if (g->ops.gr.intr.handle_ssync_hww != NULL) {
 			if (g->ops.gr.intr.handle_ssync_hww(g, &ssync_esr)
 					!= 0) {
-				gpc_reset = true;
+				reset_gpc = 1U;
 			}
 		} else {
 			nvgpu_err(g, "unhandled ssync exception");
@@ -206,7 +141,12 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
 				GPU_PGRAPH_SSYNC_EXCEPTION,
 				ssync_esr, 0);
 	}
+	return reset_gpc;
+}
 
+static u32 gr_gm20b_intr_check_gr_mme_exception(struct gk20a *g,
+				u32 exception)
+{
 	if ((exception & gr_exception_mme_m()) != 0U) {
 		u32 mme = nvgpu_readl(g, gr_mme_hww_esr_r());
 		u32 info = nvgpu_readl(g, gr_mme_hww_esr_info_r());
@@ -222,9 +162,14 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
 
 		nvgpu_writel(g, gr_mme_hww_esr_r(),
 				gr_mme_hww_esr_reset_active_f());
-		gpc_reset = true;
+		return 1U;
 	}
+	return 0U;
+}
 
+static u32 gr_gm20b_intr_check_gr_sked_exception(struct gk20a *g,
+				u32 exception)
+{
 	if ((exception & gr_exception_sked_m()) != 0U) {
 		u32 sked =
 			nvgpu_readl(g, gr_sked_hww_esr_r());
@@ -234,15 +179,120 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
 		nvgpu_err(g, "sked exception: esr 0x%08x", sked);
 		nvgpu_writel(g, gr_sked_hww_esr_r(),
 				gr_sked_hww_esr_reset_active_f());
-		gpc_reset = true;
+		return 1U;
 	}
+	return 0U;
+}
+
+static u32 gr_gm20b_intr_check_gr_fe_exception(struct gk20a *g, u32 exception)
+{
+	if ((exception & gr_exception_fe_m()) != 0U) {
+		u32 fe = nvgpu_readl(g, gr_fe_hww_esr_r());
+		u32 info = nvgpu_readl(g, gr_fe_hww_esr_info_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_FE_EXCEPTION,
+				fe, 0);
+		nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
+				fe, info);
+		nvgpu_writel(g, gr_fe_hww_esr_r(),
+				gr_fe_hww_esr_reset_active_f());
+		return 1U;
+	}
+	return 0U;
+}
+
+static u32 gr_gm20b_intr_check_gr_memfmt_exception(struct gk20a *g,
+				u32 exception)
+{
+	if ((exception & gr_exception_memfmt_m()) != 0U) {
+		u32 memfmt = nvgpu_readl(g, gr_memfmt_hww_esr_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_MEMFMT_EXCEPTION,
+				memfmt, 0);
+		nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
+		nvgpu_writel(g, gr_memfmt_hww_esr_r(),
+				gr_memfmt_hww_esr_reset_active_f());
+		return 1U;
+	}
+	return 0U;
+}
+
+static u32 gr_gm20b_intr_check_gr_pd_exception(struct gk20a *g,
+				u32 exception)
+{
+	if ((exception & gr_exception_pd_m()) != 0U) {
+		u32 pd = nvgpu_readl(g, gr_pd_hww_esr_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_PD_EXCEPTION,
+				pd, 0);
+		nvgpu_err(g, "pd exception: esr 0x%08x", pd);
+		nvgpu_writel(g, gr_pd_hww_esr_r(),
+				gr_pd_hww_esr_reset_active_f());
+		return 1U;
+	}
+	return 0U;
+}
+
+static u32 gr_gm20b_intr_check_gr_scc_exception(struct gk20a *g,
+				u32 exception)
+{
+	if ((exception & gr_exception_scc_m()) != 0U) {
+		u32 scc = nvgpu_readl(g, gr_scc_hww_esr_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_SCC_EXCEPTION,
+				scc, 0);
+		nvgpu_err(g, "scc exception: esr 0x%08x", scc);
+		nvgpu_writel(g, gr_scc_hww_esr_r(),
+				gr_scc_hww_esr_reset_active_f());
+		return 1U;
+	}
+	return 0U;
+}
+
+static u32 gr_gm20b_intr_check_gr_ds_exception(struct gk20a *g,
+				u32 exception)
+{
+	if ((exception & gr_exception_ds_m()) != 0U) {
+		u32 ds = nvgpu_readl(g, gr_ds_hww_esr_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_DS_EXCEPTION,
+				ds, 0);
+		nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
+		nvgpu_writel(g, gr_ds_hww_esr_r(),
+				gr_ds_hww_esr_reset_task_f());
+		return 1U;
+	}
+	return 0U;
+}
+
+bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
+{
+	u32 gpc_reset = 0U;
+	u32 exception = nvgpu_readl(g, gr_exception_r());
+
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+				"exception %08x\n", exception);
+
+	gpc_reset = gr_gm20b_intr_check_gr_fe_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_memfmt_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_pd_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_scc_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_ds_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_ssync_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_mme_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_sked_exception(g, exception);
 
 	/* check if a gpc exception has occurred */
 	if ((exception & gr_exception_gpc_m()) != 0U) {
 		*is_gpc_exception = true;
 	}
 
-	return gpc_reset;
+	return (gpc_reset != 0U) ? true : false;
 }
 
 u32 gm20b_gr_intr_read_gpc_tpc_exception(u32 gpc_exception)
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
index c129557be..ed48c9bea 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
@@ -368,6 +368,44 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
 			gr_pri_gpc0_gcc_l15_ecc_status_reset_task_f());
 }
 
+static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
+		u32 ecc_status, u32 gpc,
+		u32 correct_err, u32 uncorrect_err)
+{
+	if ((ecc_status &
+		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) !=
+									0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
+				GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED,
+				0, correct_err);
+		nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
+	}
+	if ((ecc_status &
+		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
+									0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
+				GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
+				0, uncorrect_err);
+		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
+	}
+	if ((ecc_status &
+		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) !=
+									0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
+				GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED,
+				0, correct_err);
+		nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
+	}
+	if ((ecc_status &
+		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
+									0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
+				GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
+				0, uncorrect_err);
+		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
+	}
+}
+
 void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
 		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err)
 {
@@ -430,10 +468,12 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
 	if (corrected_overflow != 0U) {
 		corrected_delta = nvgpu_safe_add_u32(corrected_delta,
 			BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()));
+		nvgpu_info(g, "mmu l1tlb ecc counter corrected overflow!");
 	}
 	if (uncorrected_overflow != 0U) {
 		uncorrected_delta = nvgpu_safe_add_u32(uncorrected_delta,
 			BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()));
+		nvgpu_info(g, "mmu l1tlb ecc counter uncorrected overflow!");
 	}
 
 	*corrected_err = nvgpu_safe_add_u32(*corrected_err, corrected_delta);
@@ -443,41 +483,8 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
 	nvgpu_log(g, gpu_dbg_intr,
 		"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
 
-	if ((ecc_status &
-		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) !=
-									0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
-				GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED,
-				0, (u32)*corrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
-	}
-	if ((ecc_status &
-		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
-									0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
-				GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
-				0, (u32)*uncorrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
-	}
-	if ((ecc_status &
-		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) !=
-									0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
-				GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED,
-				0, (u32)*corrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
-	}
-	if ((ecc_status &
-		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
-									0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
-				GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
-				0, (u32)*uncorrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
-	}
-	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
-		nvgpu_info(g, "mmu l1tlb ecc counter overflow!");
-	}
+	gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc,
+			(u32)*corrected_err, (u32)*uncorrected_err);
 
 	nvgpu_log(g, gpu_dbg_intr,
 		"ecc error address: 0x%x", ecc_addr);
@@ -486,6 +493,39 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
 		(u32)*corrected_err, (u32)*uncorrected_err);
 }
 
+static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g,
+		u32 ecc_status, u32 ecc_addr, u32 gpc,
+		u32 correct_err, u32 uncorrect_err)
+{
+	if ((ecc_status &
+		gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
+			gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
+			ecc_addr, correct_err);
+		nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
+	}
+	if ((ecc_status &
+		gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
+			gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
+			ecc_addr, uncorrect_err);
+		nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
+	}
+	if ((ecc_status &
+		gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
+			gpc, GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED,
+			ecc_addr, correct_err);
+		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
+	}
+	if ((ecc_status &
+		gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
+			gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
+			ecc_addr, uncorrect_err);
+		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
+	}
+}
 void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
 		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err)
 {
@@ -553,34 +593,9 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
 	nvgpu_log(g, gpu_dbg_intr,
 		"gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
 
-	if ((ecc_status &
-		gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
-			gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
-			ecc_addr, (u32)*corrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
-	}
-	if ((ecc_status &
-		gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
-			gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
-			ecc_addr, (u32)*uncorrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
-	}
-	if ((ecc_status &
-		gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
-			gpc, GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED,
-			ecc_addr, (u32)*corrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
-	}
-	if ((ecc_status &
-		gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
-		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
-			gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
-			ecc_addr, (u32)*uncorrected_err);
-		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
-	}
+	gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc,
+			(u32)*corrected_err, (u32)*uncorrected_err);
+
 	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
 		nvgpu_info(g, "gpccs ecc counter overflow!");
 	}
@@ -746,6 +761,60 @@ void gv11b_gr_intr_set_hww_esr_report_mask(struct gk20a *g)
 		gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_multiple_warp_errors_report_f());
 }
 
+static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g,
+		u32 l1_tag_ecc_status, u32 gpc, u32 tpc)
+{
+	if ((l1_tag_ecc_status &
+		(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
+				g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+	if ((l1_tag_ecc_status &
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0,
+				g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+	if ((l1_tag_ecc_status &
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0,
+				g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+}
+
+static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g,
+		u32 l1_tag_ecc_status, u32 gpc, u32 tpc)
+{
+	if ((l1_tag_ecc_status &
+		(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_L1_TAG_ECC_CORRECTED, 0,
+				g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+	if ((l1_tag_ecc_status &
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0,
+				g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+	if ((l1_tag_ecc_status &
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0,
+				g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+}
+
 static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
 		bool *post_event, struct nvgpu_channel *fault_ch,
 		u32 *hww_global_esr)
@@ -813,28 +882,7 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
 			nvgpu_safe_add_u32(
 			g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter,
 			l1_tag_corrected_err_count_delta);
-		if ((l1_tag_ecc_status &
-			(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
-			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_L1_TAG_ECC_CORRECTED, 0,
-					g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
-		}
-		if ((l1_tag_ecc_status &
-			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0,
-					g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
-		}
-		if ((l1_tag_ecc_status &
-			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0,
-					g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
-		}
+		gv11b_gr_intr_report_l1_tag_corrected_err(g, l1_tag_ecc_status, gpc, tpc);
 		nvgpu_writel(g, nvgpu_safe_add_u32(
 			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(), offset),
 			0);
@@ -855,28 +903,7 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
 			nvgpu_safe_add_u32(
 			g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter,
 			l1_tag_uncorrected_err_count_delta);
-		if ((l1_tag_ecc_status &
-			(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
-			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
-					g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
-		if ((l1_tag_ecc_status &
-			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0,
-					g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
-		if ((l1_tag_ecc_status &
-			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0,
-					g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
+		gv11b_gr_intr_report_l1_tag_uncorrected_err(g, l1_tag_ecc_status, gpc, tpc);
 		nvgpu_writel(g, nvgpu_safe_add_u32(
 			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(), offset),
 			0);
@@ -1203,6 +1230,72 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
 			gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f());
 }
 
+static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g,
+		u32 icache_ecc_status, u32 gpc, u32 tpc)
+{
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
+	}
+}
+
+static void gv11b_gr_intr_report_icache_corrected_err(struct gk20a *g,
+		u32 icache_ecc_status, u32 gpc, u32 tpc)
+{
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+	if ((icache_ecc_status &
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) {
+		(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+				(gpc << 8) | tpc,
+				GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED,
+				0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
+	}
+}
+
 static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
 		bool *post_event, struct nvgpu_channel *fault_ch,
 		u32 *hww_global_esr)
@@ -1272,34 +1365,7 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
 		nvgpu_writel(g, nvgpu_safe_add_u32(
 			gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(), offset),
 			0);
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
-		}
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
-		}
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
-		}
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter);
-		}
+		gv11b_gr_intr_report_icache_corrected_err(g, icache_ecc_status, gpc, tpc);
 	}
 	if ((icache_uncorrected_err_count_delta > 0U) || is_icache_ecc_uncorrected_total_err_overflow) {
 		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
@@ -1320,38 +1386,11 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
 		nvgpu_writel(g, nvgpu_safe_add_u32(
 			gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(), offset),
 			0);
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
-		if ((icache_ecc_status &
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) {
-			(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
-					(gpc << 8) | tpc,
-					GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,
-					0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
-		}
+		gv11b_gr_intr_report_icache_uncorrected_err(g, icache_ecc_status, gpc, tpc);
 	}
 
 	nvgpu_writel(g, nvgpu_safe_add_u32(
-			gr_pri_gpc0_tpc0_sm_icache_ecc_status_r(), offset),
+		gr_pri_gpc0_tpc0_sm_icache_ecc_status_r(), offset),
 		gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f());
 }
 
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c
index 5bcda8c7c..20745927b 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_tu104.c
@@ -158,23 +158,8 @@ void tu104_gr_intr_enable_gpc_exceptions(struct gk20a *g,
 			gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1U)));
 }
 
-void tu104_gr_intr_log_mme_exception(struct gk20a *g)
+static void gr_tu104_check_dma_exception(struct gk20a *g, u32 mme_hww_esr)
 {
-	u32 mme_hww_esr = nvgpu_readl(g, gr_mme_hww_esr_r());
-	u32 mme_hww_info = nvgpu_readl(g, gr_mme_hww_esr_info_r());
-
-	if ((mme_hww_esr &
-		gr_mme_hww_esr_missing_macro_data_pending_f()) != 0U) {
-		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-			"GR MME EXCEPTION: MISSING_MACRO_DATA");
-	}
-
-	if ((mme_hww_esr &
-		gr_mme_hww_esr_illegal_mme_method_pending_f()) != 0U) {
-		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-			"GR MME EXCEPTION: ILLEGAL_MME_METHOD");
-	}
-
 	if ((mme_hww_esr &
 		gr_mme_hww_esr_dma_dram_access_pending_f()) != 0U) {
 		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
@@ -199,6 +184,45 @@
 			"GR MME EXCEPTION: DMA_FIFO_RESIZED_WHEN_NONIDLE");
 	}
 
+	if ((mme_hww_esr & gr_mme_hww_esr_dma_read_pb_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DMA_READ_FIFOED_FROM_PB");
+	}
+}
+
+static void gr_tu104_check_ram_access_exception(struct gk20a *g, u32 mme_hww_esr)
+{
+	if ((mme_hww_esr & gr_mme_hww_esr_inst_ram_acess_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: INSTR_RAM_ACCESS_OUT_OF_BOUNDS");
+	}
+
+	if ((mme_hww_esr & gr_mme_hww_esr_data_ram_access_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DATA_RAM_ACCESS_OUT_OF_BOUNDS");
+	}
+}
+
+void tu104_gr_intr_log_mme_exception(struct gk20a *g)
+{
+	u32 mme_hww_esr = nvgpu_readl(g, gr_mme_hww_esr_r());
+	u32 mme_hww_info = nvgpu_readl(g, gr_mme_hww_esr_info_r());
+
+	gr_tu104_check_dma_exception(g, mme_hww_esr);
+	gr_tu104_check_ram_access_exception(g, mme_hww_esr);
+
+	if ((mme_hww_esr &
+		gr_mme_hww_esr_missing_macro_data_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: MISSING_MACRO_DATA");
+	}
+
+	if ((mme_hww_esr &
+		gr_mme_hww_esr_illegal_mme_method_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: ILLEGAL_MME_METHOD");
+	}
+
 	if ((mme_hww_esr & gr_mme_hww_esr_illegal_opcode_pending_f()) != 0U) {
 		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
 			"GR MME EXCEPTION: ILLEGAL_OPCODE");
@@ -209,21 +233,6 @@ void tu104_gr_intr_log_mme_exception(struct gk20a *g)
 			"GR MME EXCEPTION: BRANCH_IN_DELAY_SHOT");
 	}
 
-	if ((mme_hww_esr & gr_mme_hww_esr_inst_ram_acess_pending_f()) != 0U) {
-		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-			"GR MME EXCEPTION: INSTR_RAM_ACCESS_OUT_OF_BOUNDS");
-	}
-
-	if ((mme_hww_esr & gr_mme_hww_esr_data_ram_access_pending_f()) != 0U) {
-		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-			"GR MME EXCEPTION: DATA_RAM_ACCESS_OUT_OF_BOUNDS");
-	}
-
-	if ((mme_hww_esr & gr_mme_hww_esr_dma_read_pb_pending_f()) != 0U) {
-		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-			"GR MME EXCEPTION: DMA_READ_FIFOED_FROM_PB");
-	}
-
 	if (gr_mme_hww_esr_info_pc_valid_v(mme_hww_info) == 0x1U) {
 		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
 			"GR MME EXCEPTION: INFO2 0x%x, INFO3 0x%x, INFO4 0x%x",
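The same refactoring shape repeats across all three files: a long handler is split into small static helpers, each of which checks one status bit, logs and clears it, and reports whether a reset is needed as 0U/1U instead of writing a shared bool; the caller then ORs the helper results together and converts to bool exactly once. A minimal standalone sketch of that shape follows — the names (FAKE_FE_MASK, check_fe, handle_exceptions) are illustrative stand-ins, not the nvgpu APIs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the gr_exception_*_m() bit masks. */
#define FAKE_FE_MASK  0x1U
#define FAKE_MME_MASK 0x2U

/* One helper per unit, mirroring gr_gm20b_intr_check_gr_*_exception():
 * check the bit, log it (the driver also clears the ESR here), and
 * report the reset request as 0U/1U instead of mutating shared state. */
static uint32_t check_fe(uint32_t exception)
{
	if ((exception & FAKE_FE_MASK) != 0U) {
		printf("fe exception\n");
		return 1U;
	}
	return 0U;
}

static uint32_t check_mme(uint32_t exception)
{
	if ((exception & FAKE_MME_MASK) != 0U) {
		printf("mme exception\n");
		return 1U;
	}
	return 0U;
}

/* The caller ORs the u32 results and converts to bool once at the end,
 * like the rewritten gm20b_gr_intr_handle_exceptions(). */
static bool handle_exceptions(uint32_t exception)
{
	uint32_t gpc_reset = 0U;

	gpc_reset = check_fe(exception);
	gpc_reset |= check_mme(exception);

	return (gpc_reset != 0U) ? true : false;
}

int main(void)
{
	printf("reset needed: %d\n", (int)handle_exceptions(FAKE_MME_MASK));
	return 0;
}

Keeping each helper free of writes to a shared accumulator is what lets every helper end in a single return statement, which is presumably the motivation for the patch (smaller, single-exit functions in the MISRA style), at the cost of one extra u32-to-bool conversion in the caller.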