diff --git a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c index af4d41075..c20a9a2e6 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c @@ -213,8 +213,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .set_ctxsw_preemption_mode = vgpu_gr_set_ctxsw_preemption_mode, .is_etpc_addr = gv11b_gr_pri_is_etpc_addr, .egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table, - .handle_gpc_gpcmmu_exception = - gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, @@ -419,6 +417,8 @@ static const struct gpu_ops vgpu_gv11b_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpcmmu_exception = + gv11b_gr_intr_handle_gpc_gpcmmu_exception, .handle_gpc_gpccs_exception = gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 177a417f9..4cd4e3ff0 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -1921,10 +1921,11 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event, } /* Handle GPCMMU exceptions */ - if (g->ops.gr.handle_gpc_gpcmmu_exception != NULL) { - tmp_ret = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc, - gpc_exception); - ret = (ret != 0) ? 
ret : tmp_ret; + if (g->ops.gr.intr.handle_gpc_gpcmmu_exception != NULL) { + g->ops.gr.intr.handle_gpc_gpcmmu_exception(g, gpc, + gpc_exception, + &g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter, + &g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); } } diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index b94b333c1..209eb7f33 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -457,8 +457,6 @@ static const struct gpu_ops gv100_ops = { .set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode, .is_etpc_addr = gv11b_gr_pri_is_etpc_addr, .egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table, - .handle_gpc_gpcmmu_exception = - gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, @@ -705,6 +703,8 @@ static const struct gpu_ops gv100_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpcmmu_exception = + gv11b_gr_intr_handle_gpc_gpcmmu_exception, .handle_gpc_gpccs_exception = gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index c11900acd..8befc11f7 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -819,129 +819,6 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, return 0; } -static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, - u32 exception) -{ - int ret = 0; - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 offset = gpc_stride * gpc; - u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; - u32 corrected_delta, uncorrected_delta; - u32 corrected_overflow, uncorrected_overflow; - u32 hww_esr; - - hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset); - - if 
((hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() | - gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())) == 0U) { - return ret; - } - - ecc_status = gk20a_readl(g, - gr_gpc0_mmu_l1tlb_ecc_status_r() + offset); - ecc_addr = gk20a_readl(g, - gr_gpc0_mmu_l1tlb_ecc_address_r() + offset); - corrected_cnt = gk20a_readl(g, - gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset); - uncorrected_cnt = gk20a_readl(g, - gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset); - - corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v( - corrected_cnt); - uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v( - uncorrected_cnt); - corrected_overflow = ecc_status & - gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m(); - - uncorrected_overflow = ecc_status & - gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m(); - - - /* clear the interrupt */ - if ((corrected_delta > 0U) || (corrected_overflow != 0U)) { - gk20a_writel(g, - gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + - offset, 0); - } - if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) { - gk20a_writel(g, - gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + - offset, 0); - } - - gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset, - gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f()); - - /* Handle overflow */ - if (corrected_overflow != 0U) { - corrected_delta += - BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()); - } - if (uncorrected_overflow != 0U) { - uncorrected_delta += - BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); - } - - g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter += - corrected_delta; - g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter += - uncorrected_delta; - nvgpu_log(g, gpu_dbg_intr, - "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); - - if ((ecc_status & - gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, 
NVGPU_ERR_MODULE_MMU, gpc, 0, - GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED, - 0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); - } - if ((ecc_status & - gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); - } - if ((ecc_status & - gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, - GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED, - 0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); - } - if ((ecc_status & - gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); - } - if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { - nvgpu_info(g, "mmu l1tlb ecc counter overflow!"); - } - - nvgpu_log(g, gpu_dbg_intr, - "ecc error address: 0x%x", ecc_addr); - nvgpu_log(g, gpu_dbg_intr, - "ecc error count corrected: %d, uncorrected %d", - g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter, - g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); - - return ret; -} - - -int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, - u32 gpc_exception) -{ - if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m()) != 0U) { - return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc, - gpc_exception); - } - return 0; -} - void gr_gv11b_set_go_idle_timeout(struct gk20a *g, u32 data) { gk20a_writel(g, gr_fe_go_idle_timeout_r(), data); diff --git 
a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 9561ea00e..a65b418d5 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -84,8 +84,6 @@ int gr_gv11b_handle_tpc_sm_ecc_exception(struct gk20a *g, int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr); -int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, - u32 gpc_exception); void gr_gv11b_enable_gpc_exceptions(struct gk20a *g); int gr_gv11b_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index 208e87a1e..7f811e7dc 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -410,8 +410,6 @@ static const struct gpu_ops gv11b_ops = { .set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode, .is_etpc_addr = gv11b_gr_pri_is_etpc_addr, .egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table, - .handle_gpc_gpcmmu_exception = - gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, @@ -664,6 +662,8 @@ static const struct gpu_ops gv11b_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpcmmu_exception = + gv11b_gr_intr_handle_gpc_gpcmmu_exception, .handle_gpc_gpccs_exception = gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c index 660e11e9d..764091bbd 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c @@ -31,6 +31,120 @@ #include /* Handles a GPCMMU exception for one GPC. Returns early, before touching the L1TLB ECC registers or the totals, unless the gpcmmu bit is set in gpc_exception and the GPCMMU global ESR flags a corrected or uncorrected ECC error. Otherwise it reads the L1TLB ECC status, error address and both error counters, clears them, adds the new counts to the totals behind corrected_err and uncorrected_err, and reports every SA and FA data error flagged in the status. Both pointers are dereferenced without a NULL check. */+void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, + u32 gpc_exception, u32
*corrected_err, u32 *uncorrected_err) +{ + u32 offset = nvgpu_gr_gpc_offset(g, gpc); + u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; + u32 corrected_delta, uncorrected_delta; + u32 corrected_overflow, uncorrected_overflow; + u32 hww_esr; + /* Bail out unless the GPCMMU bit is set in this exception mask. */+ if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m()) == 0U) { + return; + } + /* Read the GPCMMU global ESR at the register offset of this GPC. */+ hww_esr = nvgpu_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset); + /* Only the ECC corrected and ECC uncorrected ESR bits are handled here. */+ if ((hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() | + gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())) == 0U) { + return; + } + /* Capture status, error address and both counters before any of them is cleared below. */+ ecc_status = nvgpu_readl(g, + gr_gpc0_mmu_l1tlb_ecc_status_r() + offset); + ecc_addr = nvgpu_readl(g, + gr_gpc0_mmu_l1tlb_ecc_address_r() + offset); + corrected_cnt = nvgpu_readl(g, + gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset); + uncorrected_cnt = nvgpu_readl(g, + gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset); + /* Derive the deltas from the counter values via the _total_v() accessors and isolate the overflow bits of the status. */+ corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v( + corrected_cnt); + uncorrected_delta = + gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v( + uncorrected_cnt); + corrected_overflow = ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m(); + + uncorrected_overflow = ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m(); + + /* clear the interrupt: zero each counter register that holds a count or has overflowed */ + if ((corrected_delta > 0U) || (corrected_overflow != 0U)) { + nvgpu_writel(g, + gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + + offset, 0); + } + if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) { + nvgpu_writel(g, + gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + + offset, 0); + } + /* Write the reset_task value to the ECC status register. NOTE(review): presumably this clears the latched status bits - confirm against the register manual. */+ nvgpu_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset, + gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f()); + + /* Handle overflow: add BIT32 of the _total_s() value for each overflowed counter. NOTE(review): presumably one full wrap of the hardware count field - confirm. */ + if (corrected_overflow != 0U) { + corrected_delta += + BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()); + } + if (uncorrected_overflow != 0U) { + uncorrected_delta += +
BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); + } + /* Fold the deltas into the running totals supplied by the caller. */+ *corrected_err += corrected_delta; + *uncorrected_err += uncorrected_delta; + + nvgpu_log(g, gpu_dbg_intr, + "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); + /* Report each error type flagged in the captured status together with the updated running total. */+ if ((ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != + 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED, + 0, (u32)*corrected_err); + nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); + } + if ((ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != + 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, + 0, (u32)*uncorrected_err); + nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); + } + if ((ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != + 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED, + 0, (u32)*corrected_err); + nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); + } + if ((ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != + 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, + 0, (u32)*uncorrected_err); + nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); + } + if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { + nvgpu_info(g, "mmu l1tlb ecc counter overflow!"); + } + /* Debug trace of the captured error address and the updated totals. */+ nvgpu_log(g, gpu_dbg_intr, + "ecc error address: 0x%x", ecc_addr); + nvgpu_log(g, gpu_dbg_intr, + "ecc error count corrected: %d, uncorrected %d", + (u32)*corrected_err, (u32)*uncorrected_err); +} + void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err) { diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h index
3939a19a6..3f156494a 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h @@ -28,6 +28,8 @@ struct gk20a; struct nvgpu_gr_config; +void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, + u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err); void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err); void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 6ae1a0d23..bdfe16121 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -374,8 +374,6 @@ struct gpu_ops { int (*handle_gcc_exception)(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr); - int (*handle_gpc_gpcmmu_exception)(struct gk20a *g, u32 gpc, - u32 gpc_exception); int (*init_ecc)(struct gk20a *g); u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g); int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc, @@ -782,6 +780,9 @@ struct gpu_ops { } init; struct { + void (*handle_gpc_gpcmmu_exception)(struct gk20a *g, + u32 gpc, u32 gpc_exception, + u32 *corrected_err, u32 *uncorrected_err); void (*handle_gpc_gpccs_exception)(struct gk20a *g, u32 gpc, u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err); diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c index a66d50e2c..725103de7 100644 --- a/drivers/gpu/nvgpu/tu104/hal_tu104.c +++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c @@ -479,8 +479,6 @@ static const struct gpu_ops tu104_ops = { .set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode, .is_etpc_addr = gv11b_gr_pri_is_etpc_addr, .egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table, - .handle_gpc_gpcmmu_exception = - gr_gv11b_handle_gpc_gpcmmu_exception, 
.get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, .access_smpc_reg = gv11b_gr_access_smpc_reg, @@ -738,6 +736,8 @@ static const struct gpu_ops tu104_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpcmmu_exception = + gv11b_gr_intr_handle_gpc_gpcmmu_exception, .handle_gpc_gpccs_exception = gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception,