diff --git a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c index a3b2cf9c6..ad054bfd7 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c @@ -213,8 +213,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, - .handle_gpc_gpccs_exception = - gr_gv11b_handle_gpc_gpccs_exception, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, .handle_gcc_exception = gr_gv11b_handle_gcc_exception, @@ -416,6 +414,8 @@ static const struct gpu_ops vgpu_gv11b_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpccs_exception = + gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, .handle_tpc_mpc_exception = gv11b_gr_intr_handle_tpc_mpc_exception, diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 3aaae8dec..bdbaf09ab 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -2086,10 +2086,11 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event, } /* Handle GPCCS exceptions */ - if (g->ops.gr.handle_gpc_gpccs_exception != NULL) { - tmp_ret = g->ops.gr.handle_gpc_gpccs_exception(g, gpc, - gpc_exception); - ret = (ret != 0) ? 
ret : tmp_ret; + if (g->ops.gr.intr.handle_gpc_gpccs_exception != NULL) { + g->ops.gr.intr.handle_gpc_gpccs_exception(g, gpc, + gpc_exception, + &g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter, + &g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); } /* Handle GPCMMU exceptions */ diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index fcc130cb1..f15abc921 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -458,8 +458,6 @@ static const struct gpu_ops gv100_ops = { gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, - .handle_gpc_gpccs_exception = - gr_gv11b_handle_gpc_gpccs_exception, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, .handle_gcc_exception = gr_gv11b_handle_gcc_exception, @@ -706,6 +704,8 @@ static const struct gpu_ops gv100_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpccs_exception = + gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, .handle_tpc_mpc_exception = gv11b_gr_intr_handle_tpc_mpc_exception, diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 2ec2a51df..c76a1d4e1 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -930,108 +930,6 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, return ret; } -static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, - u32 exception) -{ - int ret = 0; - u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); - u32 offset = gpc_stride * gpc; - u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; - u32 corrected_delta, uncorrected_delta; - u32 corrected_overflow, uncorrected_overflow; - u32 hww_esr; - - hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset); - - if ((hww_esr & 
(gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() | - gr_gpc0_gpccs_hww_esr_ecc_corrected_m())) == 0U) { - return ret; - } - - ecc_status = gk20a_readl(g, - gr_gpc0_gpccs_falcon_ecc_status_r() + offset); - ecc_addr = gk20a_readl(g, - gr_gpc0_gpccs_falcon_ecc_address_r() + offset); - corrected_cnt = gk20a_readl(g, - gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset); - uncorrected_cnt = gk20a_readl(g, - gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset); - - corrected_delta = gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v( - corrected_cnt); - uncorrected_delta = gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v( - uncorrected_cnt); - corrected_overflow = ecc_status & - gr_gpc0_gpccs_falcon_ecc_status_corrected_err_total_counter_overflow_m(); - - uncorrected_overflow = ecc_status & - gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_total_counter_overflow_m(); - - - /* clear the interrupt */ - if ((corrected_delta > 0U) || (corrected_overflow != 0U)) { - gk20a_writel(g, - gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + - offset, 0); - } - if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) { - gk20a_writel(g, - gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + - offset, 0); - } - - gk20a_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset, - gr_gpc0_gpccs_falcon_ecc_status_reset_task_f()); - - g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter += - corrected_delta; - g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter += - uncorrected_delta; - nvgpu_log(g, gpu_dbg_intr, - "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); - if ((ecc_status & - gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, - GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, - ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); - } - if ((ecc_status & - gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) 
{ - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, - GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, - ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); - } - if ((ecc_status & - gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, - GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED, - ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); - } - if ((ecc_status & - gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, - GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, - ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); - nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); - } - if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { - nvgpu_info(g, "gpccs ecc counter overflow!"); - } - - nvgpu_log(g, gpu_dbg_intr, - "ecc error row address: 0x%x", - gr_gpc0_gpccs_falcon_ecc_address_row_address_v(ecc_addr)); - - nvgpu_log(g, gpu_dbg_intr, - "ecc error count corrected: %d, uncorrected %d", - g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter, - g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); - - return ret; -} int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, u32 gpc_exception) @@ -1043,17 +941,6 @@ int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, return 0; } -int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, - u32 gpc_exception) -{ - if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_gpccs_m()) != 0U) { - return gr_gv11b_handle_gpccs_ecc_exception(g, gpc, - gpc_exception); - } - - return 0; -} - void gr_gv11b_set_go_idle_timeout(struct gk20a *g, u32 data) { gk20a_writel(g, gr_fe_go_idle_timeout_r(), data); diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 
4a970f1d3..9561ea00e 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -86,8 +86,6 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 *hww_global_esr); int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, u32 gpc_exception); -int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, - u32 gpc_exception); void gr_gv11b_enable_gpc_exceptions(struct gk20a *g); int gr_gv11b_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index 75aa36290..2fa573d09 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -411,8 +411,6 @@ static const struct gpu_ops gv11b_ops = { gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, - .handle_gpc_gpccs_exception = - gr_gv11b_handle_gpc_gpccs_exception, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, .handle_gcc_exception = gr_gv11b_handle_gcc_exception, @@ -665,6 +663,8 @@ static const struct gpu_ops gv11b_ops = { gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpccs_exception = + gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, .handle_tpc_mpc_exception = gv11b_gr_intr_handle_tpc_mpc_exception, diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c index b5af82d6b..660e11e9d 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c @@ -31,6 +31,110 @@ #include +void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, + u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err) +{ + u32 offset = nvgpu_gr_gpc_offset(g, gpc); + u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; + u32 
corrected_delta, uncorrected_delta; + u32 corrected_overflow, uncorrected_overflow; + u32 hww_esr; + + if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_gpccs_m()) == 0U) { + return; + } + + hww_esr = nvgpu_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset); + + if ((hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() | + gr_gpc0_gpccs_hww_esr_ecc_corrected_m())) == 0U) { + return; + } + + ecc_status = nvgpu_readl(g, + gr_gpc0_gpccs_falcon_ecc_status_r() + offset); + ecc_addr = nvgpu_readl(g, + gr_gpc0_gpccs_falcon_ecc_address_r() + offset); + corrected_cnt = nvgpu_readl(g, + gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset); + uncorrected_cnt = nvgpu_readl(g, + gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset); + + corrected_delta = + gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v( + corrected_cnt); + uncorrected_delta = + gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v( + uncorrected_cnt); + corrected_overflow = ecc_status & + gr_gpc0_gpccs_falcon_ecc_status_corrected_err_total_counter_overflow_m(); + + uncorrected_overflow = ecc_status & + gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_total_counter_overflow_m(); + + + /* clear the interrupt */ + if ((corrected_delta > 0U) || (corrected_overflow != 0U)) { + nvgpu_writel(g, + gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + + offset, 0); + } + if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) { + nvgpu_writel(g, + gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + + offset, 0); + } + + nvgpu_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset, + gr_gpc0_gpccs_falcon_ecc_status_reset_task_f()); + + *corrected_err += corrected_delta; + *uncorrected_err += uncorrected_delta; + + nvgpu_log(g, gpu_dbg_intr, + "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); + + if ((ecc_status & + gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, + ecc_addr,
(u32)*corrected_err); + nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); + } + if ((ecc_status & + gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, (u32)*uncorrected_err); + nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); + } + if ((ecc_status & + gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED, + ecc_addr, (u32)*corrected_err); + nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); + } + if ((ecc_status & + gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, (u32)*uncorrected_err); + nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); + } + if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { + nvgpu_info(g, "gpccs ecc counter overflow!"); + } + + nvgpu_log(g, gpu_dbg_intr, + "ecc error row address: 0x%x", + gr_gpc0_gpccs_falcon_ecc_address_row_address_v(ecc_addr)); + + nvgpu_log(g, gpu_dbg_intr, + "ecc error count corrected: %d, uncorrected %d", + (u32)*corrected_err, (u32)*uncorrected_err); +} + void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc) { u32 esr; @@ -80,7 +184,7 @@ void gv11b_gr_intr_enable_hww_exceptions(struct gk20a *g) /* For now leave POR values */ nvgpu_log(g, gpu_dbg_info, "gr_sked_hww_esr_en_r 0x%08x", - gk20a_readl(g, gr_sked_hww_esr_en_r())); + nvgpu_readl(g, gr_sked_hww_esr_en_r())); } void gv11b_gr_intr_enable_exceptions(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h index 5b7c4cf3c..3939a19a6 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h @@ -28,6 +28,8 @@ 
struct gk20a; struct nvgpu_gr_config; +void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, + u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err); void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc); void gv11b_gr_intr_enable_hww_exceptions(struct gk20a *g); void gv11b_gr_intr_enable_exceptions(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 89a3b364f..537397c4e 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -374,8 +374,6 @@ struct gpu_ops { int (*handle_gcc_exception)(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr); - int (*handle_gpc_gpccs_exception)(struct gk20a *g, u32 gpc, - u32 gpc_exception); int (*handle_gpc_gpcmmu_exception)(struct gk20a *g, u32 gpc, u32 gpc_exception); int (*init_ecc)(struct gk20a *g); @@ -782,6 +780,9 @@ struct gpu_ops { } init; struct { + void (*handle_gpc_gpccs_exception)(struct gk20a *g, + u32 gpc, u32 gpc_exception, + u32 *corrected_err, u32 *uncorrected_err); u32 (*get_tpc_exception)(struct gk20a *g, u32 offset, struct nvgpu_gr_tpc_exception *pending_tpc); void (*handle_tpc_mpc_exception)(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c index dfb32acd4..3900d8a46 100644 --- a/drivers/gpu/nvgpu/tu104/hal_tu104.c +++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c @@ -480,8 +480,6 @@ static const struct gpu_ops tu104_ops = { gr_gv11b_handle_gpc_gpcmmu_exception, .get_egpc_base = gv11b_gr_get_egpc_base, .get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num, - .handle_gpc_gpccs_exception = - gr_gv11b_handle_gpc_gpccs_exception, .access_smpc_reg = gv11b_gr_access_smpc_reg, .is_egpc_addr = gv11b_gr_pri_is_egpc_addr, .handle_gcc_exception = gr_gv11b_handle_gcc_exception, @@ -739,6 +737,8 @@ static const struct gpu_ops tu104_ops = { 
gv11b_gr_init_commit_gfxp_wfi_timeout, }, .intr = { + .handle_gpc_gpccs_exception = + gv11b_gr_intr_handle_gpc_gpccs_exception, .get_tpc_exception = gm20b_gr_intr_get_tpc_exception, .handle_tpc_mpc_exception = gv11b_gr_intr_handle_tpc_mpc_exception,