diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c index 6d86625ec..30d081f34 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c @@ -236,117 +236,131 @@ int gv11b_pmu_bootstrap(struct gk20a *g, struct nvgpu_pmu *pmu, return err; } -void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) +static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) +{ + if ((ecc_status & + pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { + nvgpu_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_IMEM_ECC_CORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); + nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); + } + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { + nvgpu_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); + nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); + } + if ((ecc_status & + pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { + nvgpu_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_DMEM_ECC_CORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); + nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); + } + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { + nvgpu_pmu_report_ecc_error(g, 0, + GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); + nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); + } +} + +static void gv11b_pmu_handle_ecc_irq(struct gk20a *g) { u32 intr1; u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; u32 corrected_delta, uncorrected_delta; u32 corrected_overflow, uncorrected_overflow; + intr1 = gk20a_readl(g, pwr_pmu_ecc_intr_status_r()); + if ((intr1 & + (pwr_pmu_ecc_intr_status_corrected_m() | + pwr_pmu_ecc_intr_status_uncorrected_m())) == 0U) { + return; + } + + ecc_status = gk20a_readl(g, + pwr_pmu_falcon_ecc_status_r()); + ecc_addr = gk20a_readl(g, + pwr_pmu_falcon_ecc_address_r()); + corrected_cnt = gk20a_readl(g, + pwr_pmu_falcon_ecc_corrected_err_count_r()); + uncorrected_cnt = gk20a_readl(g, + pwr_pmu_falcon_ecc_uncorrected_err_count_r()); + + corrected_delta = + pwr_pmu_falcon_ecc_corrected_err_count_total_v(corrected_cnt); + uncorrected_delta = + pwr_pmu_falcon_ecc_uncorrected_err_count_total_v(uncorrected_cnt); + corrected_overflow = ecc_status & + pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m(); + + uncorrected_overflow = ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_total_counter_overflow_m(); + corrected_overflow = ecc_status & + pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m(); + + /* clear the interrupt */ + if (((intr1 & pwr_pmu_ecc_intr_status_corrected_m()) != 0U) || + (corrected_overflow != 0U)) { + gk20a_writel(g, pwr_pmu_falcon_ecc_corrected_err_count_r(), 0); + } + if (((intr1 & pwr_pmu_ecc_intr_status_uncorrected_m()) != 0U) || + (uncorrected_overflow != 0U)) { + gk20a_writel(g, + pwr_pmu_falcon_ecc_uncorrected_err_count_r(), 0); + } + + gk20a_writel(g, pwr_pmu_falcon_ecc_status_r(), + pwr_pmu_falcon_ecc_status_reset_task_f()); + + /* update counters per slice */ + if (corrected_overflow != 0U) { + corrected_delta += + BIT32(pwr_pmu_falcon_ecc_corrected_err_count_total_s()); + } + if (uncorrected_overflow != 0U) { + uncorrected_delta += + BIT32(pwr_pmu_falcon_ecc_uncorrected_err_count_total_s()); + } + + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter += corrected_delta; + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter += + uncorrected_delta; + + nvgpu_log(g, gpu_dbg_intr, + "pmu ecc interrupt intr1: 0x%x", intr1); + + gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr); + + if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { + nvgpu_info(g, "ecc counter overflow!"); + } + + nvgpu_log(g, gpu_dbg_intr, + "ecc error row address: 0x%x", + pwr_pmu_falcon_ecc_address_row_address_v(ecc_addr)); + + nvgpu_log(g, gpu_dbg_intr, + "ecc error count corrected: %d, uncorrected %d", + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter, + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); +} + +void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) +{ /* * handle the ECC interrupt */ if ((intr0 & pwr_falcon_irqstat_ext_ecc_parity_true_f()) != 0U) { - intr1 = gk20a_readl(g, pwr_pmu_ecc_intr_status_r()); - if ((intr1 & - (pwr_pmu_ecc_intr_status_corrected_m() | - pwr_pmu_ecc_intr_status_uncorrected_m())) != 0U) { - - ecc_status = gk20a_readl(g, - pwr_pmu_falcon_ecc_status_r()); - ecc_addr = gk20a_readl(g, - pwr_pmu_falcon_ecc_address_r()); - corrected_cnt = gk20a_readl(g, - pwr_pmu_falcon_ecc_corrected_err_count_r()); - uncorrected_cnt = gk20a_readl(g, - pwr_pmu_falcon_ecc_uncorrected_err_count_r()); - - corrected_delta = - pwr_pmu_falcon_ecc_corrected_err_count_total_v(corrected_cnt); - uncorrected_delta = - pwr_pmu_falcon_ecc_uncorrected_err_count_total_v(uncorrected_cnt); - corrected_overflow = ecc_status & - pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m(); - - uncorrected_overflow = ecc_status & - pwr_pmu_falcon_ecc_status_uncorrected_err_total_counter_overflow_m(); - corrected_overflow = ecc_status & - pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m(); - - /* clear the interrupt */ - if (((intr1 & pwr_pmu_ecc_intr_status_corrected_m()) != 0U) || - (corrected_overflow != 0U)) { - gk20a_writel(g, pwr_pmu_falcon_ecc_corrected_err_count_r(), 0); - } - if (((intr1 & pwr_pmu_ecc_intr_status_uncorrected_m()) != 0U) || - (uncorrected_overflow != 0U)) { - gk20a_writel(g, - pwr_pmu_falcon_ecc_uncorrected_err_count_r(), 0); - } - - gk20a_writel(g, pwr_pmu_falcon_ecc_status_r(), - pwr_pmu_falcon_ecc_status_reset_task_f()); - - /* update counters per slice */ - if (corrected_overflow != 0U) { - corrected_delta += BIT32(pwr_pmu_falcon_ecc_corrected_err_count_total_s()); - } - if (uncorrected_overflow != 0U) { - uncorrected_delta += BIT32(pwr_pmu_falcon_ecc_uncorrected_err_count_total_s()); - } - - g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter += corrected_delta; - g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter += uncorrected_delta; - - nvgpu_log(g, gpu_dbg_intr, - "pmu ecc interrupt intr1: 0x%x", intr1); - - if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - nvgpu_pmu_report_ecc_error(g, 0, - GPU_PMU_FALCON_IMEM_ECC_CORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, - "imem ecc error corrected"); - } - if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - nvgpu_pmu_report_ecc_error(g, 0, - GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, - "imem ecc error uncorrected"); - } - if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - nvgpu_pmu_report_ecc_error(g, 0, - GPU_PMU_FALCON_DMEM_ECC_CORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, - "dmem ecc error corrected"); - } - if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - nvgpu_pmu_report_ecc_error(g, 0, - GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, - "dmem ecc error uncorrected"); - } - - if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { - nvgpu_info(g, "ecc counter overflow!"); - } - - nvgpu_log(g, gpu_dbg_intr, - "ecc error row address: 0x%x", - pwr_pmu_falcon_ecc_address_row_address_v(ecc_addr)); - - nvgpu_log(g, gpu_dbg_intr, - "ecc error count corrected: %d, uncorrected %d", - g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter, - g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); - } + gv11b_pmu_handle_ecc_irq(g); } }