diff --git a/drivers/gpu/nvgpu/common/ltc/ltc.c b/drivers/gpu/nvgpu/common/ltc/ltc.c index 6f2917d2d..52aa8c167 100644 --- a/drivers/gpu/nvgpu/common/ltc/ltc.c +++ b/drivers/gpu/nvgpu/common/ltc/ltc.c @@ -227,6 +227,10 @@ void nvgpu_ltc_ecc_free(struct gk20a *g) ecc->ltc.ecc_ded_count[ltc] = NULL; } + if (ecc->ltc.rstg_ecc_parity_count != NULL) { + nvgpu_kfree(g, ecc->ltc.rstg_ecc_parity_count[ltc]); + } + if (ecc->ltc.tstg_ecc_parity_count != NULL) { nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count[ltc]); } @@ -242,6 +246,9 @@ void nvgpu_ltc_ecc_free(struct gk20a *g) nvgpu_kfree(g, ecc->ltc.ecc_ded_count); ecc->ltc.ecc_ded_count = NULL; + nvgpu_kfree(g, ecc->ltc.rstg_ecc_parity_count); + ecc->ltc.rstg_ecc_parity_count = NULL; + nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count); ecc->ltc.tstg_ecc_parity_count = NULL; diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga100.c b/drivers/gpu/nvgpu/hal/init/hal_ga100.c index 7f4c7b7e9..5e4163121 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_ga100.c +++ b/drivers/gpu/nvgpu/hal/init/hal_ga100.c @@ -374,7 +374,7 @@ static const struct gops_ltc_intr ga100_ops_ltc_intr = { }; static const struct gops_ltc ga100_ops_ltc = { - .ecc_init = gv11b_lts_ecc_init, + .ecc_init = ga10b_lts_ecc_init, .init_ltc_support = nvgpu_init_ltc_support, .ltc_remove_support = nvgpu_ltc_remove_support, .determine_L2_size_bytes = gp10b_determine_L2_size_bytes, diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c index d149d3d8a..905a6d5b7 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c @@ -343,7 +343,7 @@ static const struct gops_ltc_intr ga10b_ops_ltc_intr = { }; static const struct gops_ltc ga10b_ops_ltc = { - .ecc_init = gv11b_lts_ecc_init, + .ecc_init = ga10b_lts_ecc_init, .init_ltc_support = nvgpu_init_ltc_support, .ltc_remove_support = nvgpu_ltc_remove_support, .determine_L2_size_bytes = ga10b_determine_L2_size_bytes, diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c index c00c10d5d..4d03aed67 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c @@ -28,7 +28,6 @@ #include #include -#include "hal/ltc/intr/ltc_intr_gv11b.h" #include "ltc_intr_ga10b.h" #include @@ -388,6 +387,210 @@ void ga10b_ltc_intr_configure(struct gk20a *g) ga10b_ltc_intr3_configure(g); } +static void ga10b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g, + u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr, + u32 uncorrected_delta) +{ + bool is_rstg_ecc_addr = (ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) == + ltc_ltc0_lts0_l2_cache_ecc_address_subunit_rstg_v()); + + if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) { + nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected"); + + if (!is_rstg_ecc_addr) { + nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to RSTG"); + return; + } + + g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter = + nvgpu_wrapping_add_u32( + g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter, + uncorrected_delta); + nvgpu_report_ecc_err(g, + NVGPU_ERR_MODULE_LTC, + (ltc << 8U) | slice, + GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter); + } + + if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { + nvgpu_err(g, "rstg ecc error corrected"); + /* This error is not expected to occur in ga10x and hence, + * this scenario is considered as a fatal error. + */ + BUG(); + } +} + +static void ga10b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g, + u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr, + u32 uncorrected_delta) +{ + bool is_tstg_ecc_addr = (ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) == + ltc_ltc0_lts0_l2_cache_ecc_address_subunit_tstg_v()); + + if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) { + nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); + + if (!is_tstg_ecc_addr) { + nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to TSTG"); + return; + } + + g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter = + nvgpu_wrapping_add_u32( + g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, + uncorrected_delta); + nvgpu_report_ecc_err(g, + NVGPU_ERR_MODULE_LTC, + (ltc << 8U) | slice, + GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter); + } + + if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { + nvgpu_err(g, "tstg ecc error corrected"); + /* This error is not expected to occur in ga10b and hence, + * this scenario is considered as a fatal error. + */ + BUG(); + } +} + +static bool ga10b_ltc_intr_is_dstg_data_bank(u32 ecc_addr) +{ + u32 ecc_ram = ltc_ltc0_lts0_l2_cache_ecc_address_ram_v(ecc_addr); + bool is_dstg_data_bank = false; + + if ((ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank0_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank1_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank2_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank3_v())) { + is_dstg_data_bank = true; + } + + return is_dstg_data_bank; +} + +static bool ga10b_ltc_intr_is_dstg_be_ram(u32 ecc_addr) +{ + u32 ecc_ram = ltc_ltc0_lts0_l2_cache_ecc_address_ram_v(ecc_addr); + bool is_dstg_be_ram = false; + + if ((ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram0_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram1_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram2_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram3_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram4_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram5_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram6_v()) || + (ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram7_v())) { + is_dstg_be_ram = true; + } + + return is_dstg_be_ram; +} + +static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g, + u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr, + u32 corrected_delta, u32 uncorrected_delta) +{ + bool is_dstg_ecc_addr = (ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) == + ltc_ltc0_lts0_l2_cache_ecc_address_subunit_dstg_v()); + + if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) { + nvgpu_log(g, gpu_dbg_intr, "dstg ecc error (SEC) corrected"); + + if (!is_dstg_ecc_addr) { + nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to DSTG"); + return; + } + + g->ecc.ltc.ecc_sec_count[ltc][slice].counter = + nvgpu_wrapping_add_u32( + g->ecc.ltc.ecc_sec_count[ltc][slice].counter, + corrected_delta); + + nvgpu_report_ecc_err(g, + NVGPU_ERR_MODULE_LTC, + (ltc << 8U) | slice, + GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr, + g->ecc.ltc.ecc_sec_count[ltc][slice].counter); + + /* + * Using a SEC code will allow correction of an SBE (Single Bit + * Error). But the current HW doesn't have the ability to clear + * out the SBE from the RAMs for a read access. So before the + * SBE turns into a DBE (Double Bit Error), a SW flush is + * preferred. + */ + if (g->ops.mm.cache.l2_flush(g, true) != 0) { + nvgpu_err(g, "l2_flush failed"); + BUG(); + } + } + + if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) { + nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected"); + + if (!is_dstg_ecc_addr) { + nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to DSTG"); + return; + } + + if (ga10b_ltc_intr_is_dstg_data_bank(ecc_addr)) { + nvgpu_err(g, "Double bit error detected in GPU L2!"); + + g->ecc.ltc.ecc_ded_count[ltc][slice].counter = + nvgpu_wrapping_add_u32( + g->ecc.ltc.ecc_ded_count[ltc][slice].counter, + uncorrected_delta); + + nvgpu_report_ecc_err(g, + NVGPU_ERR_MODULE_LTC, + (ltc << 8U) | slice, + GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.ecc_ded_count[ltc][slice].counter); + } else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) { + nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected"); + + g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter = + nvgpu_wrapping_add_u32( + g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter, + uncorrected_delta); + + nvgpu_report_ecc_err(g, + NVGPU_ERR_MODULE_LTC, + (ltc << 8U) | slice, + GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr, + g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter); + } else { + nvgpu_err(g, "unsupported uncorrected dstg ecc error"); + BUG(); + } + } +} + +static void ga10b_ltc_intr_init_counters(struct gk20a *g, + u32 uncorrected_delta, u32 uncorrected_overflow, + u32 corrected_delta, u32 corrected_overflow, + u32 offset) +{ + if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) { + nvgpu_writel(g, + nvgpu_safe_add_u32( + ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(), + offset), 0); + } + + if ((corrected_delta > 0U) || (corrected_overflow != 0U)) { + nvgpu_writel(g, + nvgpu_safe_add_u32( + ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(), + offset), 0); + } +} + static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice, u32 offset, u32 ltc_intr3) { @@ -395,7 +598,18 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice, u32 corrected_delta, uncorrected_delta; u32 corrected_overflow, uncorrected_overflow; - /* Detect and handle ECC PARITY errors */ + /* + * Detect and handle ECC PARITY errors and SEC-DED errors. + * SEC errors are reported as DSTG corrected errors and + * DED errors are reported as DSTG uncorrected errors. + * Below are the supported errors: + * + * 1. UNCORRECTED_ERR_RSTG - signals a parity error in RSTG RAMS, for now only CBC RAMS + * 2. UNCORRECTED_ERR_TSTG - signals a parity error in TSTG RAMS + * 3. UNCORRECTED_ERR_DSTG - signals a parity error in DSTG RAMS, non-data RAMS + * and DED in data RAMS. + * 4. CORRECTED_ERR_DSTG - signals an ecc corrected error in DSTG data RAMS (SEC) + */ if ((ltc_intr3 & (ltc_ltcs_ltss_intr3_ecc_uncorrected_m() | ltc_ltcs_ltss_intr3_ecc_corrected_m())) != 0U) { @@ -404,81 +618,65 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice, ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset)); ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32( ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset)); - corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32( - ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(), - offset)); + uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32( ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(), offset)); - corrected_delta = - ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v( - corrected_cnt); uncorrected_delta = ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt); - corrected_overflow = ecc_status & - ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m(); uncorrected_overflow = ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m(); - gv11b_ltc_intr_init_counters(g, - uncorrected_delta, uncorrected_overflow, offset); + corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32( + ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(), + offset)); + + corrected_delta = + ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(corrected_cnt); + + corrected_overflow = ecc_status & + ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m(); + + ga10b_ltc_intr_init_counters(g, + uncorrected_delta, uncorrected_overflow, + corrected_delta, corrected_overflow, offset); nvgpu_writel(g, nvgpu_safe_add_u32( ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset), ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f()); /* update counters per slice */ - if (corrected_overflow != 0U) { - corrected_delta += BIT32( - ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s()); - } if (uncorrected_overflow != 0U) { + nvgpu_info(g, "uncorrected ecc counter overflow!"); uncorrected_delta += BIT32( ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s()); } - g->ecc.ltc.ecc_sec_count[ltc][slice].counter = - nvgpu_safe_add_u32( - g->ecc.ltc.ecc_sec_count[ltc][slice].counter, - corrected_delta); - g->ecc.ltc.ecc_ded_count[ltc][slice].counter = - nvgpu_safe_add_u32( - g->ecc.ltc.ecc_ded_count[ltc][slice].counter, - uncorrected_delta); + if (corrected_overflow != 0U) { + nvgpu_info(g, "corrected ecc counter overflow!"); + corrected_delta += BIT32( + ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s()); + } + nvgpu_log(g, gpu_dbg_intr, - "ltc:%d lts: %d cache ecc interrupt intr: 0x%x", - ltc, slice, ltc_intr3); + "ecc status 0x%08x error address: 0x%08x subunit: %u corrected_delta: 0x%08x uncorrected_delta: 0x%08x", + ecc_status, ecc_addr, + ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr), + corrected_delta, uncorrected_delta); - /* This check has been added to ensure that the slice id is less - * than 8-bits and hence, it can be packed as part of LSB 8-bits - * along with the LTC id while reporting LTC related ECC errors. - */ - if (slice > U8_MAX) { - nvgpu_log(g, gpu_dbg_intr, "Invalid slice id=%d", - slice); - slice = slice & 0xFFU; - } - - gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice, + ga10b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice, ecc_status, ecc_addr, uncorrected_delta); - gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice, + ga10b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice, ecc_status, ecc_addr, uncorrected_delta); - gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice, + ga10b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice, ecc_status, ecc_addr, - uncorrected_delta); - - if ((corrected_overflow != 0U) || - (uncorrected_overflow != 0U)) { - nvgpu_info(g, "ecc counter overflow!"); - } - - nvgpu_log(g, gpu_dbg_intr, "ecc error address: 0x%x", ecc_addr); + corrected_delta, uncorrected_delta); } } diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b.h b/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b.h index e73206824..b176667cf 100644 --- a/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b.h +++ b/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b.h @@ -40,6 +40,7 @@ void ga10b_ltc_set_zbc_depth_entry(struct gk20a *g, u32 depth_val, u32 index); void ga10b_ltc_init_fs_state(struct gk20a *g); void ga10b_ltc_lts_set_mgmt_setup(struct gk20a *g); u64 ga10b_determine_L2_size_bytes(struct gk20a *g); +int ga10b_lts_ecc_init(struct gk20a *g); #ifdef CONFIG_NVGPU_DEBUGGER u32 ga10b_ltc_pri_shared_addr(struct gk20a *g, u32 addr); diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b_fusa.c index fac3bd598..3ff4fbfb5 100644 --- a/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ltc/ltc_ga10b_fusa.c @@ -29,6 +29,7 @@ #include #include "hal/gr/gr/gr_gk20a.h" +#include "ltc_gv11b.h" #include "ltc_ga10b.h" #include @@ -234,3 +235,25 @@ u64 ga10b_determine_L2_size_bytes(struct gk20a *g) return size; } + +int ga10b_lts_ecc_init(struct gk20a *g) +{ + int err = 0; + + err = gv11b_lts_ecc_init(g); + if (err != 0) { + goto done; + } + + err = NVGPU_ECC_COUNTER_INIT_PER_LTS(rstg_ecc_parity_count); + if (err != 0) { + goto done; + } + +done: + if (err != 0) { + nvgpu_err(g, "ecc counter allocate failed, err=%d", err); + } + + return err; +} diff --git a/drivers/gpu/nvgpu/include/nvgpu/ecc.h b/drivers/gpu/nvgpu/include/nvgpu/ecc.h index cce0ec39f..8068318db 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/ecc.h +++ b/drivers/gpu/nvgpu/include/nvgpu/ecc.h @@ -204,6 +204,8 @@ struct nvgpu_ecc { * unit. */ struct { + /** L2 cache slice RSTG ECC PARITY error count. */ + struct nvgpu_ecc_stat **rstg_ecc_parity_count; /** L2 cache slice TSTG ECC PARITY error count. */ struct nvgpu_ecc_stat **tstg_ecc_parity_count; /** L2 cache slice DSTG BE ECC PARITY error count. */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 0ab22c64c..aba00e1cb 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -293,6 +293,7 @@ struct gr_exception_info { #define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U) #define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U) #define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U) +#define GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED (5U) #define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U) /** * @}