diff --git a/drivers/gpu/nvgpu/common/ltc/ltc.c b/drivers/gpu/nvgpu/common/ltc/ltc.c
index d2fd2783b..6f2917d2d 100644
--- a/drivers/gpu/nvgpu/common/ltc/ltc.c
+++ b/drivers/gpu/nvgpu/common/ltc/ltc.c
@@ -226,6 +226,14 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
 			nvgpu_kfree(g, ecc->ltc.ecc_ded_count[ltc]);
 			ecc->ltc.ecc_ded_count[ltc] = NULL;
 		}
+
+		if (ecc->ltc.tstg_ecc_parity_count != NULL) {
+			nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count[ltc]);
+		}
+
+		if (ecc->ltc.dstg_be_ecc_parity_count != NULL) {
+			nvgpu_kfree(g, ecc->ltc.dstg_be_ecc_parity_count[ltc]);
+		}
 	}
 
 	nvgpu_kfree(g, ecc->ltc.ecc_sec_count);
@@ -233,4 +241,10 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
 
 	nvgpu_kfree(g, ecc->ltc.ecc_ded_count);
 	ecc->ltc.ecc_ded_count = NULL;
+
+	nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count);
+	ecc->ltc.tstg_ecc_parity_count = NULL;
+
+	nvgpu_kfree(g, ecc->ltc.dstg_be_ecc_parity_count);
+	ecc->ltc.dstg_be_ecc_parity_count = NULL;
 }
diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c
index a6459e6a4..c00c10d5d 100644
--- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c
@@ -391,7 +391,7 @@ void ga10b_ltc_intr_configure(struct gk20a *g)
 static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
 		u32 offset, u32 ltc_intr3)
 {
-	u32 ecc_status, ecc_addr, dstg_ecc_addr, corrected_cnt, uncorrected_cnt;
+	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
 	u32 corrected_delta, uncorrected_delta;
 	u32 corrected_overflow, uncorrected_overflow;
 
@@ -404,8 +404,6 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
 		ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
 	ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
 		ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
-	dstg_ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
-		ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
 	corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
 		ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
 		offset));
@@ -425,7 +423,6 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
 
 	gv11b_ltc_intr_init_counters(g,
-		corrected_delta, corrected_overflow,
 		uncorrected_delta, uncorrected_overflow, offset);
 
 	nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -465,14 +462,16 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
 	}
 
 	gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
-		ecc_status, ecc_addr);
+		ecc_status, ecc_addr,
+		uncorrected_delta);
 
 	gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
-		ecc_status, ecc_addr);
+		ecc_status, ecc_addr,
+		uncorrected_delta);
 
 	gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
-		ecc_status, dstg_ecc_addr,
-		ecc_addr);
+		ecc_status, ecc_addr,
+		uncorrected_delta);
 
 	if ((corrected_overflow != 0U) ||
 		(uncorrected_overflow != 0U)) {
diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.h b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.h
index 577fb9b7d..b467a7b4d 100644
--- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.h
@@ -36,15 +36,16 @@ void gv11b_ltc_intr_en_illegal_compstat(struct gk20a *g, bool enable);
 #endif
 
 void gv11b_ltc_intr_init_counters(struct gk20a *g,
-		u32 corrected_delta, u32 corrected_overflow,
 		u32 uncorrected_delta, u32 uncorrected_overflow,
 		u32 offset);
 void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
-		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr);
+		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
+		u32 uncorrected_delta);
 void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
-		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr);
+		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
+		u32 uncorrected_delta);
 void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
-		u32 ltc, u32 slice, u32 ecc_status, u32 dstg_ecc_addr,
-		u32 ecc_addr);
+		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
+		u32 uncorrected_delta);
 
 #endif
diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c
index df06194ae..bd3e0dac6 100644
--- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c
@@ -83,16 +83,9 @@ void gv11b_ltc_intr_en_illegal_compstat(struct gk20a *g, bool enable)
 #endif
 
 void gv11b_ltc_intr_init_counters(struct gk20a *g,
-		u32 corrected_delta, u32 corrected_overflow,
 		u32 uncorrected_delta, u32 uncorrected_overflow, u32 offset)
 {
-	if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
-		nvgpu_writel(g,
-			nvgpu_safe_add_u32(
-			ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
-			offset), 0);
-	}
 	if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
 		nvgpu_writel(g,
 			nvgpu_safe_add_u32(
@@ -102,17 +95,9 @@ void gv11b_ltc_intr_init_counters(struct gk20a *g,
 }
 
 void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
-		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr)
+		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
+		u32 uncorrected_delta)
 {
-	if ((ecc_status &
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m())
-		!= 0U) {
-		nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected");
-		/* This error is not expected to occur in gv11b and hence,
-		 * this scenario is considered as a fatal error.
-		 */
-		BUG();
-	}
 	if ((ecc_status &
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m())
 		!= 0U) {
@@ -122,86 +107,59 @@ void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
 		 */
 		BUG();
 	}
-
 }
 
 void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
-		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr)
+		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
+		u32 uncorrected_delta)
 {
-	if ((ecc_status &
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m())
-		!= 0U) {
-		nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected");
-		/* This error is not expected to occur in gv11b and hence,
-		 * this scenario is considered as a fatal error.
-		 */
-		BUG();
-	}
 	if ((ecc_status &
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m())
 		!= 0U) {
+		g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter =
+			nvgpu_wrapping_add_u32(
+				g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
+				uncorrected_delta);
+
 		nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC,
 			(ltc << 8U) | slice,
 			GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
-			g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
+			g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
 		nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
 	}
 }
 
 void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
-		u32 ltc, u32 slice, u32 ecc_status, u32 dstg_ecc_addr,
-		u32 ecc_addr)
+		u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
+		u32 uncorrected_delta)
 {
-	if ((ecc_status &
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m())
-		!= 0U) {
-		if ((dstg_ecc_addr &
-			ltc_ltc0_lts0_dstg_ecc_address_info_ram_m())
-			== 0U) {
-			nvgpu_report_ecc_err(g,
-				NVGPU_ERR_MODULE_LTC,
-				(ltc << 8U) | slice,
-				GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
-				g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
-		} else {
-			/* This error is not expected to occur in gv11b and
-			 * hence, this scenario is considered as a fatal error.
-			 */
-			BUG();
-		}
-		nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected");
-	}
 	if ((ecc_status &
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m())
 		!= 0U) {
-		if ((dstg_ecc_addr &
-			ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) {
-			nvgpu_report_ecc_err(g,
-				NVGPU_ERR_MODULE_LTC,
-				(ltc << 8U) | slice,
-				GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
-				g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
-		} else {
-			nvgpu_report_ecc_err(g,
-				NVGPU_ERR_MODULE_LTC,
-				(ltc << 8U) | slice,
-				GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
-				g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
-		}
-		nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected");
+		g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter =
+			nvgpu_wrapping_add_u32(
+				g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
+				uncorrected_delta);
+
+		nvgpu_report_ecc_err(g,
+			NVGPU_ERR_MODULE_LTC,
+			(ltc << 8U) | slice,
+			GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
+			g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
+		nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");
 	}
 }
 
-static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
+static void gv11b_ltc_intr_handle_ecc_parity_interrupts(struct gk20a *g,
 	u32 ltc, u32 slice)
 {
 	u32 offset;
 	u32 ltc_intr3;
-	u32 ecc_status, ecc_addr, dstg_ecc_addr, corrected_cnt, uncorrected_cnt;
-	u32 corrected_delta, uncorrected_delta;
-	u32 corrected_overflow, uncorrected_overflow;
+	u32 ecc_status, ecc_addr, uncorrected_cnt;
+	u32 uncorrected_delta;
+	u32 uncorrected_overflow;
 	u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
 	u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE);
 
@@ -210,39 +168,42 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
 	ltc_intr3 = nvgpu_readl(g, nvgpu_safe_add_u32(
 		ltc_ltc0_lts0_intr3_r(), offset));
 
-	/* Detect and handle ECC PARITY errors */
-	if ((ltc_intr3 &
-		(ltc_ltcs_ltss_intr3_ecc_uncorrected_m() |
-		 ltc_ltcs_ltss_intr3_ecc_corrected_m())) != 0U) {
+	nvgpu_log(g, gpu_dbg_intr,
+		"ltc:%u lts: %u cache ecc interrupt intr3: 0x%08x",
+		ltc, slice, ltc_intr3);
+	/* Corrected ECC parity errors not expected */
+	if ((ltc_intr3 & ltc_ltcs_ltss_intr3_ecc_corrected_m()) != 0U) {
+		nvgpu_err(g, "corrected parity error not expected");
+		/* This error is not expected to occur in gv11b and hence,
+		 * this scenario is considered as a fatal error.
+		 */
+		BUG();
+	}
+
+	/* Detect and handle uncorrected ECC PARITY errors */
+	if ((ltc_intr3 & ltc_ltcs_ltss_intr3_ecc_uncorrected_m()) != 0U) {
 		ecc_status = nvgpu_readl(g, nvgpu_safe_add_u32(
 			ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
 		ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
 			ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
-		dstg_ecc_addr = nvgpu_readl(g,
-			nvgpu_safe_add_u32(
-			ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
-		corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
-			ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
-			offset));
+
+		nvgpu_log(g, gpu_dbg_intr,
+			"ecc status 0x%08x error address: 0x%08x",
+			ecc_status, ecc_addr);
+
 		uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
 			ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
 			offset));
 
-		corrected_delta =
-			ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(
-				corrected_cnt);
 		uncorrected_delta =
 			ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
-		corrected_overflow = ecc_status &
-			ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m();
 		uncorrected_overflow = ecc_status &
 			ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
 
 		gv11b_ltc_intr_init_counters(g,
-			corrected_delta, corrected_overflow,
 			uncorrected_delta, uncorrected_overflow, offset);
 
 		nvgpu_writel(g,
@@ -251,60 +212,142 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
 			ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f());
 
 		/* update counters per slice */
-		if (corrected_overflow != 0U) {
-			corrected_delta += BIT32(
-				ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s());
-		}
 		if (uncorrected_overflow != 0U) {
-			uncorrected_delta += BIT32(
-				ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s());
-		}
-
-		g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
-			nvgpu_safe_add_u32(
-				g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
-				corrected_delta);
-		g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
-			nvgpu_safe_add_u32(
-				g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
-				uncorrected_delta);
-		nvgpu_log(g, gpu_dbg_intr,
-			"ltc:%d lts: %d cache ecc interrupt intr: 0x%x",
-			ltc, slice, ltc_intr3);
-
-		/* This check has been added to ensure that the slice id is less
-		 * than 8-bits and hence, it can be packed as part of LSB 8-bits
-		 * along with the LTC id while reporting LTC related ECC errors.
-		 */
-		if (slice > U8_MAX) {
-			nvgpu_log(g, gpu_dbg_intr, "Invalid slice id=%d",
-				slice);
-			slice = slice & 0xFFU;
+			nvgpu_info(g, "ecc counter overflow!");
+			uncorrected_delta =
+				nvgpu_wrapping_add_u32(uncorrected_delta,
					BIT32(ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s()));
 		}
 
 		gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
-			ecc_status, ecc_addr);
+			ecc_status, ecc_addr,
+			uncorrected_delta);
 
 		gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
-			ecc_status, ecc_addr);
+			ecc_status, ecc_addr,
+			uncorrected_delta);
 
 		gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
-			ecc_status, dstg_ecc_addr,
-			ecc_addr);
-
-		if ((corrected_overflow != 0U) ||
-			(uncorrected_overflow != 0U)) {
-			nvgpu_info(g, "ecc counter overflow!");
-		}
-
-		nvgpu_log(g, gpu_dbg_intr, "ecc error address: 0x%x", ecc_addr);
+			ecc_status, ecc_addr,
+			uncorrected_delta);
 
 		nvgpu_writel(g,
 			nvgpu_safe_add_u32(ltc_ltc0_lts0_intr3_r(), offset),
 			ltc_intr3);
 	}
+}
 
-	gp10b_ltc_intr_handle_lts_interrupts(g, ltc, slice);
+static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 ltc, u32 slice)
+{
+	u32 offset;
+	u32 ltc_intr;
+	u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
+	u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE);
+
+	offset = nvgpu_safe_add_u32(nvgpu_safe_mult_u32(ltc_stride, ltc),
+				nvgpu_safe_mult_u32(lts_stride, slice));
+	ltc_intr = nvgpu_readl(g, nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_intr_r(), offset));
+
+	nvgpu_log(g, gpu_dbg_intr,
+		"ltc:%u lts: %u cache ecc interrupt intr: 0x%08x",
+		ltc, slice, ltc_intr);
+
+	/* Detect and handle SEC ECC errors */
+	if ((ltc_intr &
+		ltc_ltcs_ltss_intr_ecc_sec_error_pending_f()) != 0U) {
+		u32 ecc_stats_reg_val;
+		u32 dstg_ecc_addr;
+
+		ecc_stats_reg_val =
+			nvgpu_readl(g, nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_dstg_ecc_report_r(), offset));
+		dstg_ecc_addr = nvgpu_readl(g,
+			nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
+
+		nvgpu_err(g, "Single bit error detected in GPU L2!");
+		nvgpu_err(g, "ecc_report_r: %08x dstg_ecc_addr: %08x",
+			ecc_stats_reg_val, dstg_ecc_addr);
+
+		g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
+			nvgpu_wrapping_add_u32(
+				g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
+				ltc_ltc0_lts0_dstg_ecc_report_sec_count_v(
+					ecc_stats_reg_val));
+		ecc_stats_reg_val &=
+			~(ltc_ltc0_lts0_dstg_ecc_report_sec_count_m());
+		nvgpu_writel(g,
+			nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
+			ecc_stats_reg_val);
+
+		nvgpu_report_ecc_err(g,
+			NVGPU_ERR_MODULE_LTC,
+			(ltc << 8U) | slice,
+			GPU_LTC_CACHE_DSTG_ECC_CORRECTED, dstg_ecc_addr,
+			g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
+
+		/*
+		 * Using a SEC code will allow correction of an SBE (Single Bit
+		 * Error). But the current HW doesn't have the ability to clear
+		 * out the SBE from the RAMs for a read access. So before the
+		 * SBE turns into a DBE (Double Bit Error), a SW flush is
+		 * preferred.
+		 */
+		if (g->ops.mm.cache.l2_flush(g, true) != 0) {
+			nvgpu_err(g, "l2_flush failed");
+			BUG();
+		}
+	}
+
+	/* Detect and handle DED ECC errors */
+	if ((ltc_intr &
+		ltc_ltcs_ltss_intr_ecc_ded_error_pending_f()) != 0U) {
+		u32 ecc_stats_reg_val;
+		u32 dstg_ecc_addr;
+
+		ecc_stats_reg_val =
+			nvgpu_readl(g, nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_dstg_ecc_report_r(), offset));
+		dstg_ecc_addr = nvgpu_readl(g,
+			nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
+
+		nvgpu_err(g, "Double bit error detected in GPU L2!");
+		nvgpu_err(g, "ecc_report_r: %08x dstg_ecc_addr: %08x",
+			ecc_stats_reg_val, dstg_ecc_addr);
+
+		g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
+			nvgpu_wrapping_add_u32(
+				g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
+				ltc_ltc0_lts0_dstg_ecc_report_ded_count_v(
+					ecc_stats_reg_val));
+		ecc_stats_reg_val &=
+			~(ltc_ltc0_lts0_dstg_ecc_report_ded_count_m());
+		nvgpu_writel(g,
+			nvgpu_safe_add_u32(
+				ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
+			ecc_stats_reg_val);
+
+		nvgpu_report_ecc_err(g,
+			NVGPU_ERR_MODULE_LTC,
+			(ltc << 8U) | slice,
+			GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, dstg_ecc_addr,
+			g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
+	}
+
+	nvgpu_writel(g, nvgpu_safe_add_u32(ltc_ltc0_lts0_intr_r(), offset),
+		ltc_intr);
+}
+
+
+static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
+	u32 ltc, u32 slice)
+{
+	gv11b_ltc_intr_handle_ecc_parity_interrupts(g, ltc, slice);
+
+	gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(g, ltc, slice);
 }
 
 void gv11b_ltc_intr_isr(struct gk20a *g, u32 ltc)
diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b_fusa.c
index d7933963c..8b3412274 100644
--- a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b_fusa.c
@@ -70,6 +70,16 @@ int gv11b_lts_ecc_init(struct gk20a *g)
 		goto done;
 	}
 
+	err = NVGPU_ECC_COUNTER_INIT_PER_LTS(tstg_ecc_parity_count);
+	if (err != 0) {
+		goto done;
+	}
+
+	err = NVGPU_ECC_COUNTER_INIT_PER_LTS(dstg_be_ecc_parity_count);
+	if (err != 0) {
+		goto done;
+	}
+
 done:
 	if (err != 0) {
 		nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/ecc.h b/drivers/gpu/nvgpu/include/nvgpu/ecc.h
index a531c5032..cce0ec39f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/ecc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/ecc.h
@@ -204,9 +204,13 @@ struct nvgpu_ecc {
 	 * unit.
 	 */
 	struct {
-		/** ltc-lts sec count. */
+		/** L2 cache slice TSTG ECC PARITY error count. */
+		struct nvgpu_ecc_stat **tstg_ecc_parity_count;
+		/** L2 cache slice DSTG BE ECC PARITY error count. */
+		struct nvgpu_ecc_stat **dstg_be_ecc_parity_count;
+		/** L2 cache slice SEC error count. */
 		struct nvgpu_ecc_stat **ecc_sec_count;
-		/** ltc-lts ded count. */
+		/** L2 cache slice DED error count. */
 		struct nvgpu_ecc_stat **ecc_ded_count;
 	} ltc;
 
diff --git a/userspace/units/ltc/nvgpu-ltc.c b/userspace/units/ltc/nvgpu-ltc.c
index 5cf5c87cd..687b3a3a6 100644
--- a/userspace/units/ltc/nvgpu-ltc.c
+++ b/userspace/units/ltc/nvgpu-ltc.c
@@ -284,12 +284,38 @@ static void nvgpu_init_gr_manager(struct gk20a *g)
 	gr_syspipe->num_gpc = 1;
 }
 
+static int ltc_ecc_init_fault_check(struct unit_module *m, struct gk20a *g,
+		unsigned int number)
+{
+	struct nvgpu_posix_fault_inj *kmem_fi =
+		nvgpu_kmem_get_fault_injection();
+	int err;
+
+	/* Re-Init dependent ECC unit */
+	err = nvgpu_ecc_init_support(g);
+	if (err != 0) {
+		unit_err(m, "ecc init failed\n");
+		return err;
+	}
+
+	nvgpu_posix_enable_fault_injection(kmem_fi, true, number);
+	err = g->ops.ltc.ecc_init(g);
+	if (err == 0) {
+		unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
 {
 	int ret = UNIT_SUCCESS;
 	int err;
 	struct nvgpu_ecc_stat **save_sec_ptr = g->ecc.ltc.ecc_sec_count;
 	struct nvgpu_ecc_stat **save_ded_ptr = g->ecc.ltc.ecc_ded_count;
+	struct nvgpu_ecc_stat **save_tstg_ecc_ptr = g->ecc.ltc.tstg_ecc_parity_count;
+	struct nvgpu_ecc_stat **save_dstg_ecc_ptr = g->ecc.ltc.dstg_be_ecc_parity_count;
 	struct nvgpu_posix_fault_inj *kmem_fi =
 		nvgpu_kmem_get_fault_injection();
 
@@ -312,14 +338,15 @@ int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
 
 	g->ecc.ltc.ecc_sec_count = NULL;
 	g->ecc.ltc.ecc_ded_count = NULL;
+	g->ecc.ltc.tstg_ecc_parity_count = NULL;
+	g->ecc.ltc.dstg_be_ecc_parity_count = NULL;
 
 	/*
-	 * Call with failure on first kzalloc
+	 * Call with failure on first kzalloc for sec_ecc_count
 	 */
-	nvgpu_posix_enable_fault_injection(kmem_fi, true, 0);
-	err = g->ops.ltc.ecc_init(g);
-	if (err == 0) {
-		unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
+	err = ltc_ecc_init_fault_check(m, g, 0);
+	if (err) {
+		unit_err(m, "sec_ecc_count alloc fault check failed\n");
 		ret = UNIT_FAIL;
 		goto done;
 	}
@@ -328,28 +355,42 @@ int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
 	 * Call with failure on third kzalloc for the 2nd array dimension and to
 	 * validate unrolling.
 	 */
-	nvgpu_posix_enable_fault_injection(kmem_fi, true, 2);
-	err = g->ops.ltc.ecc_init(g);
-	if (err == 0) {
-		unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
+	err = ltc_ecc_init_fault_check(m, g, 2);
+	if (err) {
+		unit_err(m, "sec_ecc_count alloc for LTC 1 fault check failed\n");
 		ret = UNIT_FAIL;
 		goto done;
 	}
 
-	/* Re-Init dependent ECC unit */
-	err = nvgpu_ecc_init_support(g);
-	if (err != 0) {
-		unit_return_fail(m, "ecc init failed\n");
+	/*
+	 * Call with failure on 4th kzalloc for ded_ecc_count and get more
+	 * branch/line coverage.
+	 */
+	err = ltc_ecc_init_fault_check(m, g, 4);
+	if (err) {
+		unit_err(m, "ded_ecc_count alloc fault check failed\n");
+		ret = UNIT_FAIL;
+		goto done;
 	}
 
 	/*
-	 * Call with failure on 4th kzalloc for second stat and get more
+	 * Call with failure on 8th kzalloc for tstg_ecc_parity_count and get more
 	 * branch/line coverage.
 	 */
-	nvgpu_posix_enable_fault_injection(kmem_fi, true, 4);
-	err = g->ops.ltc.ecc_init(g);
-	if (err == 0) {
-		unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
+	err = ltc_ecc_init_fault_check(m, g, 8);
+	if (err) {
+		unit_err(m, "tstg_ecc_parity_count alloc fault check failed\n");
+		ret = UNIT_FAIL;
+		goto done;
+	}
+
+	/*
+	 * Call with failure on 11th kzalloc for dstg_be_ecc_parity_count and get more
+	 * branch/line coverage.
+	 */
+	err = ltc_ecc_init_fault_check(m, g, 11);
+	if (err) {
+		unit_err(m, "dstg_be_ecc_parity_count alloc fault check failed\n");
 		ret = UNIT_FAIL;
 		goto done;
 	}
@@ -373,6 +414,8 @@ done:
 	nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
 	g->ecc.ltc.ecc_sec_count = save_sec_ptr;
 	g->ecc.ltc.ecc_ded_count = save_ded_ptr;
+	g->ecc.ltc.tstg_ecc_parity_count = save_tstg_ecc_ptr;
+	g->ecc.ltc.dstg_be_ecc_parity_count = save_dstg_ecc_ptr;
 
 	nvgpu_gr_free(g);
 	return ret;
@@ -464,105 +507,102 @@ int test_ltc_intr(struct unit_module *m, struct gk20a *g, void *args)
 		goto done;
 	}
 
+	err = NVGPU_ECC_COUNTER_INIT_PER_LTS(tstg_ecc_parity_count);
+	if (err != 0) {
+		unit_err(m, "failed to init tstg_ecc_parity_count\n");
+		err = UNIT_FAIL;
+		goto done;
+	}
+
+	err = NVGPU_ECC_COUNTER_INIT_PER_LTS(dstg_be_ecc_parity_count);
+	if (err != 0) {
+		unit_err(m, "failed to init dstg_be_ecc_parity_count\n");
+		err = UNIT_FAIL;
+		goto done;
+	}
+
 	/* test with no intr pending */
 	g->ops.ltc.intr.isr(g, 0);
 
+	/* test with corrected intr, expect BUG */
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
+		ltc_ltcs_ltss_intr3_ecc_corrected_m());
+	EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
+
 	/* test with intr, but no corrected or uncorrected bits */
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	g->ops.ltc.intr.isr(g, 0);
 
-	/* set corrected & uncorrected overflow bits */
+	/* set uncorrected overflow bits */
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m() |
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	g->ops.ltc.intr.isr(g, 0);
 
-	/* set corrected & uncorrected overflow bits in second instance */
+	/* set uncorrected overflow bits in second instance */
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r() + offset1,
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m() |
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r() + offset1,
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	g->ops.ltc.intr.isr(g, 0);
 
-	/* set corrected overflow bit independently for branch coverage */
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m());
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
-		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
-	g->ops.ltc.intr.isr(g, 0);
-
-	/* set uncorrected overflow bit independently for branch coverage */
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
-		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
-	g->ops.ltc.intr.isr(g, 0);
-
 	/*
-	 * Clear the corrected & uncorrected overflow bits. And for branch
-	 * coverage, set the uncorrected & corrected err counts.
+	 * Clear the uncorrected overflow bits. And for branch
+	 * coverage, set the uncorrected err count.
 	 */
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(), 0x0);
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
 		ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	g->ops.ltc.intr.isr(g, 0);
 
-	/* set dstg bits with data RAM */
+	/* set rstg bits */
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m() |
-		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m());
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
-		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
-	g->ops.ltc.intr.isr(g, 0);
-
-	/* set dstg bits with byte enable (BE) RAM */
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m() |
-		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m());
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_dstg_ecc_address_r(),
-		ltc_ltc0_lts0_dstg_ecc_address_info_ram_m());
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
-		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
-	EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
-
-	/* set tstg & rstg bits */
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() |
-		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m() |
-		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() |
 		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
 
-	/* set sec & ded error bits */
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
-		ltc_ltcs_ltss_intr_ecc_sec_error_pending_f() |
-		ltc_ltcs_ltss_intr_ecc_ded_error_pending_f());
+	/* set tstg bits */
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
+		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	g->ops.ltc.intr.isr(g, 0);
 
-	/* For branch coverage, set sec & ded error bits and make l2 flush succeed */
-	save_func = g->ops.mm.cache.l2_flush;
-	g->ops.mm.cache.l2_flush = mock_l2_flush;
-	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
-		ltc_ltcs_ltss_intr_ecc_sec_error_pending_f() |
-		ltc_ltcs_ltss_intr_ecc_ded_error_pending_f());
+	/* set dstg bits */
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
+		ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m());
 	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
 		ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
 	g->ops.ltc.intr.isr(g, 0);
+
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(), 0);
+
+	/* set sec error bits */
+	save_func = g->ops.mm.cache.l2_flush;
+	g->ops.mm.cache.l2_flush = mock_l2_flush;
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
+		ltc_ltcs_ltss_intr_ecc_sec_error_pending_f());
+	g->ops.ltc.intr.isr(g, 0);
 	g->ops.mm.cache.l2_flush = save_func;
 
+	/* set ded error bits */
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
+		ltc_ltcs_ltss_intr_ecc_ded_error_pending_f());
+	g->ops.ltc.intr.isr(g, 0);
+
+	/* For branch coverage, set sec error bits and make l2 flush fail */
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
+		ltc_ltcs_ltss_intr_ecc_sec_error_pending_f());
+	EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
+
+	nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(), 0);
+
 done:
 	nvgpu_ltc_ecc_free(g);
diff --git a/userspace/units/ltc/nvgpu-ltc.h b/userspace/units/ltc/nvgpu-ltc.h
index 26ad20a43..0c6683172 100644
--- a/userspace/units/ltc/nvgpu-ltc.h
+++ b/userspace/units/ltc/nvgpu-ltc.h
@@ -78,15 +78,10 @@ int test_ltc_init_support(struct unit_module *m,
 *   the failure paths.
 * - Save the current ecc count pointers from the gk20a struct and set the gk20a
 *   pointers to NULL.
-* - Setup kmem fault injection to trigger fault on allocation for first alloc.
-* - Call ltc ecc counter init and verify error is returned.
-* - Setup kmem fault injection to trigger fault on allocation for third alloc
-*   to validate failures to allocate on second dimension of array.
-* - Call ltc ecc counter init and verify error is returned.
-* - Re-init ecc support.
-* - Setup kmem fault injection to trigger fault on allocation for fifth alloc
-*   to validate failures to allocate for second ltc ecc stat.
-* - Call ltc ecc counter init and verify error is returned.
+* - Do the following to check fault handling while allocating the ECC counters for SEC, DED, TSTG and DSTG BE:
+*   - Re-init ecc support.
+*   - Setup kmem fault injection to trigger a fault on allocation for a particular ECC counter.
+*   - Call ltc ecc counter init and verify error is returned.
 * - Re-init ecc support.
 * - Disable kmem fault injection.
 * - Call ltc ecc counter init and verify no error is returned.
@@ -180,64 +175,52 @@ int test_ltc_remove_support(struct unit_module *m,
 *
 * Steps:
 * - Allocate ECC stat counter objects used by handler (ecc_sec_count,
-*   ecc_ded_count).
+*   ecc_ded_count, tstg_ecc_parity_count, dstg_be_ecc_parity_count).
 * - Test LTC isr with no interrupts pending.
-* - Test with corrected and uncorrected bits in the first LTC instances.
-*   - Set the corrected & uncorrected counter overflow bits in the first
+* - Test LTC isr with corrected interrupt. Expect BUG.
+* - Test with uncorrected bits in the first LTC instance.
+*   - Set the uncorrected counter overflow bits in the first
 *     ecc_status register (NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_STATUS).
 *   - Set the interrupt pending bit in the first LTC interrupt register
 *     (NV_PLTCG_LTC0_LTS0_INTR).
 *   - Call the LTC isr.
-* - Test with corrected and uncorrected bits in the second LTC instance.
-*   - Set the corrected & uncorrected counter overflow bits in the second
+* - Test with uncorrected bits in the second LTC instance.
+*   - Set the uncorrected counter overflow bits in the second
 *     ecc_status register.
 *   - Set the interrupt pending bit in the second LTC interrupt register.
 *   - Call the LTC isr.
-* - Test with corrected bits only (for branch coverage).
-*   - Set the corrected counter overflow bit and not the uncorrected bit in
-*     the ecc_status register.
-*   - Set the interrupt pending bit in the LTC interrupt register.
-*   - Call the LTC isr.
-* - Test with uncorrected bits only (for branch coverage).
-*   - Set the uncorrected counter overflow bit and not the corrected bit in
-*     the ecc_status register.
-*   - Set the interrupt pending bit in the LTC interrupt register.
-*   - Call the LTC isr.
-* - Test with corrected and uncorrected error counts but without err bits (for
+* - Test with uncorrected error counts but without err bits (for
 *   branch coverage).
-*   - Clear the corrected & uncorrected counter overflow bits in the second
-*     ecc_status register.
-*   - Write values to the corrected & uncorrected count registers.
-*   - Set the interrupt pending bit in the second LTC interrupt register.
+*   - Clear the uncorrected counter overflow bits in the ecc_status register.
+*   - Write values to the uncorrected count registers.
+*   - Set the interrupt pending bit in the LTC interrupt register.
 *   - Call the LTC isr.
-* - Test handling of dstg error in data RAM.
-*   - Set the dstg corrected & uncorrected error bits in the ecc_status
-*     register.
-*   - Set the dstg RAM mask field of the dstg_ecc_address register
-*     (NV_PLTCG_LTC0_LTS0_DSTG_ECC_ADDRESS) to report data RAM.
-*   - Set the interrupt pending bit in the first LTC interrupt register.
+* - Test handling of rstg error.
+*   - Set the rstg uncorrected counter error bits in the ecc_status register.
+*   - Set the interrupt pending bit in the LTC interrupt register.
 *   - Call the LTC isr.
-* - Test handling of dstg error in byte enable (BE) RAM.
-*   - Set the dstg corrected & uncorrected error bits in the ecc_status
-*     register.
-*   - Set the dstg RAM mask field of the dstg_ecc_address register to report
-*     BE RAM.
-*   - Set the interrupt pending bit in the first LTC interrupt register.
+*   - Expect BUG.
+* - Test handling of tstg errors.
+*   - Set the tstg uncorrected counter error bits in the ecc_status register.
+*   - Set the interrupt pending bit in the LTC interrupt register.
 *   - Call the LTC isr.
-* - Test handling of tstg and rstg errors.
-*   - Set the tstg and rstg, corrected & uncorrected counter error bits in the
-*     ecc_status register.
-*   - Set the interrupt pending bit in the first LTC interrupt register.
+* - Test handling of dstg errors.
+*   - Set the dstg uncorrected counter error bits in the ecc_status register.
+*   - Set the interrupt pending bit in the LTC interrupt register.
 *   - Call the LTC isr.
-* - Test handling of sec and ded errors.
-*   - Set the sec and ded pending error bits in the ecc_status register.
-*   - Set the interrupt pending bit in the first LTC interrupt register.
-*   - Call the LTC isr.
-* - Test handling of sec and ded errors when the l2 flush API succeeds (for
-*   branch coverage).
+* - Test handling of sec error when the l2 flush API succeeds
 *   - Override the MM l2_flush HAL to return success.
-*   - Set the sec and ded pending error bits in the ecc_status register.
-*   - Set the interrupt pending bit in the first LTC interrupt register.
+*   - Set the sec pending error bits in the ecc_status register.
+*   - Set the interrupt pending bit in the LTC interrupt register.
+*   - Call the LTC isr.
+* - Test handling of ded error.
+*   - Set the ded pending error bits in the ecc_status register.
+*   - Set the interrupt pending bit in the LTC interrupt register.
+*   - Call the LTC isr.
+* - Test handling of sec error when the l2 flush API fails (for
+*   branch coverage).
+*   - Set the sec pending error bits in the ecc_status register.
+*   - Set the interrupt pending bit in the LTC interrupt register.
 *   - Call the LTC isr.
 *
 * Output: Returns PASS unless counter initialization fails or an except occurs
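--- Reviewer note (not part of the patch) ---

The interrupt handlers above all follow the same bookkeeping pattern: read the hardware error-count delta, widen the delta by one full counter range when the status register reports a total-counter overflow, and then fold it into the per-slice software counter with wrapping (modular) arithmetic so the accumulation itself cannot trap on overflow. The C sketch below is a minimal, self-contained illustration of that pattern only; wrapping_add_u32() and accumulate_uncorrected() are hypothetical stand-ins written for this note (wrapping_add_u32() models what nvgpu_wrapping_add_u32() appears to do, i.e. modular u32 addition) and are not the nvgpu implementations.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in: modular (wrapping) u32 addition. */
static uint32_t wrapping_add_u32(uint32_t a, uint32_t b)
{
	return (uint32_t)(a + b); /* unsigned overflow wraps by definition */
}

/*
 * Accumulate an uncorrected-error delta into a running software counter.
 * If the hardware total-counter overflow bit was set, the delta is first
 * widened by one full counter range (1 << counter_width), mirroring the
 * BIT32(..._err_count_total_s()) adjustment in the handlers above.
 */
static uint32_t accumulate_uncorrected(uint32_t counter, uint32_t hw_delta,
		int overflowed, unsigned int counter_width)
{
	if (overflowed) {
		hw_delta = wrapping_add_u32(hw_delta,
				(uint32_t)1U << counter_width);
	}
	return wrapping_add_u32(counter, hw_delta);
}

int main(void)
{
	uint32_t tstg_parity_count = 0U;

	/* e.g. 3 new errors reported and a 16-bit hardware counter overflowed once */
	tstg_parity_count = accumulate_uncorrected(tstg_parity_count, 3U, 1, 16);
	printf("tstg parity count: %u\n", (unsigned int)tstg_parity_count); /* 65539 */
	return 0;
}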