mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: fix ltc isr, unit tests
The LTC ISR doesn't handle ECC errors correctly. INTR3 reports only parity ECC errors, while INTR reports SEC/DED ECC errors, but nvgpu tracked both kinds of errors with the same counters. Fix this as per the Volta ECC HW Functional Description. JIRA NVGPU-6982 Change-Id: I6ddaab55f7e1354ad9b832672a9006b7e58df9f7 Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2605012 (cherry picked from commit 5f92651e921b17cb61bbbb8954128c787cd89238) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2632548 Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
449a4823d4
commit
c463810bcd
@@ -226,6 +226,14 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
|
|||||||
nvgpu_kfree(g, ecc->ltc.ecc_ded_count[ltc]);
|
nvgpu_kfree(g, ecc->ltc.ecc_ded_count[ltc]);
|
||||||
ecc->ltc.ecc_ded_count[ltc] = NULL;
|
ecc->ltc.ecc_ded_count[ltc] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ecc->ltc.tstg_ecc_parity_count != NULL) {
|
||||||
|
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count[ltc]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ecc->ltc.dstg_be_ecc_parity_count != NULL) {
|
||||||
|
nvgpu_kfree(g, ecc->ltc.dstg_be_ecc_parity_count[ltc]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nvgpu_kfree(g, ecc->ltc.ecc_sec_count);
|
nvgpu_kfree(g, ecc->ltc.ecc_sec_count);
|
||||||
@@ -233,4 +241,10 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
|
|||||||
|
|
||||||
nvgpu_kfree(g, ecc->ltc.ecc_ded_count);
|
nvgpu_kfree(g, ecc->ltc.ecc_ded_count);
|
||||||
ecc->ltc.ecc_ded_count = NULL;
|
ecc->ltc.ecc_ded_count = NULL;
|
||||||
|
|
||||||
|
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count);
|
||||||
|
ecc->ltc.tstg_ecc_parity_count = NULL;
|
||||||
|
|
||||||
|
nvgpu_kfree(g, ecc->ltc.dstg_be_ecc_parity_count);
|
||||||
|
ecc->ltc.dstg_be_ecc_parity_count = NULL;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -391,7 +391,7 @@ void ga10b_ltc_intr_configure(struct gk20a *g)
|
|||||||
static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
||||||
u32 offset, u32 ltc_intr3)
|
u32 offset, u32 ltc_intr3)
|
||||||
{
|
{
|
||||||
u32 ecc_status, ecc_addr, dstg_ecc_addr, corrected_cnt, uncorrected_cnt;
|
u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
|
||||||
u32 corrected_delta, uncorrected_delta;
|
u32 corrected_delta, uncorrected_delta;
|
||||||
u32 corrected_overflow, uncorrected_overflow;
|
u32 corrected_overflow, uncorrected_overflow;
|
||||||
|
|
||||||
@@ -404,8 +404,6 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
|||||||
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
|
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
|
||||||
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
|
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
|
||||||
dstg_ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
|
||||||
ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
|
|
||||||
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
||||||
offset));
|
offset));
|
||||||
@@ -425,7 +423,6 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
|||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
|
||||||
|
|
||||||
gv11b_ltc_intr_init_counters(g,
|
gv11b_ltc_intr_init_counters(g,
|
||||||
corrected_delta, corrected_overflow,
|
|
||||||
uncorrected_delta, uncorrected_overflow, offset);
|
uncorrected_delta, uncorrected_overflow, offset);
|
||||||
|
|
||||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||||
@@ -465,14 +462,16 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
|||||||
}
|
}
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
|
gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, ecc_addr);
|
ecc_status, ecc_addr,
|
||||||
|
uncorrected_delta);
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
|
gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, ecc_addr);
|
ecc_status, ecc_addr,
|
||||||
|
uncorrected_delta);
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
|
gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, dstg_ecc_addr,
|
ecc_status, ecc_addr,
|
||||||
ecc_addr);
|
uncorrected_delta);
|
||||||
|
|
||||||
if ((corrected_overflow != 0U) ||
|
if ((corrected_overflow != 0U) ||
|
||||||
(uncorrected_overflow != 0U)) {
|
(uncorrected_overflow != 0U)) {
|
||||||
|
|||||||
@@ -36,15 +36,16 @@ void gv11b_ltc_intr_en_illegal_compstat(struct gk20a *g, bool enable);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
void gv11b_ltc_intr_init_counters(struct gk20a *g,
|
void gv11b_ltc_intr_init_counters(struct gk20a *g,
|
||||||
u32 corrected_delta, u32 corrected_overflow,
|
|
||||||
u32 uncorrected_delta, u32 uncorrected_overflow,
|
u32 uncorrected_delta, u32 uncorrected_overflow,
|
||||||
u32 offset);
|
u32 offset);
|
||||||
void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
|
void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr);
|
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
|
||||||
|
u32 uncorrected_delta);
|
||||||
void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
|
void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr);
|
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
|
||||||
|
u32 uncorrected_delta);
|
||||||
void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
|
void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice, u32 ecc_status, u32 dstg_ecc_addr,
|
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
|
||||||
u32 ecc_addr);
|
u32 uncorrected_delta);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -83,16 +83,9 @@ void gv11b_ltc_intr_en_illegal_compstat(struct gk20a *g, bool enable)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
void gv11b_ltc_intr_init_counters(struct gk20a *g,
|
void gv11b_ltc_intr_init_counters(struct gk20a *g,
|
||||||
u32 corrected_delta, u32 corrected_overflow,
|
|
||||||
u32 uncorrected_delta, u32 uncorrected_overflow,
|
u32 uncorrected_delta, u32 uncorrected_overflow,
|
||||||
u32 offset)
|
u32 offset)
|
||||||
{
|
{
|
||||||
if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
|
|
||||||
nvgpu_writel(g,
|
|
||||||
nvgpu_safe_add_u32(
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
|
||||||
offset), 0);
|
|
||||||
}
|
|
||||||
if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
|
if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
|
||||||
nvgpu_writel(g,
|
nvgpu_writel(g,
|
||||||
nvgpu_safe_add_u32(
|
nvgpu_safe_add_u32(
|
||||||
@@ -102,17 +95,9 @@ void gv11b_ltc_intr_init_counters(struct gk20a *g,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
|
void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr)
|
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
|
||||||
|
u32 uncorrected_delta)
|
||||||
{
|
{
|
||||||
if ((ecc_status &
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m())
|
|
||||||
!= 0U) {
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected");
|
|
||||||
/* This error is not expected to occur in gv11b and hence,
|
|
||||||
* this scenario is considered as a fatal error.
|
|
||||||
*/
|
|
||||||
BUG();
|
|
||||||
}
|
|
||||||
if ((ecc_status &
|
if ((ecc_status &
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m())
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m())
|
||||||
!= 0U) {
|
!= 0U) {
|
||||||
@@ -122,86 +107,59 @@ void gv11b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
|
|||||||
*/
|
*/
|
||||||
BUG();
|
BUG();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
|
void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr)
|
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
|
||||||
|
u32 uncorrected_delta)
|
||||||
{
|
{
|
||||||
if ((ecc_status &
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m())
|
|
||||||
!= 0U) {
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected");
|
|
||||||
/* This error is not expected to occur in gv11b and hence,
|
|
||||||
* this scenario is considered as a fatal error.
|
|
||||||
*/
|
|
||||||
BUG();
|
|
||||||
}
|
|
||||||
if ((ecc_status &
|
if ((ecc_status &
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m())
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m())
|
||||||
!= 0U) {
|
!= 0U) {
|
||||||
|
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter =
|
||||||
|
nvgpu_wrapping_add_u32(
|
||||||
|
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
|
||||||
|
uncorrected_delta);
|
||||||
|
|
||||||
nvgpu_report_ecc_err(g,
|
nvgpu_report_ecc_err(g,
|
||||||
NVGPU_ERR_MODULE_LTC,
|
NVGPU_ERR_MODULE_LTC,
|
||||||
(ltc << 8U) | slice,
|
(ltc << 8U) | slice,
|
||||||
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
|
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
|
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
|
||||||
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
|
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
|
void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice, u32 ecc_status, u32 dstg_ecc_addr,
|
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
|
||||||
u32 ecc_addr)
|
u32 uncorrected_delta)
|
||||||
{
|
{
|
||||||
|
|
||||||
if ((ecc_status &
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m())
|
|
||||||
!= 0U) {
|
|
||||||
if ((dstg_ecc_addr &
|
|
||||||
ltc_ltc0_lts0_dstg_ecc_address_info_ram_m())
|
|
||||||
== 0U) {
|
|
||||||
nvgpu_report_ecc_err(g,
|
|
||||||
NVGPU_ERR_MODULE_LTC,
|
|
||||||
(ltc << 8U) | slice,
|
|
||||||
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
|
|
||||||
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
|
|
||||||
} else {
|
|
||||||
/* This error is not expected to occur in gv11b and
|
|
||||||
* hence, this scenario is considered as a fatal error.
|
|
||||||
*/
|
|
||||||
BUG();
|
|
||||||
}
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected");
|
|
||||||
}
|
|
||||||
if ((ecc_status &
|
if ((ecc_status &
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m())
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m())
|
||||||
!= 0U) {
|
!= 0U) {
|
||||||
if ((dstg_ecc_addr &
|
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter =
|
||||||
ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) {
|
nvgpu_wrapping_add_u32(
|
||||||
nvgpu_report_ecc_err(g,
|
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
|
||||||
NVGPU_ERR_MODULE_LTC,
|
uncorrected_delta);
|
||||||
(ltc << 8U) | slice,
|
|
||||||
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
|
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
|
|
||||||
} else {
|
|
||||||
nvgpu_report_ecc_err(g,
|
nvgpu_report_ecc_err(g,
|
||||||
NVGPU_ERR_MODULE_LTC,
|
NVGPU_ERR_MODULE_LTC,
|
||||||
(ltc << 8U) | slice,
|
(ltc << 8U) | slice,
|
||||||
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
|
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
|
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
|
||||||
}
|
nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");
|
||||||
nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
|
static void gv11b_ltc_intr_handle_ecc_parity_interrupts(struct gk20a *g,
|
||||||
u32 ltc, u32 slice)
|
u32 ltc, u32 slice)
|
||||||
{
|
{
|
||||||
u32 offset;
|
u32 offset;
|
||||||
u32 ltc_intr3;
|
u32 ltc_intr3;
|
||||||
u32 ecc_status, ecc_addr, dstg_ecc_addr, corrected_cnt, uncorrected_cnt;
|
u32 ecc_status, ecc_addr, uncorrected_cnt;
|
||||||
u32 corrected_delta, uncorrected_delta;
|
u32 uncorrected_delta;
|
||||||
u32 corrected_overflow, uncorrected_overflow;
|
u32 uncorrected_overflow;
|
||||||
u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
|
u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
|
||||||
u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE);
|
u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE);
|
||||||
|
|
||||||
@@ -210,39 +168,42 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
|
|||||||
ltc_intr3 = nvgpu_readl(g, nvgpu_safe_add_u32(
|
ltc_intr3 = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_intr3_r(), offset));
|
ltc_ltc0_lts0_intr3_r(), offset));
|
||||||
|
|
||||||
/* Detect and handle ECC PARITY errors */
|
nvgpu_log(g, gpu_dbg_intr,
|
||||||
if ((ltc_intr3 &
|
"ltc:%u lts: %u cache ecc interrupt intr3: 0x%08x",
|
||||||
(ltc_ltcs_ltss_intr3_ecc_uncorrected_m() |
|
ltc, slice, ltc_intr3);
|
||||||
ltc_ltcs_ltss_intr3_ecc_corrected_m())) != 0U) {
|
|
||||||
|
|
||||||
|
/* Corrected ECC parity errors not expected */
|
||||||
|
if ((ltc_intr3 & ltc_ltcs_ltss_intr3_ecc_corrected_m()) != 0U) {
|
||||||
|
nvgpu_err(g, "corrected parity error not expected");
|
||||||
|
/* This error is not expected to occur in gv11b and hence,
|
||||||
|
* this scenario is considered as a fatal error.
|
||||||
|
*/
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Detect and handle uncorrected ECC PARITY errors */
|
||||||
|
if ((ltc_intr3 & ltc_ltcs_ltss_intr3_ecc_uncorrected_m()) != 0U) {
|
||||||
ecc_status = nvgpu_readl(g,
|
ecc_status = nvgpu_readl(g,
|
||||||
nvgpu_safe_add_u32(
|
nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
|
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
|
||||||
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
|
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
|
||||||
dstg_ecc_addr = nvgpu_readl(g,
|
|
||||||
nvgpu_safe_add_u32(
|
nvgpu_log(g, gpu_dbg_intr,
|
||||||
ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
|
"ecc status 0x%08x error address: 0x%08x",
|
||||||
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
ecc_status, ecc_addr);
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
|
||||||
offset));
|
|
||||||
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
|
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
|
||||||
offset));
|
offset));
|
||||||
|
|
||||||
corrected_delta =
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(
|
|
||||||
corrected_cnt);
|
|
||||||
uncorrected_delta =
|
uncorrected_delta =
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
|
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
|
||||||
corrected_overflow = ecc_status &
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m();
|
|
||||||
|
|
||||||
uncorrected_overflow = ecc_status &
|
uncorrected_overflow = ecc_status &
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
|
||||||
|
|
||||||
gv11b_ltc_intr_init_counters(g,
|
gv11b_ltc_intr_init_counters(g,
|
||||||
corrected_delta, corrected_overflow,
|
|
||||||
uncorrected_delta, uncorrected_overflow, offset);
|
uncorrected_delta, uncorrected_overflow, offset);
|
||||||
|
|
||||||
nvgpu_writel(g,
|
nvgpu_writel(g,
|
||||||
@@ -251,60 +212,142 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
|
|||||||
ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f());
|
ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f());
|
||||||
|
|
||||||
/* update counters per slice */
|
/* update counters per slice */
|
||||||
if (corrected_overflow != 0U) {
|
|
||||||
corrected_delta += BIT32(
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s());
|
|
||||||
}
|
|
||||||
if (uncorrected_overflow != 0U) {
|
if (uncorrected_overflow != 0U) {
|
||||||
uncorrected_delta += BIT32(
|
nvgpu_info(g, "ecc counter overflow!");
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s());
|
uncorrected_delta =
|
||||||
}
|
nvgpu_wrapping_add_u32(uncorrected_delta,
|
||||||
|
BIT32(ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s()));
|
||||||
g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
|
|
||||||
nvgpu_safe_add_u32(
|
|
||||||
g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
|
|
||||||
corrected_delta);
|
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
|
|
||||||
nvgpu_safe_add_u32(
|
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
|
|
||||||
uncorrected_delta);
|
|
||||||
nvgpu_log(g, gpu_dbg_intr,
|
|
||||||
"ltc:%d lts: %d cache ecc interrupt intr: 0x%x",
|
|
||||||
ltc, slice, ltc_intr3);
|
|
||||||
|
|
||||||
/* This check has been added to ensure that the slice id is less
|
|
||||||
* than 8-bits and hence, it can be packed as part of LSB 8-bits
|
|
||||||
* along with the LTC id while reporting LTC related ECC errors.
|
|
||||||
*/
|
|
||||||
if (slice > U8_MAX) {
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "Invalid slice id=%d",
|
|
||||||
slice);
|
|
||||||
slice = slice & 0xFFU;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
|
gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, ecc_addr);
|
ecc_status, ecc_addr,
|
||||||
|
uncorrected_delta);
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
|
gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, ecc_addr);
|
ecc_status, ecc_addr,
|
||||||
|
uncorrected_delta);
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
|
gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, dstg_ecc_addr,
|
ecc_status, ecc_addr,
|
||||||
ecc_addr);
|
uncorrected_delta);
|
||||||
|
|
||||||
if ((corrected_overflow != 0U) ||
|
|
||||||
(uncorrected_overflow != 0U)) {
|
|
||||||
nvgpu_info(g, "ecc counter overflow!");
|
|
||||||
}
|
|
||||||
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "ecc error address: 0x%x", ecc_addr);
|
|
||||||
|
|
||||||
nvgpu_writel(g,
|
nvgpu_writel(g,
|
||||||
nvgpu_safe_add_u32(ltc_ltc0_lts0_intr3_r(), offset),
|
nvgpu_safe_add_u32(ltc_ltc0_lts0_intr3_r(), offset),
|
||||||
ltc_intr3);
|
ltc_intr3);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
gp10b_ltc_intr_handle_lts_interrupts(g, ltc, slice);
|
static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 ltc, u32 slice)
|
||||||
|
{
|
||||||
|
u32 offset;
|
||||||
|
u32 ltc_intr;
|
||||||
|
u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
|
||||||
|
u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE);
|
||||||
|
|
||||||
|
offset = nvgpu_safe_add_u32(nvgpu_safe_mult_u32(ltc_stride, ltc),
|
||||||
|
nvgpu_safe_mult_u32(lts_stride, slice));
|
||||||
|
ltc_intr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_intr_r(), offset));
|
||||||
|
|
||||||
|
nvgpu_log(g, gpu_dbg_intr,
|
||||||
|
"ltc:%u lts: %u cache ecc interrupt intr: 0x%08x",
|
||||||
|
ltc, slice, ltc_intr);
|
||||||
|
|
||||||
|
/* Detect and handle SEC ECC errors */
|
||||||
|
if ((ltc_intr &
|
||||||
|
ltc_ltcs_ltss_intr_ecc_sec_error_pending_f()) != 0U) {
|
||||||
|
u32 ecc_stats_reg_val;
|
||||||
|
u32 dstg_ecc_addr;
|
||||||
|
|
||||||
|
ecc_stats_reg_val =
|
||||||
|
nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_report_r(), offset));
|
||||||
|
dstg_ecc_addr = nvgpu_readl(g,
|
||||||
|
nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
|
||||||
|
|
||||||
|
nvgpu_err(g, "Single bit error detected in GPU L2!");
|
||||||
|
nvgpu_err(g, "ecc_report_r: %08x dstg_ecc_addr: %08x",
|
||||||
|
ecc_stats_reg_val, dstg_ecc_addr);
|
||||||
|
|
||||||
|
g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
|
||||||
|
nvgpu_wrapping_add_u32(
|
||||||
|
g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_report_sec_count_v(
|
||||||
|
ecc_stats_reg_val));
|
||||||
|
ecc_stats_reg_val &=
|
||||||
|
~(ltc_ltc0_lts0_dstg_ecc_report_sec_count_m());
|
||||||
|
nvgpu_writel(g,
|
||||||
|
nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
|
||||||
|
ecc_stats_reg_val);
|
||||||
|
|
||||||
|
nvgpu_report_ecc_err(g,
|
||||||
|
NVGPU_ERR_MODULE_LTC,
|
||||||
|
(ltc << 8U) | slice,
|
||||||
|
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, dstg_ecc_addr,
|
||||||
|
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Using a SEC code will allow correction of an SBE (Single Bit
|
||||||
|
* Error). But the current HW doesn't have the ability to clear
|
||||||
|
* out the SBE from the RAMs for a read access. So before the
|
||||||
|
* SBE turns into a DBE (Double Bit Error), a SW flush is
|
||||||
|
* preferred.
|
||||||
|
*/
|
||||||
|
if (g->ops.mm.cache.l2_flush(g, true) != 0) {
|
||||||
|
nvgpu_err(g, "l2_flush failed");
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Detect and handle DED ECC errors */
|
||||||
|
if ((ltc_intr &
|
||||||
|
ltc_ltcs_ltss_intr_ecc_ded_error_pending_f()) != 0U) {
|
||||||
|
u32 ecc_stats_reg_val;
|
||||||
|
u32 dstg_ecc_addr;
|
||||||
|
|
||||||
|
ecc_stats_reg_val =
|
||||||
|
nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_report_r(), offset));
|
||||||
|
dstg_ecc_addr = nvgpu_readl(g,
|
||||||
|
nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_address_r(), offset));
|
||||||
|
|
||||||
|
nvgpu_err(g, "Double bit error detected in GPU L2!");
|
||||||
|
nvgpu_err(g, "ecc_report_r: %08x dstg_ecc_addr: %08x",
|
||||||
|
ecc_stats_reg_val, dstg_ecc_addr);
|
||||||
|
|
||||||
|
g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
|
||||||
|
nvgpu_wrapping_add_u32(
|
||||||
|
g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_report_ded_count_v(
|
||||||
|
ecc_stats_reg_val));
|
||||||
|
ecc_stats_reg_val &=
|
||||||
|
~(ltc_ltc0_lts0_dstg_ecc_report_ded_count_m());
|
||||||
|
nvgpu_writel(g,
|
||||||
|
nvgpu_safe_add_u32(
|
||||||
|
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
|
||||||
|
ecc_stats_reg_val);
|
||||||
|
|
||||||
|
nvgpu_report_ecc_err(g,
|
||||||
|
NVGPU_ERR_MODULE_LTC,
|
||||||
|
(ltc << 8U) | slice,
|
||||||
|
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, dstg_ecc_addr,
|
||||||
|
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
|
||||||
|
}
|
||||||
|
|
||||||
|
nvgpu_writel(g, nvgpu_safe_add_u32(ltc_ltc0_lts0_intr_r(), offset),
|
||||||
|
ltc_intr);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
|
||||||
|
u32 ltc, u32 slice)
|
||||||
|
{
|
||||||
|
gv11b_ltc_intr_handle_ecc_parity_interrupts(g, ltc, slice);
|
||||||
|
|
||||||
|
gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(g, ltc, slice);
|
||||||
}
|
}
|
||||||
|
|
||||||
void gv11b_ltc_intr_isr(struct gk20a *g, u32 ltc)
|
void gv11b_ltc_intr_isr(struct gk20a *g, u32 ltc)
|
||||||
|
|||||||
@@ -70,6 +70,16 @@ int gv11b_lts_ecc_init(struct gk20a *g)
|
|||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = NVGPU_ECC_COUNTER_INIT_PER_LTS(tstg_ecc_parity_count);
|
||||||
|
if (err != 0) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = NVGPU_ECC_COUNTER_INIT_PER_LTS(dstg_be_ecc_parity_count);
|
||||||
|
if (err != 0) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
done:
|
done:
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
||||||
|
|||||||
@@ -204,9 +204,13 @@ struct nvgpu_ecc {
|
|||||||
* unit.
|
* unit.
|
||||||
*/
|
*/
|
||||||
struct {
|
struct {
|
||||||
/** ltc-lts sec count. */
|
/** L2 cache slice TSTG ECC PARITY error count. */
|
||||||
|
struct nvgpu_ecc_stat **tstg_ecc_parity_count;
|
||||||
|
/** L2 cache slice DSTG BE ECC PARITY error count. */
|
||||||
|
struct nvgpu_ecc_stat **dstg_be_ecc_parity_count;
|
||||||
|
/** L2 cache slice SEC error count. */
|
||||||
struct nvgpu_ecc_stat **ecc_sec_count;
|
struct nvgpu_ecc_stat **ecc_sec_count;
|
||||||
/** ltc-lts ded count. */
|
/** L2 cache slice DED error count. */
|
||||||
struct nvgpu_ecc_stat **ecc_ded_count;
|
struct nvgpu_ecc_stat **ecc_ded_count;
|
||||||
} ltc;
|
} ltc;
|
||||||
|
|
||||||
|
|||||||
@@ -284,12 +284,38 @@ static void nvgpu_init_gr_manager(struct gk20a *g)
|
|||||||
gr_syspipe->num_gpc = 1;
|
gr_syspipe->num_gpc = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ltc_ecc_init_fault_check(struct unit_module *m, struct gk20a *g,
|
||||||
|
unsigned int number)
|
||||||
|
{
|
||||||
|
struct nvgpu_posix_fault_inj *kmem_fi =
|
||||||
|
nvgpu_kmem_get_fault_injection();
|
||||||
|
int err;
|
||||||
|
|
||||||
|
/* Re-Init dependent ECC unit */
|
||||||
|
err = nvgpu_ecc_init_support(g);
|
||||||
|
if (err != 0) {
|
||||||
|
unit_err(m, "ecc init failed\n");
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
nvgpu_posix_enable_fault_injection(kmem_fi, true, number);
|
||||||
|
err = g->ops.ltc.ecc_init(g);
|
||||||
|
if (err == 0) {
|
||||||
|
unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
|
int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
|
||||||
{
|
{
|
||||||
int ret = UNIT_SUCCESS;
|
int ret = UNIT_SUCCESS;
|
||||||
int err;
|
int err;
|
||||||
struct nvgpu_ecc_stat **save_sec_ptr = g->ecc.ltc.ecc_sec_count;
|
struct nvgpu_ecc_stat **save_sec_ptr = g->ecc.ltc.ecc_sec_count;
|
||||||
struct nvgpu_ecc_stat **save_ded_ptr = g->ecc.ltc.ecc_ded_count;
|
struct nvgpu_ecc_stat **save_ded_ptr = g->ecc.ltc.ecc_ded_count;
|
||||||
|
struct nvgpu_ecc_stat **save_tstg_ecc_ptr = g->ecc.ltc.tstg_ecc_parity_count;
|
||||||
|
struct nvgpu_ecc_stat **save_dstg_ecc_ptr = g->ecc.ltc.dstg_be_ecc_parity_count;
|
||||||
struct nvgpu_posix_fault_inj *kmem_fi =
|
struct nvgpu_posix_fault_inj *kmem_fi =
|
||||||
nvgpu_kmem_get_fault_injection();
|
nvgpu_kmem_get_fault_injection();
|
||||||
|
|
||||||
@@ -312,14 +338,15 @@ int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
|
|||||||
|
|
||||||
g->ecc.ltc.ecc_sec_count = NULL;
|
g->ecc.ltc.ecc_sec_count = NULL;
|
||||||
g->ecc.ltc.ecc_ded_count = NULL;
|
g->ecc.ltc.ecc_ded_count = NULL;
|
||||||
|
g->ecc.ltc.tstg_ecc_parity_count = NULL;
|
||||||
|
g->ecc.ltc.dstg_be_ecc_parity_count = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Call with failure on first kzalloc
|
* Call with failure on first kzalloc for sec_ecc_count
|
||||||
*/
|
*/
|
||||||
nvgpu_posix_enable_fault_injection(kmem_fi, true, 0);
|
err = ltc_ecc_init_fault_check(m, g, 0);
|
||||||
err = g->ops.ltc.ecc_init(g);
|
if (err) {
|
||||||
if (err == 0) {
|
unit_err(m, "sec_ecc_count alloc fault check failed\n");
|
||||||
unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
|
|
||||||
ret = UNIT_FAIL;
|
ret = UNIT_FAIL;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@@ -328,28 +355,42 @@ int test_ltc_ecc_init_free(struct unit_module *m, struct gk20a *g, void *args)
|
|||||||
* Call with failure on third kzalloc for the 2nd array dimension and to
|
* Call with failure on third kzalloc for the 2nd array dimension and to
|
||||||
* validate unrolling.
|
* validate unrolling.
|
||||||
*/
|
*/
|
||||||
nvgpu_posix_enable_fault_injection(kmem_fi, true, 2);
|
err = ltc_ecc_init_fault_check(m, g, 2);
|
||||||
err = g->ops.ltc.ecc_init(g);
|
if (err) {
|
||||||
if (err == 0) {
|
unit_err(m, "sec_ecc_count alloc for LTC 1 fault check failed\n");
|
||||||
unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
|
|
||||||
ret = UNIT_FAIL;
|
ret = UNIT_FAIL;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Re-Init dependent ECC unit */
|
/*
|
||||||
err = nvgpu_ecc_init_support(g);
|
* Call with failure on 4th kzalloc for ded_ecc_count and get more
|
||||||
if (err != 0) {
|
* branch/line coverage.
|
||||||
unit_return_fail(m, "ecc init failed\n");
|
*/
|
||||||
|
err = ltc_ecc_init_fault_check(m, g, 4);
|
||||||
|
if (err) {
|
||||||
|
unit_err(m, "dec_ecc_count alloc fault check failed\n");
|
||||||
|
ret = UNIT_FAIL;
|
||||||
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Call with failure on 4th kzalloc for second stat and get more
|
* Call with failure on 8th kzalloc for tstg_ecc_parity_count and get more
|
||||||
* branch/line coverage.
|
* branch/line coverage.
|
||||||
*/
|
*/
|
||||||
nvgpu_posix_enable_fault_injection(kmem_fi, true, 4);
|
err = ltc_ecc_init_fault_check(m, g, 8);
|
||||||
err = g->ops.ltc.ecc_init(g);
|
if (err) {
|
||||||
if (err == 0) {
|
unit_err(m, "tstg_ecc_parity_count alloc fault check failed\n");
|
||||||
unit_err(m, "nvgpu_ecc_counter_init_per_lts() failed to return error\n");
|
ret = UNIT_FAIL;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Call with failure on 11th kzalloc for dstg_be_ecc_parity_count and get more
|
||||||
|
* branch/line coverage.
|
||||||
|
*/
|
||||||
|
err = ltc_ecc_init_fault_check(m, g, 11);
|
||||||
|
if (err) {
|
||||||
|
unit_err(m, "dstg_be_ecc_parity_count alloc fault check failed\n");
|
||||||
ret = UNIT_FAIL;
|
ret = UNIT_FAIL;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@@ -373,6 +414,8 @@ done:
|
|||||||
nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
|
nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
|
||||||
g->ecc.ltc.ecc_sec_count = save_sec_ptr;
|
g->ecc.ltc.ecc_sec_count = save_sec_ptr;
|
||||||
g->ecc.ltc.ecc_ded_count = save_ded_ptr;
|
g->ecc.ltc.ecc_ded_count = save_ded_ptr;
|
||||||
|
g->ecc.ltc.tstg_ecc_parity_count = save_tstg_ecc_ptr;
|
||||||
|
g->ecc.ltc.dstg_be_ecc_parity_count = save_dstg_ecc_ptr;
|
||||||
nvgpu_gr_free(g);
|
nvgpu_gr_free(g);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
@@ -464,105 +507,102 @@ int test_ltc_intr(struct unit_module *m, struct gk20a *g, void *args)
|
|||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = NVGPU_ECC_COUNTER_INIT_PER_LTS(tstg_ecc_parity_count);
|
||||||
|
if (err != 0) {
|
||||||
|
unit_err(m, "failed to init tstg_ecc_parity_count\n");
|
||||||
|
err = UNIT_FAIL;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = NVGPU_ECC_COUNTER_INIT_PER_LTS(dstg_be_ecc_parity_count);
|
||||||
|
if (err != 0) {
|
||||||
|
unit_err(m, "failed to init dstg_be_ecc_parity_count\n");
|
||||||
|
err = UNIT_FAIL;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
/* test with no intr pending */
|
/* test with no intr pending */
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
|
/* test with corrected intr, expect BUG */
|
||||||
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
|
ltc_ltcs_ltss_intr3_ecc_corrected_m());
|
||||||
|
EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
|
||||||
|
|
||||||
/* test with intr, but no corrected or uncorrected bits */
|
/* test with intr, but no corrected or uncorrected bits */
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
/* set corrected & uncorrected overflow bits */
|
/* set uncorrected overflow bits */
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
/* set corrected & uncorrected overflow bits in second instance */
|
/* set uncorrected overflow bits in second instance */
|
||||||
nvgpu_posix_io_writel_reg_space(g,
|
nvgpu_posix_io_writel_reg_space(g,
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_r() + offset1,
|
ltc_ltc0_lts0_l2_cache_ecc_status_r() + offset1,
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r() + offset1,
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r() + offset1,
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
/* set corrected overflow bit independently for branch coverage */
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
|
||||||
|
|
||||||
/* set uncorrected overflow bit independently for branch coverage */
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Clear the corrected & uncorrected overflow bits. And for branch
|
* Clear the uncorrected overflow bits. And for branch
|
||||||
* coverage, set the uncorrected & corrected err counts.
|
* coverage, set the uncorrected err count.
|
||||||
*/
|
*/
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(), 0x0);
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(), 0x0);
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_m());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_m());
|
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_m());
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
/* set dstg bits with data RAM */
|
/* set rstg bits */
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
|
||||||
|
|
||||||
/* set dstg bits with byte enable (BE) RAM */
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_dstg_ecc_address_r(),
|
|
||||||
ltc_ltc0_lts0_dstg_ecc_address_info_ram_m());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
|
||||||
EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
|
|
||||||
|
|
||||||
/* set tstg & rstg bits */
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() |
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m());
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m());
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
|
EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
|
||||||
|
|
||||||
/* set sec & ded error bits */
|
/* set tstg bits */
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
||||||
ltc_ltcs_ltss_intr_ecc_sec_error_pending_f() |
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m());
|
||||||
ltc_ltcs_ltss_intr_ecc_ded_error_pending_f());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
/* For branch coverage, set sec & ded error bits and make l2 flush succeed */
|
/* set dstg bits */
|
||||||
save_func = g->ops.mm.cache.l2_flush;
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_l2_cache_ecc_status_r(),
|
||||||
g->ops.mm.cache.l2_flush = mock_l2_flush;
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m());
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
|
|
||||||
ltc_ltcs_ltss_intr_ecc_sec_error_pending_f() |
|
|
||||||
ltc_ltcs_ltss_intr_ecc_ded_error_pending_f());
|
|
||||||
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(),
|
||||||
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
ltc_ltcs_ltss_intr3_ecc_uncorrected_m());
|
||||||
g->ops.ltc.intr.isr(g, 0);
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr3_r(), 0);
|
||||||
|
|
||||||
|
/* set sec error bits */
|
||||||
|
save_func = g->ops.mm.cache.l2_flush;
|
||||||
|
g->ops.mm.cache.l2_flush = mock_l2_flush;
|
||||||
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
|
||||||
|
ltc_ltcs_ltss_intr_ecc_sec_error_pending_f());
|
||||||
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
g->ops.mm.cache.l2_flush = save_func;
|
g->ops.mm.cache.l2_flush = save_func;
|
||||||
|
|
||||||
|
/* set ded error bits */
|
||||||
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
|
||||||
|
ltc_ltcs_ltss_intr_ecc_ded_error_pending_f());
|
||||||
|
g->ops.ltc.intr.isr(g, 0);
|
||||||
|
|
||||||
|
/* For branch coverage, set sec error bits and make l2 flush fail */
|
||||||
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(),
|
||||||
|
ltc_ltcs_ltss_intr_ecc_sec_error_pending_f());
|
||||||
|
EXPECT_BUG(g->ops.ltc.intr.isr(g, 0));
|
||||||
|
|
||||||
|
nvgpu_posix_io_writel_reg_space(g, ltc_ltc0_lts0_intr_r(), 0);
|
||||||
|
|
||||||
done:
|
done:
|
||||||
nvgpu_ltc_ecc_free(g);
|
nvgpu_ltc_ecc_free(g);
|
||||||
|
|
||||||
|
|||||||
@@ -78,14 +78,9 @@ int test_ltc_init_support(struct unit_module *m,
|
|||||||
* the failure paths.
|
* the failure paths.
|
||||||
* - Save the current ecc count pointers from the gk20a struct and set the gk20a
|
* - Save the current ecc count pointers from the gk20a struct and set the gk20a
|
||||||
* pointers to NULL.
|
* pointers to NULL.
|
||||||
* - Setup kmem fault injection to trigger fault on allocation for first alloc.
|
* - Do following to check fault while allocating ECC counters for SEC, DED, TSTG and DSTG BE
|
||||||
* - Call ltc ecc counter init and verify error is returned.
|
|
||||||
* - Setup kmem fault injection to trigger fault on allocation for third alloc
|
|
||||||
* to validate failures to allocate on second dimension of array.
|
|
||||||
* - Call ltc ecc counter init and verify error is returned.
|
|
||||||
* - Re-init ecc support.
|
* - Re-init ecc support.
|
||||||
* - Setup kmem fault injection to trigger fault on allocation for fifth alloc
|
* - Setup kmem fault injection to trigger fault on allocation for particular ECC counter.
|
||||||
* to validate failures to allocate for second ltc ecc stat.
|
|
||||||
* - Call ltc ecc counter init and verify error is returned.
|
* - Call ltc ecc counter init and verify error is returned.
|
||||||
* - Re-init ecc support.
|
* - Re-init ecc support.
|
||||||
* - Disable kmem fault injection.
|
* - Disable kmem fault injection.
|
||||||
@@ -180,64 +175,52 @@ int test_ltc_remove_support(struct unit_module *m,
|
|||||||
*
|
*
|
||||||
* Steps:
|
* Steps:
|
||||||
* - Allocate ECC stat counter objects used by handler (ecc_sec_count,
|
* - Allocate ECC stat counter objects used by handler (ecc_sec_count,
|
||||||
* ecc_ded_count).
|
* ecc_ded_count, tstg_ecc_parity_count, dstg_be_ecc_parity_count).
|
||||||
* - Test LTC isr with no interrupts pending.
|
* - Test LTC isr with no interrupts pending.
|
||||||
* - Test with corrected and uncorrected bits in the first LTC instances.
|
* - Test LTC isr with corrected interrupt. Expect BUG.
|
||||||
* - Set the corrected & uncorrected counter overflow bits in the first
|
* - Test with uncorrected bits in the first LTC instances.
|
||||||
|
* - Set the uncorrected counter overflow bits in the first
|
||||||
* ecc_status register (NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_STATUS).
|
* ecc_status register (NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_STATUS).
|
||||||
* - Set the interrupt pending bit in the first LTC interrupt register
|
* - Set the interrupt pending bit in the first LTC interrupt register
|
||||||
* (NV_PLTCG_LTC0_LTS0_INTR).
|
* (NV_PLTCG_LTC0_LTS0_INTR).
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
* - Test with corrected and uncorrected bits in the second LTC instance.
|
* - Test with uncorrected bits in the second LTC instance.
|
||||||
* - Set the corrected & uncorrected counter overflow bits in the second
|
* - Set the uncorrected counter overflow bits in the second
|
||||||
* ecc_status register.
|
* ecc_status register.
|
||||||
* - Set the interrupt pending bit in the second LTC interrupt register.
|
* - Set the interrupt pending bit in the second LTC interrupt register.
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
* - Test with corrected bits only (for branch coverage).
|
* - Test with uncorrected error counts but without err bits (for
|
||||||
* - Set the corrected counter overflow bit and not the uncorrected bit in
|
* branch coverage).
|
||||||
* the ecc_status register.
|
* - Clear the uncorrected counter overflow bits in the ecc_status register.
|
||||||
|
* - Write values to the uncorrected count registers.
|
||||||
* - Set the interrupt pending bit in the LTC interrupt register.
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
* - Test with uncorrected bits only (for branch coverage).
|
* - Test handling of rstg error.
|
||||||
* - Set the uncorrected counter overflow bit and not the corrected bit in
|
* - Set the rstg uncorrected counter error bits in the ecc_status register.
|
||||||
* the ecc_status register.
|
|
||||||
* - Set the interrupt pending bit in the LTC interrupt register.
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
* - Test with corrected and uncorrected error counts but without err bits (for
|
* - Expect BUG.
|
||||||
* branch coverage).
|
* - Test handling of tstg errors.
|
||||||
* - Clear the corrected & uncorrected counter overflow bits in the second
|
* - Set the tstg uncorrected counter error bits in the ecc_status register.
|
||||||
* ecc_status register.
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
* - Write values to the corrected & uncorrected count registers.
|
|
||||||
* - Set the interrupt pending bit in the second LTC interrupt register.
|
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
* - Test handling of dstg error in data RAM.
|
* - Test handling of dstg errors.
|
||||||
* - Set the dstg corrected & uncorrected error bits in the ecc_status
|
* - Set the dstg uncorrected counter error bits in the ecc_status register.
|
||||||
* register.
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
* - Set the dstg RAM mask field of the dstg_ecc_address register
|
|
||||||
* (NV_PLTCG_LTC0_LTS0_DSTG_ECC_ADDRESS) to report data RAM.
|
|
||||||
* - Set the interrupt pending bit in the first LTC interrupt register.
|
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
* - Test handling of dstg error in byte enable (BE) RAM.
|
* - Test handling of sec error when the l2 flush API succeeds
|
||||||
* - Set the dstg corrected & uncorrected error bits in the ecc_status
|
|
||||||
* register.
|
|
||||||
* - Set the dstg RAM mask field of the dstg_ecc_address register to report
|
|
||||||
* BE RAM.
|
|
||||||
* - Set the interrupt pending bit in the first LTC interrupt register.
|
|
||||||
* - Call the LTC isr.
|
|
||||||
* - Test handling of tstg and rstg errors.
|
|
||||||
* - Set the tstg and rstg, corrected & uncorrected counter error bits in the
|
|
||||||
* ecc_status register.
|
|
||||||
* - Set the interrupt pending bit in the first LTC interrupt register.
|
|
||||||
* - Call the LTC isr.
|
|
||||||
* - Test handling of sec and ded errors.
|
|
||||||
* - Set the sec and ded pending error bits in the LTC interrupt register.
|
|
||||||
* - Set the interrupt pending bit in the first LTC interrupt register.
|
|
||||||
* - Call the LTC isr.
|
|
||||||
* - Test handling of sec and ded errors when the l2 flush API succeeds (for
|
|
||||||
* branch coverage).
|
|
||||||
* - Override the MM l2_flush HAL to return success.
|
* - Override the MM l2_flush HAL to return success.
|
||||||
* - Set the sec and ded pending error bits in the LTC interrupt register.
|
* - Set the sec pending error bit in the LTC interrupt register.
|
||||||
* - Set the interrupt pending bit in the first LTC interrupt register.
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
|
* - Call the LTC isr.
|
||||||
|
* - Test handling of ded error.
|
||||||
|
* - Set the ded pending error bit in the LTC interrupt register.
|
||||||
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
|
* - Call the LTC isr.
|
||||||
|
* - Test handling of sec error when the l2 flush API fails (for
|
||||||
|
* branch coverage).
|
||||||
|
* - Set the sec pending error bit in the LTC interrupt register.
|
||||||
|
* - Set the interrupt pending bit in the LTC interrupt register.
|
||||||
* - Call the LTC isr.
|
* - Call the LTC isr.
|
||||||
*
|
*
|
||||||
* Output: Returns PASS unless counter initialization fails or an exception occurs
|
* Output: Returns PASS unless counter initialization fails or an exception occurs
|
||||||
|
|||||||
Reference in New Issue
Block a user