gpu: nvgpu: ga10x: fix LTC ecc handling

Notable differences from GV11B are below:
  1. RSTG/TSTG uncorrected errors are supported.
  2. PLTS_INTR doesn't report SEC/DED errors. Instead, PLTS_INTR3 will
     indicate the SEC/DED errors through CORRECTED_ERR_DSTG and
     UNCORRECTED_ERR_DSTG fields respectively.
  3. DSTG_ECC_ADDRESS and DSTG_ECC_REPORT are deprecated.

Bug 3446731

Change-Id: I60018d1b3825adcbb287dea05bc96a87f559c969
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2633959
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Sagar Kamble
2021-11-29 17:01:48 +05:30
committed by mobile promotions
parent c463810bcd
commit 6a6562cd4d
8 changed files with 281 additions and 49 deletions

View File

@@ -227,6 +227,10 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
ecc->ltc.ecc_ded_count[ltc] = NULL;
}
if (ecc->ltc.rstg_ecc_parity_count != NULL) {
nvgpu_kfree(g, ecc->ltc.rstg_ecc_parity_count[ltc]);
}
if (ecc->ltc.tstg_ecc_parity_count != NULL) {
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count[ltc]);
}
@@ -242,6 +246,9 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
nvgpu_kfree(g, ecc->ltc.ecc_ded_count);
ecc->ltc.ecc_ded_count = NULL;
nvgpu_kfree(g, ecc->ltc.rstg_ecc_parity_count);
ecc->ltc.rstg_ecc_parity_count = NULL;
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count);
ecc->ltc.tstg_ecc_parity_count = NULL;

View File

@@ -374,7 +374,7 @@ static const struct gops_ltc_intr ga100_ops_ltc_intr = {
};
static const struct gops_ltc ga100_ops_ltc = {
.ecc_init = gv11b_lts_ecc_init,
.ecc_init = ga10b_lts_ecc_init,
.init_ltc_support = nvgpu_init_ltc_support,
.ltc_remove_support = nvgpu_ltc_remove_support,
.determine_L2_size_bytes = gp10b_determine_L2_size_bytes,

View File

@@ -343,7 +343,7 @@ static const struct gops_ltc_intr ga10b_ops_ltc_intr = {
};
static const struct gops_ltc ga10b_ops_ltc = {
.ecc_init = gv11b_lts_ecc_init,
.ecc_init = ga10b_lts_ecc_init,
.init_ltc_support = nvgpu_init_ltc_support,
.ltc_remove_support = nvgpu_ltc_remove_support,
.determine_L2_size_bytes = ga10b_determine_L2_size_bytes,

View File

@@ -28,7 +28,6 @@
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/utils.h>
#include "hal/ltc/intr/ltc_intr_gv11b.h"
#include "ltc_intr_ga10b.h"
#include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>
@@ -388,6 +387,210 @@ void ga10b_ltc_intr_configure(struct gk20a *g)
ga10b_ltc_intr3_configure(g);
}
static void ga10b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
u32 uncorrected_delta)
{
bool is_rstg_ecc_addr = (ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) ==
ltc_ltc0_lts0_l2_cache_ecc_address_subunit_rstg_v());
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected");
if (!is_rstg_ecc_addr) {
nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to RSTG");
return;
}
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter =
nvgpu_wrapping_add_u32(
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter);
}
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {
nvgpu_err(g, "rstg ecc error corrected");
/* This error is not expected to occur in ga10x and hence,
* this scenario is considered as a fatal error.
*/
BUG();
}
}
static void ga10b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
u32 uncorrected_delta)
{
bool is_tstg_ecc_addr = (ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) ==
ltc_ltc0_lts0_l2_cache_ecc_address_subunit_tstg_v());
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
if (!is_tstg_ecc_addr) {
nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to TSTG");
return;
}
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter =
nvgpu_wrapping_add_u32(
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
}
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) {
nvgpu_err(g, "tstg ecc error corrected");
/* This error is not expected to occur in ga10b and hence,
* this scenario is considered as a fatal error.
*/
BUG();
}
}
static bool ga10b_ltc_intr_is_dstg_data_bank(u32 ecc_addr)
{
u32 ecc_ram = ltc_ltc0_lts0_l2_cache_ecc_address_ram_v(ecc_addr);
bool is_dstg_data_bank = false;
if ((ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank0_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank1_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank2_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank3_v())) {
is_dstg_data_bank = true;
}
return is_dstg_data_bank;
}
static bool ga10b_ltc_intr_is_dstg_be_ram(u32 ecc_addr)
{
u32 ecc_ram = ltc_ltc0_lts0_l2_cache_ecc_address_ram_v(ecc_addr);
bool is_dstg_be_ram = false;
if ((ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram0_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram1_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram2_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram3_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram4_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram5_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram6_v()) ||
(ecc_ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram7_v())) {
is_dstg_be_ram = true;
}
return is_dstg_be_ram;
}
static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
u32 corrected_delta, u32 uncorrected_delta)
{
bool is_dstg_ecc_addr = (ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) ==
ltc_ltc0_lts0_l2_cache_ecc_address_subunit_dstg_v());
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "dstg ecc error (SEC) corrected");
if (!is_dstg_ecc_addr) {
nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to DSTG");
return;
}
g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
nvgpu_wrapping_add_u32(
g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
corrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
/*
* Using a SEC code will allow correction of an SBE (Single Bit
* Error). But the current HW doesn't have the ability to clear
* out the SBE from the RAMs for a read access. So before the
* SBE turns into a DBE (Double Bit Error), a SW flush is
* preferred.
*/
if (g->ops.mm.cache.l2_flush(g, true) != 0) {
nvgpu_err(g, "l2_flush failed");
BUG();
}
}
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected");
if (!is_dstg_ecc_addr) {
nvgpu_log(g, gpu_dbg_intr, "ECC address doesn't belong to DSTG");
return;
}
if (ga10b_ltc_intr_is_dstg_data_bank(ecc_addr)) {
nvgpu_err(g, "Double bit error detected in GPU L2!");
g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
nvgpu_wrapping_add_u32(
g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
} else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) {
nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter =
nvgpu_wrapping_add_u32(
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
} else {
nvgpu_err(g, "unsupported uncorrected dstg ecc error");
BUG();
}
}
}
static void ga10b_ltc_intr_init_counters(struct gk20a *g,
u32 uncorrected_delta, u32 uncorrected_overflow,
u32 corrected_delta, u32 corrected_overflow,
u32 offset)
{
if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
nvgpu_writel(g,
nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
offset), 0);
}
if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
nvgpu_writel(g,
nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
offset), 0);
}
}
static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
u32 offset, u32 ltc_intr3)
{
@@ -395,7 +598,18 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
u32 corrected_delta, uncorrected_delta;
u32 corrected_overflow, uncorrected_overflow;
/* Detect and handle ECC PARITY errors */
/*
* Detect and handle ECC PARITY errors and SEC-DED errors.
* SEC errors are reported as DSTG corrected errors and
* DED errors are reported as DSTG uncorrected errors.
* Below are the supported errors:
*
* 1. UNCORRECTED_ERR_RSTG - signals a parity error in RSTG RAMS, for now only CBC RAMS
* 2. UNCORRECTED_ERR_TSTG - signals a parity error in TSTG RAMS
* 3. UNCORRECTED_ERR_DSTG - signals a parity error in DSTG RAMS, non-data RAMS
* and DED in data RAMS.
* 4. CORRECTED_ERR_DSTG - signals an ecc corrected error in DSTG data RAMS (SEC)
*/
if ((ltc_intr3 &
(ltc_ltcs_ltss_intr3_ecc_uncorrected_m() |
ltc_ltcs_ltss_intr3_ecc_corrected_m())) != 0U) {
@@ -404,81 +618,65 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
offset));
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
offset));
corrected_delta =
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(
corrected_cnt);
uncorrected_delta =
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
corrected_overflow = ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m();
uncorrected_overflow = ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
gv11b_ltc_intr_init_counters(g,
uncorrected_delta, uncorrected_overflow, offset);
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
offset));
corrected_delta =
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(corrected_cnt);
corrected_overflow = ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m();
ga10b_ltc_intr_init_counters(g,
uncorrected_delta, uncorrected_overflow,
corrected_delta, corrected_overflow, offset);
nvgpu_writel(g, nvgpu_safe_add_u32(
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset),
ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f());
/* update counters per slice */
if (corrected_overflow != 0U) {
corrected_delta += BIT32(
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s());
}
if (uncorrected_overflow != 0U) {
nvgpu_info(g, "uncorrected ecc counter overflow!");
uncorrected_delta += BIT32(
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s());
}
g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
nvgpu_safe_add_u32(
g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
corrected_delta);
g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
nvgpu_safe_add_u32(
g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
uncorrected_delta);
if (corrected_overflow != 0U) {
nvgpu_info(g, "corrected ecc counter overflow!");
corrected_delta += BIT32(
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s());
}
nvgpu_log(g, gpu_dbg_intr,
"ltc:%d lts: %d cache ecc interrupt intr: 0x%x",
ltc, slice, ltc_intr3);
"ecc status 0x%08x error address: 0x%08x subunit: %u corrected_delta: 0x%08x uncorrected_delta: 0x%08x",
ecc_status, ecc_addr,
ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr),
corrected_delta, uncorrected_delta);
/* This check has been added to ensure that the slice id is less
* than 8-bits and hence, it can be packed as part of LSB 8-bits
* along with the LTC id while reporting LTC related ECC errors.
*/
if (slice > U8_MAX) {
nvgpu_log(g, gpu_dbg_intr, "Invalid slice id=%d",
slice);
slice = slice & 0xFFU;
}
gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
ga10b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
ecc_status, ecc_addr,
uncorrected_delta);
gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
ga10b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
ecc_status, ecc_addr,
uncorrected_delta);
gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
ga10b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
ecc_status, ecc_addr,
uncorrected_delta);
if ((corrected_overflow != 0U) ||
(uncorrected_overflow != 0U)) {
nvgpu_info(g, "ecc counter overflow!");
}
nvgpu_log(g, gpu_dbg_intr, "ecc error address: 0x%x", ecc_addr);
corrected_delta, uncorrected_delta);
}
}

View File

@@ -40,6 +40,7 @@ void ga10b_ltc_set_zbc_depth_entry(struct gk20a *g, u32 depth_val, u32 index);
void ga10b_ltc_init_fs_state(struct gk20a *g);
void ga10b_ltc_lts_set_mgmt_setup(struct gk20a *g);
u64 ga10b_determine_L2_size_bytes(struct gk20a *g);
int ga10b_lts_ecc_init(struct gk20a *g);
#ifdef CONFIG_NVGPU_DEBUGGER
u32 ga10b_ltc_pri_shared_addr(struct gk20a *g, u32 addr);

View File

@@ -29,6 +29,7 @@
#include <nvgpu/errata.h>
#include "hal/gr/gr/gr_gk20a.h"
#include "ltc_gv11b.h"
#include "ltc_ga10b.h"
#include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>
@@ -234,3 +235,25 @@ u64 ga10b_determine_L2_size_bytes(struct gk20a *g)
return size;
}
int ga10b_lts_ecc_init(struct gk20a *g)
{
int err = 0;
err = gv11b_lts_ecc_init(g);
if (err != 0) {
goto done;
}
err = NVGPU_ECC_COUNTER_INIT_PER_LTS(rstg_ecc_parity_count);
if (err != 0) {
goto done;
}
done:
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
}
return err;
}

View File

@@ -204,6 +204,8 @@ struct nvgpu_ecc {
* unit.
*/
struct {
/** L2 cache slice RSTG ECC PARITY error count. */
struct nvgpu_ecc_stat **rstg_ecc_parity_count;
/** L2 cache slice TSTG ECC PARITY error count. */
struct nvgpu_ecc_stat **tstg_ecc_parity_count;
/** L2 cache slice DSTG BE ECC PARITY error count. */

View File

@@ -293,6 +293,7 @@ struct gr_exception_info {
#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
#define GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED (5U)
#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
/**
* @}