mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: ga10x: fix LTC ecc handling
Notable differences from GV11B are below:
1. RSTG/TSTG uncorrected errors are supported.
2. PLTS_INTR doesn't report SEC/DED errors. Instead, PLTS_INTR3 will
indicate the SEC/DED errors through CORRECTED_ERR_DSTG and
UNCORRECTED_ERR_DSTG fields respectively.
3. DSTG_ECC_ADDRESS and DSTG_ECC_REPORT are deprecated.
Bug 3446731
Change-Id: I60018d1b3825adcbb287dea05bc96a87f559c969
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2633959
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
c463810bcd
commit
6a6562cd4d
@@ -227,6 +227,10 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
|
|||||||
ecc->ltc.ecc_ded_count[ltc] = NULL;
|
ecc->ltc.ecc_ded_count[ltc] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ecc->ltc.rstg_ecc_parity_count != NULL) {
|
||||||
|
nvgpu_kfree(g, ecc->ltc.rstg_ecc_parity_count[ltc]);
|
||||||
|
}
|
||||||
|
|
||||||
if (ecc->ltc.tstg_ecc_parity_count != NULL) {
|
if (ecc->ltc.tstg_ecc_parity_count != NULL) {
|
||||||
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count[ltc]);
|
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count[ltc]);
|
||||||
}
|
}
|
||||||
@@ -242,6 +246,9 @@ void nvgpu_ltc_ecc_free(struct gk20a *g)
|
|||||||
nvgpu_kfree(g, ecc->ltc.ecc_ded_count);
|
nvgpu_kfree(g, ecc->ltc.ecc_ded_count);
|
||||||
ecc->ltc.ecc_ded_count = NULL;
|
ecc->ltc.ecc_ded_count = NULL;
|
||||||
|
|
||||||
|
nvgpu_kfree(g, ecc->ltc.rstg_ecc_parity_count);
|
||||||
|
ecc->ltc.rstg_ecc_parity_count = NULL;
|
||||||
|
|
||||||
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count);
|
nvgpu_kfree(g, ecc->ltc.tstg_ecc_parity_count);
|
||||||
ecc->ltc.tstg_ecc_parity_count = NULL;
|
ecc->ltc.tstg_ecc_parity_count = NULL;
|
||||||
|
|
||||||
|
|||||||
@@ -374,7 +374,7 @@ static const struct gops_ltc_intr ga100_ops_ltc_intr = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct gops_ltc ga100_ops_ltc = {
|
static const struct gops_ltc ga100_ops_ltc = {
|
||||||
.ecc_init = gv11b_lts_ecc_init,
|
.ecc_init = ga10b_lts_ecc_init,
|
||||||
.init_ltc_support = nvgpu_init_ltc_support,
|
.init_ltc_support = nvgpu_init_ltc_support,
|
||||||
.ltc_remove_support = nvgpu_ltc_remove_support,
|
.ltc_remove_support = nvgpu_ltc_remove_support,
|
||||||
.determine_L2_size_bytes = gp10b_determine_L2_size_bytes,
|
.determine_L2_size_bytes = gp10b_determine_L2_size_bytes,
|
||||||
|
|||||||
@@ -343,7 +343,7 @@ static const struct gops_ltc_intr ga10b_ops_ltc_intr = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct gops_ltc ga10b_ops_ltc = {
|
static const struct gops_ltc ga10b_ops_ltc = {
|
||||||
.ecc_init = gv11b_lts_ecc_init,
|
.ecc_init = ga10b_lts_ecc_init,
|
||||||
.init_ltc_support = nvgpu_init_ltc_support,
|
.init_ltc_support = nvgpu_init_ltc_support,
|
||||||
.ltc_remove_support = nvgpu_ltc_remove_support,
|
.ltc_remove_support = nvgpu_ltc_remove_support,
|
||||||
.determine_L2_size_bytes = ga10b_determine_L2_size_bytes,
|
.determine_L2_size_bytes = ga10b_determine_L2_size_bytes,
|
||||||
|
|||||||
@@ -28,7 +28,6 @@
|
|||||||
#include <nvgpu/nvgpu_err.h>
|
#include <nvgpu/nvgpu_err.h>
|
||||||
#include <nvgpu/utils.h>
|
#include <nvgpu/utils.h>
|
||||||
|
|
||||||
#include "hal/ltc/intr/ltc_intr_gv11b.h"
|
|
||||||
#include "ltc_intr_ga10b.h"
|
#include "ltc_intr_ga10b.h"
|
||||||
|
|
||||||
#include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>
|
#include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>
|
||||||
@@ -388,6 +387,210 @@ void ga10b_ltc_intr_configure(struct gk20a *g)
|
|||||||
ga10b_ltc_intr3_configure(g);
|
ga10b_ltc_intr3_configure(g);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Process the RSTG bits of the L2 cache ECC status for one LTC slice.
 *
 * @g:                 GPU driver context.
 * @ltc:               LTC index; selects the counter row and forms the upper
 *                     part of the reported instance id (ltc << 8).
 * @slice:             slice index within the LTC; selects the counter column
 *                     and forms the lower part of the reported instance id.
 * @ecc_status:        value read from the L2 cache ECC status register.
 * @ecc_addr:          value read from the L2 cache ECC address register.
 * @uncorrected_delta: number of uncorrected errors to add to the counter.
 *
 * When the uncorrected-RSTG bit is set and the subunit field of @ecc_addr is
 * RSTG, the per-slice RSTG parity counter is advanced with a wrapping add and
 * the error is reported. When the corrected-RSTG bit is set, an error is
 * logged and BUG() is raised.
 */
static void ga10b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
			u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
			u32 uncorrected_delta)
{
	struct nvgpu_ecc_stat *rstg_stat;
	bool addr_in_rstg;

	addr_in_rstg =
		(ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) ==
		 ltc_ltc0_lts0_l2_cache_ecc_address_subunit_rstg_v());

	if ((ecc_status &
	     ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) {
		nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected");

		/*
		 * The latched address belongs to another subunit, so nothing
		 * is counted or reported.
		 * NOTE(review): this return also skips the corrected-error
		 * check below - confirm that is intended.
		 */
		if (!addr_in_rstg) {
			nvgpu_log(g, gpu_dbg_intr,
				"ECC address doesn't belong to RSTG");
			return;
		}

		rstg_stat = &g->ecc.ltc.rstg_ecc_parity_count[ltc][slice];
		rstg_stat->counter = nvgpu_wrapping_add_u32(rstg_stat->counter,
							    uncorrected_delta);

		nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC,
				(ltc << 8U) | slice,
				GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr,
				rstg_stat->counter);
	}

	if ((ecc_status &
	     ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {
		nvgpu_err(g, "rstg ecc error corrected");
		/*
		 * Corrected RSTG errors are not expected on this chip, so
		 * seeing one is treated as fatal.
		 */
		BUG();
	}
}
|
||||||
|
|
||||||
|
/*
 * Process the TSTG bits of the L2 cache ECC status for one LTC slice.
 *
 * @g:                 GPU driver context.
 * @ltc:               LTC index; selects the counter row and forms the upper
 *                     part of the reported instance id (ltc << 8).
 * @slice:             slice index within the LTC; selects the counter column
 *                     and forms the lower part of the reported instance id.
 * @ecc_status:        value read from the L2 cache ECC status register.
 * @ecc_addr:          value read from the L2 cache ECC address register.
 * @uncorrected_delta: number of uncorrected errors to add to the counter.
 *
 * When the uncorrected-TSTG bit is set and the subunit field of @ecc_addr is
 * TSTG, the per-slice TSTG parity counter is advanced with a wrapping add and
 * the error is reported. When the corrected-TSTG bit is set, an error is
 * logged and BUG() is raised.
 */
static void ga10b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
			u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
			u32 uncorrected_delta)
{
	struct nvgpu_ecc_stat *tstg_stat;
	bool addr_in_tstg;

	addr_in_tstg =
		(ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) ==
		 ltc_ltc0_lts0_l2_cache_ecc_address_subunit_tstg_v());

	if ((ecc_status &
	     ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) {
		nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");

		/*
		 * The latched address belongs to another subunit, so nothing
		 * is counted or reported.
		 * NOTE(review): this return also skips the corrected-error
		 * check below - confirm that is intended.
		 */
		if (!addr_in_tstg) {
			nvgpu_log(g, gpu_dbg_intr,
				"ECC address doesn't belong to TSTG");
			return;
		}

		tstg_stat = &g->ecc.ltc.tstg_ecc_parity_count[ltc][slice];
		tstg_stat->counter = nvgpu_wrapping_add_u32(tstg_stat->counter,
							    uncorrected_delta);

		nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC,
				(ltc << 8U) | slice,
				GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
				tstg_stat->counter);
	}

	if ((ecc_status &
	     ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) {
		nvgpu_err(g, "tstg ecc error corrected");
		/*
		 * Corrected TSTG errors are not expected on this chip, so
		 * seeing one is treated as fatal.
		 */
		BUG();
	}
}
|
||||||
|
|
||||||
|
/*
 * Tell whether the RAM field of an L2 cache ECC address value names one of
 * the four DSTG data banks (bank0..bank3).
 *
 * @ecc_addr: value read from the L2 cache ECC address register.
 *
 * Returns true when the RAM field equals any of the DSTG data bank ids,
 * false otherwise.
 */
static bool ga10b_ltc_intr_is_dstg_data_bank(u32 ecc_addr)
{
	const u32 ram = ltc_ltc0_lts0_l2_cache_ecc_address_ram_v(ecc_addr);

	return (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank0_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank1_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank2_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_bank3_v());
}
|
||||||
|
|
||||||
|
/*
 * Tell whether the RAM field of an L2 cache ECC address value names one of
 * the eight DSTG clrbe_trlram RAMs (clrbe_trlram0..clrbe_trlram7).
 *
 * @ecc_addr: value read from the L2 cache ECC address register.
 *
 * Returns true when the RAM field equals any of those ids, false otherwise.
 */
static bool ga10b_ltc_intr_is_dstg_be_ram(u32 ecc_addr)
{
	const u32 ram = ltc_ltc0_lts0_l2_cache_ecc_address_ram_v(ecc_addr);

	return (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram0_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram1_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram2_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram3_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram4_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram5_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram6_v()) ||
	       (ram == ltc_ltc0_lts0_l2_cache_ecc_address_ram_dstg_db_clrbe_trlram7_v());
}
|
||||||
|
|
||||||
|
/*
 * Process the DSTG bits of the L2 cache ECC status for one LTC slice.
 *
 * @g:                 GPU driver context.
 * @ltc:               LTC index; selects the counter row and forms the upper
 *                     part of the reported instance id (ltc << 8).
 * @slice:             slice index within the LTC; selects the counter column
 *                     and forms the lower part of the reported instance id.
 * @ecc_status:        value read from the L2 cache ECC status register.
 * @ecc_addr:          value read from the L2 cache ECC address register.
 * @corrected_delta:   number of corrected errors to add to the SEC counter.
 * @uncorrected_delta: number of uncorrected errors to add to the DED or
 *                     DSTG BE parity counter.
 *
 * Corrected DSTG errors advance the SEC counter, are reported and are
 * followed by an L2 flush. Uncorrected DSTG errors are attributed by the RAM
 * field of @ecc_addr: data banks advance the DED counter, the clrbe_trlram
 * RAMs advance the DSTG BE parity counter, and any other RAM raises BUG().
 * Whenever the subunit field of @ecc_addr is not DSTG, the function logs the
 * mismatch and returns immediately.
 */
static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
			u32 ltc, u32 slice, u32 ecc_status, u32 ecc_addr,
			u32 corrected_delta, u32 uncorrected_delta)
{
	/* Instance id passed to the error reporter: LTC above bit 8, slice below. */
	const u32 err_inst = (ltc << 8U) | slice;
	bool addr_in_dstg;

	addr_in_dstg =
		(ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr) ==
		 ltc_ltc0_lts0_l2_cache_ecc_address_subunit_dstg_v());

	if ((ecc_status &
	     ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) {
		nvgpu_log(g, gpu_dbg_intr, "dstg ecc error (SEC) corrected");

		if (!addr_in_dstg) {
			nvgpu_log(g, gpu_dbg_intr,
				"ECC address doesn't belong to DSTG");
			return;
		}

		g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
			nvgpu_wrapping_add_u32(
				g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
				corrected_delta);

		nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, err_inst,
				GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
				g->ecc.ltc.ecc_sec_count[ltc][slice].counter);

		/*
		 * SEC corrects a single bit error on the fly, but the current
		 * HW cannot scrub that bit out of the RAM on a read access.
		 * Flush L2 from SW so the single bit error does not get the
		 * chance to turn into a double bit error. A failed flush is
		 * treated as fatal.
		 */
		if (g->ops.mm.cache.l2_flush(g, true) != 0) {
			nvgpu_err(g, "l2_flush failed");
			BUG();
		}
	}

	if ((ecc_status &
	     ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) {
		nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected");

		if (!addr_in_dstg) {
			nvgpu_log(g, gpu_dbg_intr,
				"ECC address doesn't belong to DSTG");
			return;
		}

		if (ga10b_ltc_intr_is_dstg_data_bank(ecc_addr)) {
			/* Uncorrected error in a data bank: count it as DED. */
			nvgpu_err(g, "Double bit error detected in GPU L2!");

			g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
				nvgpu_wrapping_add_u32(
					g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
					uncorrected_delta);

			nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, err_inst,
					GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED,
					ecc_addr,
					g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
		} else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) {
			/* Uncorrected error in a clrbe_trlram RAM. */
			nvgpu_log(g, gpu_dbg_intr,
				"dstg be ecc error uncorrected");

			g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter =
				nvgpu_wrapping_add_u32(
					g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
					uncorrected_delta);

			nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, err_inst,
					GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED,
					ecc_addr,
					g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
		} else {
			/* RAM id is neither a data bank nor a clrbe_trlram RAM. */
			nvgpu_err(g, "unsupported uncorrected dstg ecc error");
			BUG();
		}
	}
}
|
||||||
|
|
||||||
|
/*
 * Zero the per-slice L2 cache ECC error count registers that have something
 * pending.
 *
 * @g:                    GPU driver context.
 * @uncorrected_delta:    uncorrected error count read for this slice.
 * @uncorrected_overflow: non-zero when the uncorrected counter overflowed.
 * @corrected_delta:      corrected error count read for this slice.
 * @corrected_overflow:   non-zero when the corrected counter overflowed.
 * @offset:               offset added to the LTC0/LTS0 register address to
 *                        reach the slice being serviced.
 *
 * Each register is written with 0 only when its delta or its overflow
 * indication is non-zero; otherwise it is left untouched.
 */
static void ga10b_ltc_intr_init_counters(struct gk20a *g,
			u32 uncorrected_delta, u32 uncorrected_overflow,
			u32 corrected_delta, u32 corrected_overflow,
			u32 offset)
{
	const bool clear_uncorrected =
		(uncorrected_delta != 0U) || (uncorrected_overflow != 0U);
	const bool clear_corrected =
		(corrected_delta != 0U) || (corrected_overflow != 0U);

	if (clear_uncorrected) {
		u32 reg = nvgpu_safe_add_u32(
			ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
			offset);

		nvgpu_writel(g, reg, 0);
	}

	if (clear_corrected) {
		u32 reg = nvgpu_safe_add_u32(
			ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
			offset);

		nvgpu_writel(g, reg, 0);
	}
}
|
||||||
|
|
||||||
static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
||||||
u32 offset, u32 ltc_intr3)
|
u32 offset, u32 ltc_intr3)
|
||||||
{
|
{
|
||||||
@@ -395,7 +598,18 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
|||||||
u32 corrected_delta, uncorrected_delta;
|
u32 corrected_delta, uncorrected_delta;
|
||||||
u32 corrected_overflow, uncorrected_overflow;
|
u32 corrected_overflow, uncorrected_overflow;
|
||||||
|
|
||||||
/* Detect and handle ECC PARITY errors */
|
/*
|
||||||
|
* Detect and handle ECC PARITY errors and SEC-DED errors.
|
||||||
|
* SEC errors are reported as DSTG corrected errors and
|
||||||
|
* DED errors are reported as DSTG uncorrected errors.
|
||||||
|
* Below are the supported errors:
|
||||||
|
*
|
||||||
|
* 1. UNCORRECTED_ERR_RSTG - signals a parity error in RSTG RAMS, for now only CBC RAMS
|
||||||
|
* 2. UNCORRECTED_ERR_TSTG - signals a parity error in TSTG RAMS
|
||||||
|
* 3. UNCORRECTED_ERR_DSTG - signals a parity error in DSTG RAMS, non-data RAMS
|
||||||
|
* and DED in data RAMS.
|
||||||
|
* 4. CORRECTED_ERR_DSTG - signals an ecc corrected error in DSTG data RAMS (SEC)
|
||||||
|
*/
|
||||||
if ((ltc_intr3 &
|
if ((ltc_intr3 &
|
||||||
(ltc_ltcs_ltss_intr3_ecc_uncorrected_m() |
|
(ltc_ltcs_ltss_intr3_ecc_uncorrected_m() |
|
||||||
ltc_ltcs_ltss_intr3_ecc_corrected_m())) != 0U) {
|
ltc_ltcs_ltss_intr3_ecc_corrected_m())) != 0U) {
|
||||||
@@ -404,81 +618,65 @@ static void ga10b_ltc_intr3_ecc_interrupts(struct gk20a *g, u32 ltc, u32 slice,
|
|||||||
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
|
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset));
|
||||||
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
ecc_addr = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
|
ltc_ltc0_lts0_l2_cache_ecc_address_r(), offset));
|
||||||
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
|
||||||
offset));
|
|
||||||
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
uncorrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
|
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(),
|
||||||
offset));
|
offset));
|
||||||
|
|
||||||
corrected_delta =
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(
|
|
||||||
corrected_cnt);
|
|
||||||
uncorrected_delta =
|
uncorrected_delta =
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
|
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
|
||||||
corrected_overflow = ecc_status &
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m();
|
|
||||||
|
|
||||||
uncorrected_overflow = ecc_status &
|
uncorrected_overflow = ecc_status &
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
|
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m();
|
||||||
|
|
||||||
gv11b_ltc_intr_init_counters(g,
|
corrected_cnt = nvgpu_readl(g, nvgpu_safe_add_u32(
|
||||||
uncorrected_delta, uncorrected_overflow, offset);
|
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(),
|
||||||
|
offset));
|
||||||
|
|
||||||
|
corrected_delta =
|
||||||
|
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(corrected_cnt);
|
||||||
|
|
||||||
|
corrected_overflow = ecc_status &
|
||||||
|
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m();
|
||||||
|
|
||||||
|
ga10b_ltc_intr_init_counters(g,
|
||||||
|
uncorrected_delta, uncorrected_overflow,
|
||||||
|
corrected_delta, corrected_overflow, offset);
|
||||||
|
|
||||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset),
|
ltc_ltc0_lts0_l2_cache_ecc_status_r(), offset),
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f());
|
ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f());
|
||||||
|
|
||||||
/* update counters per slice */
|
/* update counters per slice */
|
||||||
if (corrected_overflow != 0U) {
|
|
||||||
corrected_delta += BIT32(
|
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s());
|
|
||||||
}
|
|
||||||
if (uncorrected_overflow != 0U) {
|
if (uncorrected_overflow != 0U) {
|
||||||
|
nvgpu_info(g, "uncorrected ecc counter overflow!");
|
||||||
uncorrected_delta += BIT32(
|
uncorrected_delta += BIT32(
|
||||||
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s());
|
ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s());
|
||||||
}
|
}
|
||||||
|
|
||||||
g->ecc.ltc.ecc_sec_count[ltc][slice].counter =
|
if (corrected_overflow != 0U) {
|
||||||
nvgpu_safe_add_u32(
|
nvgpu_info(g, "corrected ecc counter overflow!");
|
||||||
g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
|
corrected_delta += BIT32(
|
||||||
corrected_delta);
|
ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s());
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter =
|
}
|
||||||
nvgpu_safe_add_u32(
|
|
||||||
g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
|
|
||||||
uncorrected_delta);
|
|
||||||
nvgpu_log(g, gpu_dbg_intr,
|
nvgpu_log(g, gpu_dbg_intr,
|
||||||
"ltc:%d lts: %d cache ecc interrupt intr: 0x%x",
|
"ecc status 0x%08x error address: 0x%08x subunit: %u corrected_delta: 0x%08x uncorrected_delta: 0x%08x",
|
||||||
ltc, slice, ltc_intr3);
|
ecc_status, ecc_addr,
|
||||||
|
ltc_ltc0_lts0_l2_cache_ecc_address_subunit_v(ecc_addr),
|
||||||
|
corrected_delta, uncorrected_delta);
|
||||||
|
|
||||||
/* This check has been added to ensure that the slice id is less
|
ga10b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
|
||||||
* than 8-bits and hence, it can be packed as part of LSB 8-bits
|
|
||||||
* along with the LTC id while reporting LTC related ECC errors.
|
|
||||||
*/
|
|
||||||
if (slice > U8_MAX) {
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "Invalid slice id=%d",
|
|
||||||
slice);
|
|
||||||
slice = slice & 0xFFU;
|
|
||||||
}
|
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_rstg_ecc_interrupts(g, ltc, slice,
|
|
||||||
ecc_status, ecc_addr,
|
ecc_status, ecc_addr,
|
||||||
uncorrected_delta);
|
uncorrected_delta);
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
|
ga10b_ltc_intr_handle_tstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, ecc_addr,
|
ecc_status, ecc_addr,
|
||||||
uncorrected_delta);
|
uncorrected_delta);
|
||||||
|
|
||||||
gv11b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
|
ga10b_ltc_intr_handle_dstg_ecc_interrupts(g, ltc, slice,
|
||||||
ecc_status, ecc_addr,
|
ecc_status, ecc_addr,
|
||||||
uncorrected_delta);
|
corrected_delta, uncorrected_delta);
|
||||||
|
|
||||||
if ((corrected_overflow != 0U) ||
|
|
||||||
(uncorrected_overflow != 0U)) {
|
|
||||||
nvgpu_info(g, "ecc counter overflow!");
|
|
||||||
}
|
|
||||||
|
|
||||||
nvgpu_log(g, gpu_dbg_intr, "ecc error address: 0x%x", ecc_addr);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ void ga10b_ltc_set_zbc_depth_entry(struct gk20a *g, u32 depth_val, u32 index);
|
|||||||
void ga10b_ltc_init_fs_state(struct gk20a *g);
|
void ga10b_ltc_init_fs_state(struct gk20a *g);
|
||||||
void ga10b_ltc_lts_set_mgmt_setup(struct gk20a *g);
|
void ga10b_ltc_lts_set_mgmt_setup(struct gk20a *g);
|
||||||
u64 ga10b_determine_L2_size_bytes(struct gk20a *g);
|
u64 ga10b_determine_L2_size_bytes(struct gk20a *g);
|
||||||
|
int ga10b_lts_ecc_init(struct gk20a *g);
|
||||||
|
|
||||||
#ifdef CONFIG_NVGPU_DEBUGGER
|
#ifdef CONFIG_NVGPU_DEBUGGER
|
||||||
u32 ga10b_ltc_pri_shared_addr(struct gk20a *g, u32 addr);
|
u32 ga10b_ltc_pri_shared_addr(struct gk20a *g, u32 addr);
|
||||||
|
|||||||
@@ -29,6 +29,7 @@
|
|||||||
#include <nvgpu/errata.h>
|
#include <nvgpu/errata.h>
|
||||||
|
|
||||||
#include "hal/gr/gr/gr_gk20a.h"
|
#include "hal/gr/gr/gr_gk20a.h"
|
||||||
|
#include "ltc_gv11b.h"
|
||||||
#include "ltc_ga10b.h"
|
#include "ltc_ga10b.h"
|
||||||
|
|
||||||
#include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>
|
#include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>
|
||||||
@@ -234,3 +235,25 @@ u64 ga10b_determine_L2_size_bytes(struct gk20a *g)
|
|||||||
|
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ga10b_lts_ecc_init(struct gk20a *g)
|
||||||
|
{
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
err = gv11b_lts_ecc_init(g);
|
||||||
|
if (err != 0) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = NVGPU_ECC_COUNTER_INIT_PER_LTS(rstg_ecc_parity_count);
|
||||||
|
if (err != 0) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
|
done:
|
||||||
|
if (err != 0) {
|
||||||
|
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|||||||
@@ -204,6 +204,8 @@ struct nvgpu_ecc {
|
|||||||
* unit.
|
* unit.
|
||||||
*/
|
*/
|
||||||
struct {
|
struct {
|
||||||
|
/** L2 cache slice RSTG ECC PARITY error count. */
|
||||||
|
struct nvgpu_ecc_stat **rstg_ecc_parity_count;
|
||||||
/** L2 cache slice TSTG ECC PARITY error count. */
|
/** L2 cache slice TSTG ECC PARITY error count. */
|
||||||
struct nvgpu_ecc_stat **tstg_ecc_parity_count;
|
struct nvgpu_ecc_stat **tstg_ecc_parity_count;
|
||||||
/** L2 cache slice DSTG BE ECC PARITY error count. */
|
/** L2 cache slice DSTG BE ECC PARITY error count. */
|
||||||
|
|||||||
@@ -293,6 +293,7 @@ struct gr_exception_info {
|
|||||||
#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
|
#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
|
||||||
#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
|
#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
|
||||||
#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
|
#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
|
||||||
|
#define GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED (5U)
|
||||||
#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
|
#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
|
||||||
/**
|
/**
|
||||||
* @}
|
* @}
|
||||||
|
|||||||
Reference in New Issue
Block a user