gpu: nvgpu: gv11b: Add L1 DATA + iCACHE parity

This CL covers the following parity support (uncorrected error),
1) SM's L1 DATA
2) SM's L0 && L1 icache

Volta Resiliency Id - Volta-634

JIRA GPUT19X-113
JIRA GPUT19X-99

Bug 1807553

Change-Id: Iacbf492028983529dadc5753007e43510b8cb786
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1483681
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Lakshmanan M
2017-05-17 11:42:24 +05:30
committed by mobile promotions
parent d503a23444
commit 5a08eafbe0
4 changed files with 350 additions and 0 deletions

View File

@@ -368,6 +368,170 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
/*
 * Account for SM L1 data ECC errors latched for the SM at (gpc, tpc).
 *
 * Reads the L1 data ECC status register. When none of the corrected or
 * uncorrected el1_0/el1_1 bits are set, returns immediately without
 * touching any other register. Otherwise both HW error counters are read,
 * the overflow flags from the status value are folded in, the result is
 * added to the software totals in g->gr.t19x.ecc_stats, each HW counter
 * that was accounted is written back to zero, and finally the reset_task
 * value is written to the status register.
 *
 * post_event, fault_ch and hww_global_esr are accepted but not used here.
 *
 * Always returns 0.
 */
static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
			bool *post_event, struct channel_gk20a *fault_ch,
			u32 *hww_global_esr)
{
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 offset = gpc_stride * gpc + tpc_stride * tpc;
	/* Per-SM register addresses: GPC0/TPC0 base plus the stride offset. */
	u32 status_reg = gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset;
	u32 cor_count_reg =
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset;
	u32 unc_count_reg =
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset;
	u32 cor_mask =
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_0_m() |
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_1_m();
	u32 unc_mask =
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_0_m() |
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_1_m();
	u32 status, cor_status, unc_status;
	u32 cor_delta, unc_delta;
	bool cor_overflow, unc_overflow;

	/* Check for L1 data ECC errors. */
	status = gk20a_readl(g, status_reg);
	cor_status = status & cor_mask;
	unc_status = status & unc_mask;

	/* No error bit latched: leave counters and status untouched. */
	if ((cor_status | unc_status) == 0)
		return 0;

	cor_delta = gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_v(
			gk20a_readl(g, cor_count_reg));
	unc_delta = gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_v(
			gk20a_readl(g, unc_count_reg));

	cor_overflow =
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_total_counter_overflow_v(status);
	unc_overflow =
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_total_counter_overflow_v(status);

	if (cor_overflow || (cor_delta > 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"corrected error (SBE) detected in SM L1 data! err_mask [%08x] is_overf [%d]",
			cor_status, cor_overflow);

		/*
		 * The HW counter field is total_s() bits wide; a set overflow
		 * flag is credited as one extra 2^width events.
		 */
		if (cor_overflow)
			cor_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s();

		/* NOTE(review): the software counter is indexed by tpc only. */
		g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count.counters[tpc] +=
			cor_delta;
		gk20a_writel(g, cor_count_reg, 0);
	}

	if (unc_overflow || (unc_delta > 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"Uncorrected error (DBE) detected in SM L1 data! err_mask [%08x] is_overf [%d]",
			unc_status, unc_overflow);

		/* Same overflow crediting as for the corrected counter. */
		if (unc_overflow)
			unc_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s();

		g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count.counters[tpc] +=
			unc_delta;
		gk20a_writel(g, unc_count_reg, 0);
	}

	/* Last step: write the reset_task value to the status register. */
	gk20a_writel(g, status_reg,
		gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f());

	return 0;
}
/*
 * Account for SM L0/L1 instruction cache ECC errors latched for the SM at
 * (gpc, tpc).
 *
 * Reads the icache ECC status register. When none of the corrected or
 * uncorrected l0_data/l0_predecode/l1_data/l1_predecode bits are set,
 * returns immediately without touching any other register. Otherwise both
 * HW error counters are read, the overflow flags from the status value are
 * folded in, the result is added to the software totals in
 * g->gr.t19x.ecc_stats, each HW counter that was accounted is written back
 * to zero, and finally the reset_task value is written to the status
 * register.
 *
 * post_event, fault_ch and hww_global_esr are accepted but not used here.
 *
 * Always returns 0.
 */
static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
			bool *post_event, struct channel_gk20a *fault_ch,
			u32 *hww_global_esr)
{
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 offset = gpc_stride * gpc + tpc_stride * tpc;
	/* Per-SM register addresses: GPC0/TPC0 base plus the stride offset. */
	u32 status_reg = gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset;
	u32 cor_count_reg =
		gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset;
	u32 unc_count_reg =
		gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset;
	u32 cor_mask =
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m() |
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m() |
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m() |
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m();
	u32 unc_mask =
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m() |
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m() |
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m() |
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m();
	u32 status, cor_status, unc_status;
	u32 cor_delta, unc_delta;
	bool cor_overflow, unc_overflow;

	/* Check for L0 && L1 icache ECC errors. */
	status = gk20a_readl(g, status_reg);
	cor_status = status & cor_mask;
	unc_status = status & unc_mask;

	/* No error bit latched: leave counters and status untouched. */
	if ((cor_status | unc_status) == 0)
		return 0;

	cor_delta = gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_v(
			gk20a_readl(g, cor_count_reg));
	unc_delta = gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_v(
			gk20a_readl(g, unc_count_reg));

	cor_overflow =
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_total_counter_overflow_v(status);
	unc_overflow =
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_total_counter_overflow_v(status);

	if (cor_overflow || (cor_delta > 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"corrected error (SBE) detected in SM L0 && L1 icache! err_mask [%08x] is_overf [%d]",
			cor_status, cor_overflow);

		/*
		 * The HW counter field is total_s() bits wide; a set overflow
		 * flag is credited as one extra 2^width events.
		 */
		if (cor_overflow)
			cor_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s();

		/* NOTE(review): the software counter is indexed by tpc only. */
		g->gr.t19x.ecc_stats.sm_icache_corrected_err_count.counters[tpc] +=
			cor_delta;
		gk20a_writel(g, cor_count_reg, 0);
	}

	if (unc_overflow || (unc_delta > 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"Uncorrected error (DBE) detected in SM L0 && L1 icache! err_mask [%08x] is_overf [%d]",
			unc_status, unc_overflow);

		/* Same overflow crediting as for the corrected counter. */
		if (unc_overflow)
			unc_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s();

		g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count.counters[tpc] +=
			unc_delta;
		gk20a_writel(g, unc_count_reg, 0);
	}

	/* Last step: write the reset_task value to the status register. */
	gk20a_writel(g, status_reg,
		gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f());

	return 0;
}
static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
@@ -383,6 +547,12 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
/* Check for CBU ECC errors. */
gr_gv11b_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
/* Check for L1 data ECC errors. */
gr_gv11b_handle_l1_data_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
/* Check for L0 && L1 icache ECC errors. */
gr_gv11b_handle_icache_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
return ret;
}

View File

@@ -41,6 +41,10 @@ struct gr_t19x {
struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count;
struct gr_gp10b_ecc_stat sm_cbu_corrected_err_count;
struct gr_gp10b_ecc_stat sm_cbu_uncorrected_err_count;
struct gr_gp10b_ecc_stat sm_l1_data_corrected_err_count;
struct gr_gp10b_ecc_stat sm_l1_data_uncorrected_err_count;
struct gr_gp10b_ecc_stat sm_icache_corrected_err_count;
struct gr_gp10b_ecc_stat sm_icache_uncorrected_err_count;
} ecc_stats;
};

View File

@@ -127,6 +127,10 @@ static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array
/*
 * Per-statistic device_attribute pointers for the SM ECC error counters.
 * Each one is handed, together with the matching g->gr.t19x.ecc_stats
 * entry, to gr_gp10b_ecc_stat_create() when the sysfs nodes are created
 * and to gr_gp10b_ecc_stat_remove() when they are removed.
 * NOTE(review): presumably gr_gp10b_ecc_stat_create() fills these in;
 * its definition is outside this view - confirm.
 */
static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_sm_cbu_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_cbu_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_sm_l1_data_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_l1_data_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_sm_icache_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array;
void gr_gv11b_create_sysfs(struct device *dev)
{
@@ -165,6 +169,30 @@ void gr_gv11b_create_sysfs(struct device *dev)
&g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count,
dev_attr_sm_cbu_ecc_uncorrected_err_count_array);
error |= gr_gp10b_ecc_stat_create(dev,
0,
"sm_l1_data_ecc_corrected_err_count",
&g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count,
dev_attr_sm_l1_data_ecc_corrected_err_count_array);
error |= gr_gp10b_ecc_stat_create(dev,
0,
"sm_l1_data_ecc_uncorrected_err_count",
&g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count,
dev_attr_sm_l1_data_ecc_uncorrected_err_count_array);
error |= gr_gp10b_ecc_stat_create(dev,
0,
"sm_icache_ecc_corrected_err_count",
&g->gr.t19x.ecc_stats.sm_icache_corrected_err_count,
dev_attr_sm_icache_ecc_corrected_err_count_array);
error |= gr_gp10b_ecc_stat_create(dev,
0,
"sm_icache_ecc_uncorrected_err_count",
&g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count,
dev_attr_sm_icache_ecc_uncorrected_err_count_array);
if (error)
dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
}
@@ -193,4 +221,24 @@ static void gr_gv11b_remove_sysfs(struct device *dev)
&g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count,
dev_attr_sm_cbu_ecc_uncorrected_err_count_array);
gr_gp10b_ecc_stat_remove(dev,
0,
&g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count,
dev_attr_sm_l1_data_ecc_corrected_err_count_array);
gr_gp10b_ecc_stat_remove(dev,
0,
&g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count,
dev_attr_sm_l1_data_ecc_uncorrected_err_count_array);
gr_gp10b_ecc_stat_remove(dev,
0,
&g->gr.t19x.ecc_stats.sm_icache_corrected_err_count,
dev_attr_sm_icache_ecc_corrected_err_count_array);
gr_gp10b_ecc_stat_remove(dev,
0,
&g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count,
dev_attr_sm_icache_ecc_uncorrected_err_count_array);
}

View File

@@ -582,6 +582,134 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v(u32
{
return (r >> 0) & 0xffff;
}
/* Address of the SM L1 data ECC status register (GPC0/TPC0 instance). */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r(void)
{
	return 0x0050436cU;
}
/* Mask of the corrected_err_el1_0 bit (bit 0) in the L1 data ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_0_m(void)
{
	return 0x1U << 0U;
}
/* Mask of the corrected_err_el1_1 bit (bit 1) in the L1 data ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_1_m(void)
{
	return 0x1U << 1U;
}
/* Mask of the uncorrected_err_el1_0 bit (bit 2) in the L1 data ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_0_m(void)
{
	return 0x1U << 2U;
}
/* Mask of the uncorrected_err_el1_1 bit (bit 3) in the L1 data ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_1_m(void)
{
	return 0x1U << 3U;
}
/* Extracts bit 8 (corrected-error total counter overflow) of status value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 8U) & 0x1U;
}
/* Extracts bit 10 (uncorrected-error total counter overflow) of status value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 10U) & 0x1U;
}
/* Value with only bit 30 set: the reset_task field of the L1 data ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f(void)
{
	return 0x40000000U;
}
/* Address of the SM L1 data ECC corrected error count register (GPC0/TPC0). */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r(void)
{
	return 0x00504370U;
}
/* Width, in bits, of the total field of the L1 data corrected error count. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s(void)
{
	return 16U;
}
/* Extracts the total field (bits 15:0) from corrected error count value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* Address of the SM L1 data ECC uncorrected error count register (GPC0/TPC0). */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(void)
{
	return 0x00504374U;
}
/* Width, in bits, of the total field of the L1 data uncorrected error count. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s(void)
{
	return 16U;
}
/* Extracts the total field (bits 15:0) from uncorrected error count value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* Address of the SM icache ECC status register (GPC0/TPC0 instance). */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_r(void)
{
	return 0x0050464cU;
}
/* Mask of the corrected_err_l0_data bit (bit 0) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m(void)
{
	return 0x1U << 0U;
}
/* Mask of the corrected_err_l0_predecode bit (bit 1) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m(void)
{
	return 0x1U << 1U;
}
/* Mask of the corrected_err_l1_data bit (bit 2) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m(void)
{
	return 0x1U << 2U;
}
/* Mask of the corrected_err_l1_predecode bit (bit 3) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m(void)
{
	return 0x1U << 3U;
}
/* Mask of the uncorrected_err_l0_data bit (bit 4) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m(void)
{
	return 0x1U << 4U;
}
/* Mask of the uncorrected_err_l0_predecode bit (bit 5) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m(void)
{
	return 0x1U << 5U;
}
/* Mask of the uncorrected_err_l1_data bit (bit 6) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m(void)
{
	return 0x1U << 6U;
}
/* Mask of the uncorrected_err_l1_predecode bit (bit 7) in the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m(void)
{
	return 0x1U << 7U;
}
/* Extracts bit 16 (corrected-error total counter overflow) of status value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 16U) & 0x1U;
}
/* Extracts bit 18 (uncorrected-error total counter overflow) of status value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 18U) & 0x1U;
}
/* Value with only bit 30 set: the reset_task field of the icache ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f(void)
{
	return 0x40000000U;
}
/* Address of the SM icache ECC corrected error count register (GPC0/TPC0). */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(void)
{
	return 0x00504650U;
}
/* Width, in bits, of the total field of the icache corrected error count. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s(void)
{
	return 16U;
}
/* Extracts the total field (bits 15:0) from corrected error count value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* Address of the SM icache ECC uncorrected error count register (GPC0/TPC0). */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(void)
{
	return 0x00504654U;
}
/* Width, in bits, of the total field of the icache uncorrected error count. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s(void)
{
	return 16U;
}
/* Extracts the total field (bits 15:0) from uncorrected error count value r. */
static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void)
{
return 0x00504624;