gpu: nvgpu: gv11b: Add LRF + CBU parity support

This CL adds parity (uncorrected error) support for the following units:
1) SM's LRF
2) SM's CBU

Volta Resiliency Id - Volta-637

JIRA GPUT19X-85
JIRA GPUT19X-110

Bug 1775457

Change-Id: I3befb1fe22719d06aa819ef27654aaf97f911a9b
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1481791
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Lakshmanan M
2017-05-15 15:32:21 +05:30
committed by mobile promotions
parent ffc37e50fa
commit d503a23444
4 changed files with 379 additions and 7 deletions

View File

@@ -192,24 +192,197 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
/*
 * Handle SM LRF ECC errors latched for one TPC.
 *
 * Reads the LRF ECC status register at the GPC/TPC offset. When at least one
 * corrected or uncorrected QRFDP flag is set, the hardware error counters are
 * read, extended by 1 << 16 if the matching overflow flag is set, accumulated
 * into the software ecc_stats counters and then zeroed in hardware. Finally
 * the status register is written with its reset_task value.
 *
 * post_event, fault_ch and hww_global_esr are accepted but not used here.
 *
 * Returns 0 in all cases.
 */
static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
			bool *post_event, struct channel_gk20a *fault_ch,
			u32 *hww_global_esr)
{
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 reg_offset = gpc_stride * gpc + tpc_stride * tpc;
	/* Union of the per-QRFDP corrected-error flags. */
	const u32 corrected_mask =
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp4_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp5_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp6_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp7_m();
	/* Union of the per-QRFDP uncorrected-error flags. */
	const u32 uncorrected_mask =
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp4_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp5_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp6_m() |
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp7_m();
	u32 status;
	u32 corrected_bits;
	u32 uncorrected_bits;
	u32 corrected_delta;
	u32 uncorrected_delta;
	bool corrected_overflow;
	bool uncorrected_overflow;

	/* Check for LRF ECC errors. */
	status = gk20a_readl(g,
			gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + reg_offset);
	corrected_bits = status & corrected_mask;
	uncorrected_bits = status & uncorrected_mask;

	/* No LRF error flag set: leave counters and status untouched. */
	if ((corrected_bits | uncorrected_bits) == 0)
		return 0;

	corrected_delta =
		gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_v(
			gk20a_readl(g,
				gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() +
				reg_offset));
	uncorrected_delta =
		gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v(
			gk20a_readl(g,
				gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() +
				reg_offset));

	corrected_overflow =
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_total_counter_overflow_v(status);
	uncorrected_overflow =
		gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_total_counter_overflow_v(status);

	if (corrected_overflow || (corrected_delta != 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"corrected error (SBE) detected in SM LRF! err_mask [%08x] is_overf [%d]",
			corrected_bits, corrected_overflow);

		/* HW uses 16-bits counter: account for one wrap on overflow. */
		if (corrected_overflow)
			corrected_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s();

		g->gr.t18x.ecc_stats.sm_lrf_single_err_count.counters[tpc] +=
			corrected_delta;
		/* Zero the HW counter now that it has been accumulated. */
		gk20a_writel(g,
			gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + reg_offset,
			0);
	}

	if (uncorrected_overflow || (uncorrected_delta != 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"Uncorrected error (DBE) detected in SM LRF! err_mask [%08x] is_overf [%d]",
			uncorrected_bits, uncorrected_overflow);

		/* HW uses 16-bits counter: account for one wrap on overflow. */
		if (uncorrected_overflow)
			uncorrected_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s();

		g->gr.t18x.ecc_stats.sm_lrf_double_err_count.counters[tpc] +=
			uncorrected_delta;
		/* Zero the HW counter now that it has been accumulated. */
		gk20a_writel(g,
			gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + reg_offset,
			0);
	}

	gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + reg_offset,
			gr_pri_gpc0_tpc0_sm_lrf_ecc_status_reset_task_f());

	return 0;
}
/*
 * Handle SM CBU ECC errors latched for one TPC.
 *
 * Reads the CBU ECC status register at the GPC/TPC offset. When at least one
 * corrected or uncorrected warp/barrier flag is set, the hardware error
 * counters are read, extended by 1 << 16 if the matching overflow flag is
 * set, accumulated into the t19x ecc_stats counters and then zeroed in
 * hardware. Finally the status register is written with its reset_task value.
 *
 * post_event, fault_ch and hww_global_esr are accepted but not used here.
 *
 * Returns 0 in all cases.
 */
static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
			bool *post_event, struct channel_gk20a *fault_ch,
			u32 *hww_global_esr)
{
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 reg_offset = gpc_stride * gpc + tpc_stride * tpc;
	/* Union of the warp/barrier corrected-error flags for SM0 and SM1. */
	const u32 corrected_mask =
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm0_m() |
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm1_m() |
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm0_m() |
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm1_m();
	/* Union of the warp/barrier uncorrected-error flags for SM0 and SM1. */
	const u32 uncorrected_mask =
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm0_m() |
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm1_m() |
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm0_m() |
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm1_m();
	u32 status;
	u32 corrected_bits;
	u32 uncorrected_bits;
	u32 corrected_delta;
	u32 uncorrected_delta;
	bool corrected_overflow;
	bool uncorrected_overflow;

	/* Check for CBU ECC errors. */
	status = gk20a_readl(g,
			gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + reg_offset);
	corrected_bits = status & corrected_mask;
	uncorrected_bits = status & uncorrected_mask;

	/* No CBU error flag set: leave counters and status untouched. */
	if ((corrected_bits | uncorrected_bits) == 0)
		return 0;

	corrected_delta =
		gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_v(
			gk20a_readl(g,
				gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() +
				reg_offset));
	uncorrected_delta =
		gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_v(
			gk20a_readl(g,
				gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() +
				reg_offset));

	corrected_overflow =
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_total_counter_overflow_v(status);
	uncorrected_overflow =
		gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_total_counter_overflow_v(status);

	if (corrected_overflow || (corrected_delta != 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"corrected error (SBE) detected in SM CBU! err_mask [%08x] is_overf [%d]",
			corrected_bits, corrected_overflow);

		/* HW uses 16-bits counter: account for one wrap on overflow. */
		if (corrected_overflow)
			corrected_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s();

		g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count.counters[tpc] +=
			corrected_delta;
		/* Zero the HW counter now that it has been accumulated. */
		gk20a_writel(g,
			gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + reg_offset,
			0);
	}

	if (uncorrected_overflow || (uncorrected_delta != 0)) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"Uncorrected error (DBE) detected in SM CBU! err_mask [%08x] is_overf [%d]",
			uncorrected_bits, uncorrected_overflow);

		/* HW uses 16-bits counter: account for one wrap on overflow. */
		if (uncorrected_overflow)
			uncorrected_delta += 1U <<
				gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s();

		g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count.counters[tpc] +=
			uncorrected_delta;
		/* Zero the HW counter now that it has been accumulated. */
		gk20a_writel(g,
			gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + reg_offset,
			0);
	}

	gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + reg_offset,
			gr_pri_gpc0_tpc0_sm_cbu_ecc_status_reset_task_f());

	return 0;
}
/*
 * SM exception hook: run the per-unit ECC handlers (L1 tag, LRF, CBU) for
 * the given GPC/TPC.
 *
 * The LRF handler reads the LRF ECC status register itself and resets it
 * once the counters have been accumulated, so this function must not read
 * that register up front and write the stale value back afterwards; doing so
 * would touch the register a second time after the handler already reset it.
 *
 * The handlers' return values are not propagated; this function returns 0.
 */
static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
			bool *post_event, struct channel_gk20a *fault_ch,
			u32 *hww_global_esr)
{
	/* Check for L1 tag ECC errors. */
	gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch,
					hww_global_esr);

	/* Check for LRF ECC errors. */
	gr_gv11b_handle_lrf_exception(g, gpc, tpc, post_event, fault_ch,
					hww_global_esr);

	/* Check for CBU ECC errors. */
	gr_gv11b_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch,
					hww_global_esr);

	return 0;
}

View File

@@ -39,6 +39,8 @@ struct gr_t19x {
/* Software ECC error counters, accumulated by the gv11b SM ECC handlers. */
struct {
/* SM L1 tag corrected / uncorrected error counts. */
struct gr_gp10b_ecc_stat sm_l1_tag_corrected_err_count;
struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count;
/* SM CBU corrected / uncorrected error counts. */
struct gr_gp10b_ecc_stat sm_cbu_corrected_err_count;
struct gr_gp10b_ecc_stat sm_cbu_uncorrected_err_count;
} ecc_stats;
};

View File

@@ -125,6 +125,8 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
/*
 * Attribute pointers for the ECC statistics: each one is passed to
 * gr_gp10b_ecc_stat_create() when the sysfs nodes are created and to
 * gr_gp10b_ecc_stat_remove() when they are removed.
 */
static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_sm_cbu_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_cbu_ecc_uncorrected_err_count_array;
void gr_gv11b_create_sysfs(struct device *dev)
{
@@ -151,6 +153,18 @@ void gr_gv11b_create_sysfs(struct device *dev)
&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
error |= gr_gp10b_ecc_stat_create(dev,
0,
"sm_cbu_ecc_corrected_err_count",
&g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count,
dev_attr_sm_cbu_ecc_corrected_err_count_array);
error |= gr_gp10b_ecc_stat_create(dev,
0,
"sm_cbu_ecc_uncorrected_err_count",
&g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count,
dev_attr_sm_cbu_ecc_uncorrected_err_count_array);
if (error)
dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
}
@@ -168,4 +182,15 @@ static void gr_gv11b_remove_sysfs(struct device *dev)
0,
&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
gr_gp10b_ecc_stat_remove(dev,
0,
&g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count,
dev_attr_sm_cbu_ecc_corrected_err_count_array);
gr_gp10b_ecc_stat_remove(dev,
0,
&g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count,
dev_attr_sm_cbu_ecc_uncorrected_err_count_array);
}

View File

@@ -482,6 +482,106 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(void)
{
return 0x00504358;
}
/* SM LRF ECC status: corrected-error flag masks, QRFDP0..QRFDP7 (bits 0-7). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m(void)
{
	return 0x1U << 0;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m(void)
{
	return 0x1U << 1;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m(void)
{
	return 0x1U << 2;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m(void)
{
	return 0x1U << 3;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp4_m(void)
{
	return 0x1U << 4;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp5_m(void)
{
	return 0x1U << 5;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp6_m(void)
{
	return 0x1U << 6;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp7_m(void)
{
	return 0x1U << 7;
}
/* SM LRF ECC status: uncorrected-error flag masks, QRFDP0..QRFDP7 (bits 8-15). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m(void)
{
	return 0x1U << 8;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m(void)
{
	return 0x1U << 9;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m(void)
{
	return 0x1U << 10;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m(void)
{
	return 0x1U << 11;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp4_m(void)
{
	return 0x1U << 12;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp5_m(void)
{
	return 0x1U << 13;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp6_m(void)
{
	return 0x1U << 14;
}

static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp7_m(void)
{
	return 0x1U << 15;
}
/* SM LRF ECC status: extract the corrected-counter overflow flag (bit 24). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 24) & 0x1U;
}

/* SM LRF ECC status: extract the uncorrected-counter overflow flag (bit 26). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 26) & 0x1U;
}

/* SM LRF ECC status: value written to the status register to reset it. */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_reset_task_f(void)
{
	return 0x40000000U;
}
/* SM LRF ECC corrected-error count register (GPC0/TPC0 address). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r(void)
{
	return 0x0050435cU;
}

/* Size of the TOTAL field; matches the 16-bit mask applied by _total_v(). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s(void)
{
	return 16U;
}

/* Extract the TOTAL field (low 16 bits) from a register value. */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* SM LRF ECC uncorrected-error count register (GPC0/TPC0 address). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(void)
{
	return 0x00504360U;
}

/* Size of the TOTAL field; matches the 16-bit mask applied by _total_v(). */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s(void)
{
	return 16U;
}

/* Extract the TOTAL field (low 16 bits) from a register value. */
static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void)
{
return 0x00504624;
@@ -554,6 +654,78 @@ static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(u
{
return (r >> 0) & 0xffff;
}
/* SM CBU ECC status register (GPC0/TPC0 address). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r(void)
{
	return 0x00504638U;
}

/* Corrected-error flag masks: warp SM0/SM1, barrier SM0/SM1 (bits 0-3). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm0_m(void)
{
	return 0x1U << 0;
}

static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm1_m(void)
{
	return 0x1U << 1;
}

static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm0_m(void)
{
	return 0x1U << 2;
}

static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm1_m(void)
{
	return 0x1U << 3;
}

/* Uncorrected-error flag masks: warp SM0/SM1, barrier SM0/SM1 (bits 4-7). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm0_m(void)
{
	return 0x1U << 4;
}

static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm1_m(void)
{
	return 0x1U << 5;
}

static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm0_m(void)
{
	return 0x1U << 6;
}

static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm1_m(void)
{
	return 0x1U << 7;
}
/* SM CBU ECC status: extract the corrected-counter overflow flag (bit 16). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 16) & 0x1U;
}

/* SM CBU ECC status: extract the uncorrected-counter overflow flag (bit 18). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 18) & 0x1U;
}

/* SM CBU ECC status: value written to the status register to reset it. */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_reset_task_f(void)
{
	return 0x40000000U;
}
/* SM CBU ECC corrected-error count register (GPC0/TPC0 address). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r(void)
{
	return 0x0050463cU;
}

/* Size of the TOTAL field; matches the 16-bit mask applied by _total_v(). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s(void)
{
	return 16U;
}

/* Extract the TOTAL field (low 16 bits) from a register value. */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* SM CBU ECC uncorrected-error count register (GPC0/TPC0 address). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(void)
{
	return 0x00504640U;
}

/* Size of the TOTAL field; matches the 16-bit mask applied by _total_v(). */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s(void)
{
	return 16U;
}

/* Extract the TOTAL field (low 16 bits) from a register value. */
static inline u32 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
static inline u32 gr_pri_gpc0_tpc0_tex_m_routing_r(void)
{
return 0x005042c4;