gpu: nvgpu: gv11b: Add L1 tags parity support

This CL adds parity error handling (corrected and uncorrected) for the following:
1) SM's L1 tags
2) SM's S2R's pixel PRF buffer
3) SM's L1 D-cache miss latency FIFOs

Volta Resiliency IDs: Volta-720, Volta-721, Volta-637

JIRA GPUT19X-85
JIRA GPUT19X-104
JIRA GPUT19X-100
JIRA GPUT19X-103

Bug 1825948
Bug 1825962
Bug 1775457

Change-Id: I53d7231a36b2c7c252395eca27b349eca80dec63
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1478881
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Lakshmanan M
2017-05-10 12:38:08 +05:30
committed by mobile promotions
parent 808af68d96
commit ffc37e50fa
4 changed files with 225 additions and 1 deletions

View File

@@ -108,6 +108,89 @@ static bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num)
return valid;
}
/*
 * Service SM L1 tag ECC/parity errors for one TPC.
 *
 * Reads the L1 tag ECC status register of the SM addressed by (gpc, tpc).
 * If any corrected or uncorrected error flag is pending, the hardware error
 * counters are folded into the software statistics, the hardware counters
 * are zeroed, and the status register is reset.
 *
 * @g:              GPU instance
 * @gpc:            GPC index, used to compute the register offset
 * @tpc:            TPC index within the GPC; used for the register offset
 *                  and as the index into the software counter arrays
 * @post_event:     not used by this handler
 * @fault_ch:       not used by this handler
 * @hww_global_esr: not used by this handler
 *
 * Always returns 0.
 *
 * NOTE(review): the software counters are indexed by tpc only, so TPCs with
 * the same index in different GPCs accumulate into the same slot -- confirm
 * that this is intended.
 */
static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
			bool *post_event, struct channel_gk20a *fault_ch,
			u32 *hww_global_esr)
{
	const u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	const u32 tpc_stride =
		nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	const u32 offset = gpc_stride * gpc + tpc_stride * tpc;
	/* All status flags that report a corrected error. */
	const u32 corrected_mask =
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m() |
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() |
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m();
	/* All status flags that report an uncorrected error. */
	const u32 uncorrected_mask =
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m() |
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m() |
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m();
	u32 status;
	u32 corrected_pending;
	u32 uncorrected_pending;
	u32 corrected_delta;
	u32 uncorrected_delta;
	bool corrected_overflow;
	bool uncorrected_overflow;

	/* Fetch the per-SM status and pick out the pending error flags. */
	status = gk20a_readl(g,
			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset);
	corrected_pending = status & corrected_mask;
	uncorrected_pending = status & uncorrected_mask;

	/* Nothing pending: leave the hardware state untouched. */
	if (!corrected_pending && !uncorrected_pending)
		return 0;

	/* Snapshot both hardware counters before deciding what to report. */
	corrected_delta =
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(
			gk20a_readl(g,
				gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() +
				offset));
	uncorrected_delta =
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(
			gk20a_readl(g,
				gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() +
				offset));

	corrected_overflow =
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(status);
	uncorrected_overflow =
		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(status);

	if (corrected_delta != 0 || corrected_overflow) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"corrected error (SBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]",
			corrected_pending, corrected_overflow);

		/*
		 * The total field is 16 bits wide; a set overflow flag is
		 * accounted for as one extra wrap of that field.
		 */
		corrected_delta += corrected_overflow <<
			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s();
		g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters[tpc] +=
			corrected_delta;

		/* Restart the hardware count from zero. */
		gk20a_writel(g,
			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
			0);
	}

	if (uncorrected_delta != 0 || uncorrected_overflow) {
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
			"Uncorrected error (DBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]",
			uncorrected_pending, uncorrected_overflow);

		/* Same 16-bit wrap accounting as for the corrected counter. */
		uncorrected_delta += uncorrected_overflow <<
			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s();
		g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count.counters[tpc] +=
			uncorrected_delta;

		/* Restart the hardware count from zero. */
		gk20a_writel(g,
			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
			0);
	}

	/* Reset the status register now that the errors are accounted for. */
	gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset,
			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f());

	return 0;
}
static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
@@ -118,7 +201,8 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
proj_tpc_in_gpc_stride_v() * tpc;
u32 lrf_ecc_status;
gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
/* Check for L1 tag ECC errors. */
gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
/* Check for LRF ECC errors. */
lrf_ecc_status = gk20a_readl(g,
@@ -1692,6 +1776,7 @@ void gv11b_init_gr(struct gpu_ops *gops)
gops->gr.pre_process_sm_exception =
gr_gv11b_pre_process_sm_exception;
gops->gr.handle_fecs_error = gr_gv11b_handle_fecs_error;
gops->gr.create_gr_sysfs = gr_gv11b_create_sysfs;
gops->gr.setup_rop_mapping = gr_gv11b_setup_rop_mapping;
gops->gr.init_sw_veid_bundle = gr_gv11b_init_sw_veid_bundle;
gops->gr.program_zcull_mapping = gr_gv11b_program_zcull_mapping;

View File

@@ -35,6 +35,13 @@ enum {
VOLTA_DMA_COPY_A = 0xC3B5,
};
/*
 * t19x (gv11b) specific GR state.
 *
 * ecc_stats holds the software ECC statistics for the SM L1 tag. The L1 tag
 * exception handler adds the hardware error counts to these counters, and
 * gr_gv11b_create_sysfs() registers them as sysfs ECC stats.
 */
struct gr_t19x {
struct {
/* Accumulated corrected SM L1 tag errors. */
struct gr_gp10b_ecc_stat sm_l1_tag_corrected_err_count;
/* Accumulated uncorrected SM L1 tag errors. */
struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count;
} ecc_stats;
};
/*
 * NVC397 method identifiers.
 * NOTE(review): presumably method offsets within the NVC397 class -- confirm
 * against the class definition; their users are not visible here.
 */
#define NVC397_SET_SHADER_EXCEPTIONS 0x1528
#define NVC397_SET_CIRCULAR_BUFFER_SIZE 0x1280
#define NVC397_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
@@ -48,4 +55,5 @@ int gr_gv11b_alloc_buffer(struct vm_gk20a *vm, size_t size,
/*
 * zcull
 *
 * Installed as the gr.program_zcull_mapping HAL op by gv11b_init_gr().
 * NOTE(review): the definition is not visible here; presumably
 * zcull_map_tiles holds zcull_num_entries entries -- confirm.
 */
void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
u32 *zcull_map_tiles);
/*
 * Create the GR sysfs ECC stat nodes for gv11b: the gp10b nodes plus the
 * SM L1 tag corrected/uncorrected error counters. Installed as the
 * gr.create_gr_sysfs HAL op.
 */
void gr_gv11b_create_sysfs(struct device *dev);
#endif

View File

@@ -27,11 +27,13 @@
#include "tegra/linux/clk.h"
#include "gp10b/platform_gp10b.h"
#include "tegra/linux/platform_gp10b_tegra.h"
#include "tegra/linux/platform_gk20a_tegra.h"
#include "gr_gv11b.h"
#include "nvgpu_gpuid_t19x.h"
/*
 * Forward declaration: the definition is at the end of this file, but
 * gv11b_tegra_remove() needs to call it earlier.
 */
static void gr_gv11b_remove_sysfs(struct device *dev);
static int gv11b_tegra_probe(struct device *dev)
{
@@ -57,6 +59,15 @@ static int gv11b_tegra_probe(struct device *dev)
return 0;
}
/*
 * Platform remove callback for gv11b (installed as .remove in
 * t19x_gpu_tegra_platform).
 *
 * Runs the gp10b remove path and then removes the gv11b-specific SM L1 tag
 * ECC sysfs stats. Always returns 0.
 *
 * NOTE(review): any return value of gp10b_tegra_remove() is not checked --
 * confirm that this is intended.
 */
static int gv11b_tegra_remove(struct device *dev)
{
/* Common gp10b teardown first, then the gv11b-only sysfs stats. */
gp10b_tegra_remove(dev);
gr_gv11b_remove_sysfs(dev);
return 0;
}
static bool gv11b_tegra_is_railgated(struct device *dev)
{
bool ret = false;
@@ -89,6 +100,7 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
.ptimer_src_freq = 31250000,
.probe = gv11b_tegra_probe,
.remove = gv11b_tegra_remove,
/* power management callbacks */
.suspend = gv11b_tegra_suspend,
@@ -110,3 +122,50 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
.reset_assert = gp10b_tegra_reset_assert,
.reset_deassert = gp10b_tegra_reset_deassert,
};
/*
 * Device attribute arrays for the SM L1 tag ECC sysfs stats. They are handed
 * to gr_gp10b_ecc_stat_create() and gr_gp10b_ecc_stat_remove() below.
 * Nothing in this file assigns them, so they start out NULL.
 */
static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array;
/*
 * Create the GR ECC sysfs stats for gv11b.
 *
 * Creates the gp10b stats first and then adds the two SM L1 tag counters
 * (corrected and uncorrected). A failure is only logged; nothing is
 * returned to the caller.
 *
 * NOTE(review): the dev_attr_*_array pointers are passed by value. If
 * gr_gp10b_ecc_stat_create() allocates the attribute array itself, the
 * file-scope pointers stay NULL and gr_gv11b_remove_sysfs() cannot release
 * it -- confirm against the helper's signature.
 */
void gr_gv11b_create_sysfs(struct device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	int err;

	/*
	 * GR init can run more than once, but the ECC stat nodes must only
	 * be created a single time. An already populated counters array
	 * means a previous call has done the work.
	 */
	if (g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters)
		return;

	gr_gp10b_create_sysfs(dev);

	err = gr_gp10b_ecc_stat_create(dev, 0,
			"sm_l1_tag_ecc_corrected_err_count",
			&g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count,
			dev_attr_sm_l1_tag_ecc_corrected_err_count_array);

	/* Keep going after a failure so both stats are attempted. */
	err |= gr_gp10b_ecc_stat_create(dev, 0,
			"sm_l1_tag_ecc_uncorrected_err_count",
			&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
			dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);

	if (err != 0)
		dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
}
/*
 * Remove the gv11b-specific SM L1 tag ECC stats (corrected first, then
 * uncorrected) that gr_gv11b_create_sysfs() registered.
 */
static void gr_gv11b_remove_sysfs(struct device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	struct gr_gp10b_ecc_stat *corrected_stat =
		&g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count;
	struct gr_gp10b_ecc_stat *uncorrected_stat =
		&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count;

	gr_gp10b_ecc_stat_remove(dev, 0, corrected_stat,
			dev_attr_sm_l1_tag_ecc_corrected_err_count_array);

	gr_gp10b_ecc_stat_remove(dev, 0, uncorrected_stat,
			dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
}

View File

@@ -482,6 +482,78 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(void)
{
return 0x00504358;
}
/*
 * Address of the SM L1 tag ECC status register for GPC0/TPC0. Callers add
 * the per-GPC/TPC offset to reach other units.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void)
{
	const u32 reg_addr = 0x00504624;

	return reg_addr;
}
/* Mask of the EL1_0 corrected-error flag (bit 0) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m(void)
{
	const u32 bit_pos = 0;

	return (u32)0x1 << bit_pos;
}
/* Mask of the EL1_1 corrected-error flag (bit 1) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m(void)
{
	const u32 bit_pos = 1;

	return (u32)0x1 << bit_pos;
}
/* Mask of the EL1_0 uncorrected-error flag (bit 2) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m(void)
{
	const u32 bit_pos = 2;

	return (u32)0x1 << bit_pos;
}
/* Mask of the EL1_1 uncorrected-error flag (bit 3) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m(void)
{
	const u32 bit_pos = 3;

	return (u32)0x1 << bit_pos;
}
/* Mask of the PIXRPF corrected-error flag (bit 4) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m(void)
{
	const u32 bit_pos = 4;

	return (u32)0x1 << bit_pos;
}
/* Mask of the miss-FIFO corrected-error flag (bit 5) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m(void)
{
	const u32 bit_pos = 5;

	return (u32)0x1 << bit_pos;
}
/* Mask of the PIXRPF uncorrected-error flag (bit 6) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m(void)
{
	const u32 bit_pos = 6;

	return (u32)0x1 << bit_pos;
}
/* Mask of the miss-FIFO uncorrected-error flag (bit 7) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m(void)
{
	const u32 bit_pos = 7;

	return (u32)0x1 << bit_pos;
}
/*
 * Extract the corrected-error total-counter overflow flag (bit 8) from an
 * L1 tag ECC status value r. Returns 0 or 1.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
{
	const u32 shifted = r >> 8;

	return shifted & 0x1;
}
/*
 * Extract the uncorrected-error total-counter overflow flag (bit 10) from
 * an L1 tag ECC status value r. Returns 0 or 1.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
{
	const u32 shifted = r >> 10;

	return shifted & 0x1;
}
/*
 * Value written to the L1 tag ECC status register to reset it once the
 * pending errors have been handled (bit 30 set).
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f(void)
{
	const u32 reset_value = 0x40000000;

	return reset_value;
}
/*
 * Address of the SM L1 tag corrected-error count register for GPC0/TPC0.
 * Callers add the per-GPC/TPC offset to reach other units.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(void)
{
	const u32 reg_addr = 0x00504628;

	return reg_addr;
}
/* Width in bits of the total field of the corrected-error count register. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s(void)
{
	const u32 field_width = 16;

	return field_width;
}
/*
 * Extract the total field (low 16 bits) from a corrected-error count
 * register value r.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(u32 r)
{
	const u32 total_mask = 0xffff;

	return r & total_mask;
}
/*
 * Address of the SM L1 tag uncorrected-error count register for GPC0/TPC0.
 * Callers add the per-GPC/TPC offset to reach other units.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(void)
{
	const u32 reg_addr = 0x0050462c;

	return reg_addr;
}
/* Width in bits of the total field of the uncorrected-error count register. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s(void)
{
	const u32 field_width = 16;

	return field_width;
}
/*
 * Extract the total field (low 16 bits) from an uncorrected-error count
 * register value r.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(u32 r)
{
	const u32 total_mask = 0xffff;

	return r & total_mask;
}
static inline u32 gr_pri_gpc0_tpc0_tex_m_routing_r(void)
{
return 0x005042c4;