mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 18:42:29 +03:00
gpu: nvgpu: gv11b: Add L1 tags parity support
This CL covers the following parity support (corrected + uncorrected):
1) SM's L1 tags
2) SM's S2R's pixel PRF buffer
3) SM's L1 D-cache miss latency FIFOs

Volta Resiliency Id - Volta-720, Volta-721, Volta-637

JIRA GPUT19X-85
JIRA GPUT19X-104
JIRA GPUT19X-100
JIRA GPUT19X-103

Bug 1825948
Bug 1825962
Bug 1775457

Change-Id: I53d7231a36b2c7c252395eca27b349eca80dec63
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1478881
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
808af68d96
commit
ffc37e50fa
@@ -108,6 +108,89 @@ static bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num)
|
||||
return valid;
|
||||
}
|
||||
|
||||
static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
|
||||
bool *post_event, struct channel_gk20a *fault_ch,
|
||||
u32 *hww_global_esr)
|
||||
{
|
||||
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
|
||||
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
|
||||
u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
|
||||
u32 l1_tag_ecc_status, l1_tag_ecc_corrected_err_status = 0;
|
||||
u32 l1_tag_ecc_uncorrected_err_status = 0;
|
||||
u32 l1_tag_corrected_err_count_delta = 0;
|
||||
u32 l1_tag_uncorrected_err_count_delta = 0;
|
||||
bool is_l1_tag_ecc_corrected_total_err_overflow = 0;
|
||||
bool is_l1_tag_ecc_uncorrected_total_err_overflow = 0;
|
||||
|
||||
/* Check for L1 tag ECC errors. */
|
||||
l1_tag_ecc_status = gk20a_readl(g,
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset);
|
||||
l1_tag_ecc_corrected_err_status = l1_tag_ecc_status &
|
||||
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m());
|
||||
l1_tag_ecc_uncorrected_err_status = l1_tag_ecc_status &
|
||||
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m());
|
||||
|
||||
if ((l1_tag_ecc_corrected_err_status == 0) && (l1_tag_ecc_uncorrected_err_status == 0))
|
||||
return 0;
|
||||
|
||||
l1_tag_corrected_err_count_delta =
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(
|
||||
gk20a_readl(g,
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() +
|
||||
offset));
|
||||
l1_tag_uncorrected_err_count_delta =
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(
|
||||
gk20a_readl(g,
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() +
|
||||
offset));
|
||||
is_l1_tag_ecc_corrected_total_err_overflow =
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(l1_tag_ecc_status);
|
||||
is_l1_tag_ecc_uncorrected_total_err_overflow =
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(l1_tag_ecc_status);
|
||||
|
||||
if ((l1_tag_corrected_err_count_delta > 0) || is_l1_tag_ecc_corrected_total_err_overflow) {
|
||||
gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
|
||||
"corrected error (SBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]",
|
||||
l1_tag_ecc_corrected_err_status, is_l1_tag_ecc_corrected_total_err_overflow);
|
||||
|
||||
/* HW uses 16-bits counter */
|
||||
l1_tag_corrected_err_count_delta +=
|
||||
(is_l1_tag_ecc_corrected_total_err_overflow <<
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s());
|
||||
g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters[tpc] +=
|
||||
l1_tag_corrected_err_count_delta;
|
||||
gk20a_writel(g,
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
|
||||
0);
|
||||
}
|
||||
if ((l1_tag_uncorrected_err_count_delta > 0) || is_l1_tag_ecc_uncorrected_total_err_overflow) {
|
||||
gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
|
||||
"Uncorrected error (DBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]",
|
||||
l1_tag_ecc_uncorrected_err_status, is_l1_tag_ecc_uncorrected_total_err_overflow);
|
||||
|
||||
/* HW uses 16-bits counter */
|
||||
l1_tag_uncorrected_err_count_delta +=
|
||||
(is_l1_tag_ecc_uncorrected_total_err_overflow <<
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s());
|
||||
g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count.counters[tpc] +=
|
||||
l1_tag_uncorrected_err_count_delta;
|
||||
gk20a_writel(g,
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
|
||||
0);
|
||||
}
|
||||
|
||||
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset,
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f());
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
|
||||
bool *post_event, struct channel_gk20a *fault_ch,
|
||||
@@ -118,7 +201,8 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
|
||||
proj_tpc_in_gpc_stride_v() * tpc;
|
||||
u32 lrf_ecc_status;
|
||||
|
||||
gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
|
||||
/* Check for L1 tag ECC errors. */
|
||||
gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
|
||||
|
||||
/* Check for LRF ECC errors. */
|
||||
lrf_ecc_status = gk20a_readl(g,
|
||||
@@ -1692,6 +1776,7 @@ void gv11b_init_gr(struct gpu_ops *gops)
|
||||
gops->gr.pre_process_sm_exception =
|
||||
gr_gv11b_pre_process_sm_exception;
|
||||
gops->gr.handle_fecs_error = gr_gv11b_handle_fecs_error;
|
||||
gops->gr.create_gr_sysfs = gr_gv11b_create_sysfs;
|
||||
gops->gr.setup_rop_mapping = gr_gv11b_setup_rop_mapping;
|
||||
gops->gr.init_sw_veid_bundle = gr_gv11b_init_sw_veid_bundle;
|
||||
gops->gr.program_zcull_mapping = gr_gv11b_program_zcull_mapping;
|
||||
|
||||
@@ -35,6 +35,13 @@ enum {
|
||||
VOLTA_DMA_COPY_A = 0xC3B5,
|
||||
};
|
||||
|
||||
struct gr_t19x {
|
||||
struct {
|
||||
struct gr_gp10b_ecc_stat sm_l1_tag_corrected_err_count;
|
||||
struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count;
|
||||
} ecc_stats;
|
||||
};
|
||||
|
||||
#define NVC397_SET_SHADER_EXCEPTIONS 0x1528
|
||||
#define NVC397_SET_CIRCULAR_BUFFER_SIZE 0x1280
|
||||
#define NVC397_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
|
||||
@@ -48,4 +55,5 @@ int gr_gv11b_alloc_buffer(struct vm_gk20a *vm, size_t size,
|
||||
/*zcull*/
|
||||
void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
|
||||
u32 *zcull_map_tiles);
|
||||
void gr_gv11b_create_sysfs(struct device *dev);
|
||||
#endif
|
||||
|
||||
@@ -27,11 +27,13 @@
|
||||
#include "tegra/linux/clk.h"
|
||||
|
||||
#include "gp10b/platform_gp10b.h"
|
||||
#include "tegra/linux/platform_gp10b_tegra.h"
|
||||
|
||||
#include "tegra/linux/platform_gk20a_tegra.h"
|
||||
#include "gr_gv11b.h"
|
||||
#include "nvgpu_gpuid_t19x.h"
|
||||
|
||||
static void gr_gv11b_remove_sysfs(struct device *dev);
|
||||
|
||||
static int gv11b_tegra_probe(struct device *dev)
|
||||
{
|
||||
@@ -57,6 +59,15 @@ static int gv11b_tegra_probe(struct device *dev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Remove handler for the gv11b Tegra platform.
 *
 * Runs the gp10b remove path and then removes the gv11b ECC stat entries.
 * Always returns 0.
 */
static int gv11b_tegra_remove(struct device *dev)
{
	/* The gp10b result is deliberately not propagated. */
	(void)gp10b_tegra_remove(dev);

	gr_gv11b_remove_sysfs(dev);

	return 0;
}
|
||||
|
||||
static bool gv11b_tegra_is_railgated(struct device *dev)
|
||||
{
|
||||
bool ret = false;
|
||||
@@ -89,6 +100,7 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
|
||||
.ptimer_src_freq = 31250000,
|
||||
|
||||
.probe = gv11b_tegra_probe,
|
||||
.remove = gv11b_tegra_remove,
|
||||
|
||||
/* power management callbacks */
|
||||
.suspend = gv11b_tegra_suspend,
|
||||
@@ -110,3 +122,50 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
|
||||
.reset_assert = gp10b_tegra_reset_assert,
|
||||
.reset_deassert = gp10b_tegra_reset_deassert,
|
||||
};
|
||||
|
||||
/*
 * Attribute-array pointers for the SM L1 tag ECC stats. They are handed to
 * gr_gp10b_ecc_stat_create() and gr_gp10b_ecc_stat_remove().
 *
 * NOTE(review): both are passed by value and never assigned in the code
 * visible here, so they may still be NULL at remove time - confirm against
 * the gr_gp10b_ecc_stat_create() signature.
 */
static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array;
|
||||
|
||||
void gr_gv11b_create_sysfs(struct device *dev)
|
||||
{
|
||||
struct gk20a *g = get_gk20a(dev);
|
||||
int error = 0;
|
||||
/* This stat creation function is called on GR init. GR can get
|
||||
initialized multiple times but we only need to create the ECC
|
||||
stats once. Therefore, add the following check to avoid
|
||||
creating duplicate stat sysfs nodes. */
|
||||
if (g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters != NULL)
|
||||
return;
|
||||
|
||||
gr_gp10b_create_sysfs(dev);
|
||||
|
||||
error |= gr_gp10b_ecc_stat_create(dev,
|
||||
0,
|
||||
"sm_l1_tag_ecc_corrected_err_count",
|
||||
&g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count,
|
||||
dev_attr_sm_l1_tag_ecc_corrected_err_count_array);
|
||||
|
||||
error |= gr_gp10b_ecc_stat_create(dev,
|
||||
0,
|
||||
"sm_l1_tag_ecc_uncorrected_err_count",
|
||||
&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
|
||||
dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
|
||||
|
||||
if (error)
|
||||
dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
|
||||
}
|
||||
|
||||
static void gr_gv11b_remove_sysfs(struct device *dev)
|
||||
{
|
||||
struct gk20a *g = get_gk20a(dev);
|
||||
|
||||
gr_gp10b_ecc_stat_remove(dev,
|
||||
0,
|
||||
&g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count,
|
||||
dev_attr_sm_l1_tag_ecc_corrected_err_count_array);
|
||||
|
||||
gr_gp10b_ecc_stat_remove(dev,
|
||||
0,
|
||||
&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
|
||||
dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
|
||||
}
|
||||
|
||||
@@ -482,6 +482,78 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(void)
|
||||
{
|
||||
return 0x00504358;
|
||||
}
|
||||
/*
 * Address of the SM L1 tag ECC status register. The exception handler adds
 * a per-GPC/TPC offset to this value before reading or writing it.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void)
{
	return 0x00504624U;
}
|
||||
/* Mask of the corrected_err_el1_0 bit (bit 0) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m(void)
{
	return 1U << 0U;
}
|
||||
/* Mask of the corrected_err_el1_1 bit (bit 1) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m(void)
{
	return 1U << 1U;
}
|
||||
/* Mask of the uncorrected_err_el1_0 bit (bit 2) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m(void)
{
	return 1U << 2U;
}
|
||||
/* Mask of the uncorrected_err_el1_1 bit (bit 3) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m(void)
{
	return 1U << 3U;
}
|
||||
/* Mask of the corrected_err_pixrpf bit (bit 4) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m(void)
{
	return 1U << 4U;
}
|
||||
/* Mask of the corrected_err_miss_fifo bit (bit 5) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m(void)
{
	return 1U << 5U;
}
|
||||
/* Mask of the uncorrected_err_pixrpf bit (bit 6) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m(void)
{
	return 1U << 6U;
}
|
||||
/* Mask of the uncorrected_err_miss_fifo bit (bit 7) in the L1 tag ECC status. */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m(void)
{
	return 1U << 7U;
}
|
||||
/*
 * Extract the corrected_err_total_counter_overflow bit (bit 8) from an
 * L1 tag ECC status register value. Returns 0 or 1.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 8U) & 0x1U;
}
|
||||
/*
 * Extract the uncorrected_err_total_counter_overflow bit (bit 10) from an
 * L1 tag ECC status register value. Returns 0 or 1.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
{
	return (r >> 10U) & 0x1U;
}
|
||||
/*
 * reset_task field value (bit 30 set) for the L1 tag ECC status register.
 * The exception handler writes it to the status register once the error
 * counters have been processed.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f(void)
{
	return 0x40000000U;
}
|
||||
/*
 * Address of the SM L1 tag ECC corrected error count register. The
 * exception handler adds a per-GPC/TPC offset before accessing it.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(void)
{
	return 0x00504628U;
}
|
||||
/*
 * Returns 16; the exception handler uses it as the shift amount when a
 * corrected-counter overflow is accounted for. Presumably the bit width of
 * the "total" field of the corrected error count register.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s(void)
{
	return 16U;
}
|
||||
/*
 * Extract the "total" field (low 16 bits) from a corrected error count
 * register value.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
|
||||
/*
 * Address of the SM L1 tag ECC uncorrected error count register. The
 * exception handler adds a per-GPC/TPC offset before accessing it.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(void)
{
	return 0x0050462cU;
}
|
||||
/*
 * Returns 16; the exception handler uses it as the shift amount when an
 * uncorrected-counter overflow is accounted for. Presumably the bit width
 * of the "total" field of the uncorrected error count register.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s(void)
{
	return 16U;
}
|
||||
/*
 * Extract the "total" field (low 16 bits) from an uncorrected error count
 * register value.
 */
static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
|
||||
static inline u32 gr_pri_gpc0_tpc0_tex_m_routing_r(void)
|
||||
{
|
||||
return 0x005042c4;
|
||||
|
||||
Reference in New Issue
Block a user