gpu: nvgpu: GPC MMU ECC support

Add support for handling GPC MMU ECC errors.

JIRA: GPUT19X-112

Change-Id: I62083bf2f144ff628ecd8c0aefc8d227a233ff36
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1490772
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
David Nieto
2017-05-26 08:31:46 -07:00
committed by mobile promotions
parent 6bc36bded0
commit 345eaef6a7
4 changed files with 332 additions and 16 deletions

View File

@@ -31,6 +31,8 @@ struct ecc_gr_t19x {
struct gk20a_ecc_stat fecs_uncorrected_err_count;
struct gk20a_ecc_stat gpccs_corrected_err_count;
struct gk20a_ecc_stat gpccs_uncorrected_err_count;
struct gk20a_ecc_stat mmu_l1tlb_corrected_err_count;
struct gk20a_ecc_stat mmu_l1tlb_uncorrected_err_count;
};
struct ecc_ltc_t19x {

View File

@@ -658,16 +658,101 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
return 0;
}
/*
 * Handle a GPC MMU L1 TLB ECC exception for a single GPC.
 *
 * Reads the GPCMMU global ESR of the given GPC and returns immediately
 * when neither the corrected nor the uncorrected ECC bit is set.
 * Otherwise it snapshots the L1 TLB ECC status, address and error-count
 * registers, zeroes each hardware counter that reported errors or
 * overflowed, writes the reset task to the status register, adds the
 * deltas to g->ecc.gr.t19x.mmu_l1tlb_{corrected,uncorrected}_err_count
 * for this GPC and logs what was observed.
 *
 * @g:         GPU device state.
 * @gpc:       GPC index; multiplied by the GPC stride to form the register
 *             offset and used to index the per-GPC counters.
 * @exception: exception bits passed in by the caller; not used here.
 *
 * Returns 0 in all cases.
 */
static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
		u32 exception)
{
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 offset = gpc_stride * gpc;
	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
	u32 corrected_delta, uncorrected_delta;
	u32 corrected_overflow, uncorrected_overflow;
	u32 hww_esr;	/* raw register value, so keep it unsigned */

	hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);

	/* Nothing to do unless an ECC bit is pending in the global ESR. */
	if (!(hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
			 gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())))
		return 0;

	/* Snapshot all ECC state before anything is cleared. */
	ecc_status = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
	ecc_addr = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
	corrected_cnt = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
	uncorrected_cnt = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);

	corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(
							corrected_cnt);
	uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(
							uncorrected_cnt);
	corrected_overflow = ecc_status &
		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m();
	uncorrected_overflow = ecc_status &
		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m();

	/*
	 * Clear the interrupt: zero only the counters that contributed,
	 * then write the reset task to the status register.
	 */
	if ((corrected_delta > 0) || corrected_overflow) {
		gk20a_writel(g,
			gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
			offset, 0);
	}
	if ((uncorrected_delta > 0) || uncorrected_overflow) {
		gk20a_writel(g,
			gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
			offset, 0);
	}

	gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset,
		gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());

	/*
	 * Handle overflow: when the overflow flag is set, add one full
	 * range of the total field (1 << field width) to the delta.
	 */
	if (corrected_overflow)
		corrected_delta += (0x1UL <<
			gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s());
	if (uncorrected_overflow)
		uncorrected_delta += (0x1UL <<
			gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());

	g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc] +=
							corrected_delta;
	g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc] +=
							uncorrected_delta;

	/* gpc is a u32, so print it with an unsigned conversion. */
	nvgpu_log(g, gpu_dbg_intr,
		"mmu l1tlb gpc:%u ecc interrupt intr: 0x%x", gpc, hww_esr);

	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m())
		nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m())
		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m())
		nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m())
		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
	if (corrected_overflow || uncorrected_overflow)
		nvgpu_info(g, "mmu l1tlb ecc counter overflow!");

	nvgpu_log(g, gpu_dbg_intr,
		"ecc error address: 0x%x", ecc_addr);
	nvgpu_log(g, gpu_dbg_intr,
		"ecc error count corrected: %d, uncorrected %d",
		g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc],
		g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc]);

	return 0;
}
static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
u32 exception)
{
int ret = 0;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 offset = gpc_stride * gpc;
u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
u32 corrected_delta, uncorrected_delta;
u32 corrected_overflow, uncorrected_overflow;
int hww_esr;
u32 offset = proj_gpc_stride_v() * gpc;
hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
@@ -741,6 +826,15 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
return ret;
}
/*
 * GPC exception hook for the GPC MMU unit.
 *
 * Forwards to gr_gv11b_handle_gpcmmu_ecc_exception() when the GPCMMU bit
 * is set in gpc_exception; otherwise does nothing and returns 0.
 */
static int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
		u32 gpc_exception)
{
	const u32 gpcmmu_pending =
		gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m();

	if (!gpcmmu_pending)
		return 0;

	return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc, gpc_exception);
}
static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception)
{
@@ -764,7 +858,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
(tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) |
gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)));
gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) |
gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1)));
}
static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1810,7 +1905,7 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
nvgpu_log(g, gpu_dbg_intr,
"dmem ecc error uncorrected");
if (corrected_overflow || uncorrected_overflow)
nvgpu_info(g, "gpccs ecc counter overflow!");
nvgpu_info(g, "fecs ecc counter overflow!");
nvgpu_log(g, gpu_dbg_intr,
"ecc error row address: 0x%x",
@@ -2422,4 +2517,6 @@ void gv11b_init_gr(struct gpu_ops *gops)
gops->gr.handle_gpc_gpccs_exception =
gr_gv11b_handle_gpc_gpccs_exception;
gops->gr.set_czf_bypass = NULL;
gops->gr.handle_gpc_gpcmmu_exception =
gr_gv11b_handle_gpc_gpcmmu_exception;
}

View File

@@ -171,6 +171,8 @@ static struct device_attribute *dev_attr_sm_icache_ecc_corrected_err_count_array
/*
 * Device-attribute array handles for ECC error counters, one pointer per
 * counter kind. The mmu_l1tlb pair is handed to gp10b_ecc_stat_create()
 * and gp10b_ecc_stat_remove() in the sysfs create/remove paths of this
 * file; the others are presumably used the same way (not all call sites
 * are visible here - confirm).
 */
static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_mmu_l1tlb_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array;
static struct device_attribute *dev_attr_fecs_ecc_corrected_err_count_array;
static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array;
@@ -295,6 +297,19 @@ void gr_gv11b_create_sysfs(struct device *dev)
&g->ecc.gr.t19x.gpccs_corrected_err_count,
dev_attr_gpccs_ecc_corrected_err_count_array);
error |= gp10b_ecc_stat_create(dev,
g->gr.gpc_count,
"gpc",
"mmu_l1tlb_ecc_uncorrected_err_count",
&g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count,
dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array);
error |= gp10b_ecc_stat_create(dev,
g->gr.gpc_count,
"gpc",
"mmu_l1tlb_ecc_corrected_err_count",
&g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count,
dev_attr_mmu_l1tlb_ecc_corrected_err_count_array);
if (error)
dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
}
@@ -382,4 +397,14 @@ static void gr_gv11b_remove_sysfs(struct device *dev)
g->gr.gpc_count,
&g->ecc.gr.t19x.gpccs_corrected_err_count,
dev_attr_gpccs_ecc_corrected_err_count_array);
gp10b_ecc_stat_remove(dev,
g->gr.gpc_count,
&g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count,
dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array);
gp10b_ecc_stat_remove(dev,
g->gr.gpc_count,
&g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count,
dev_attr_mmu_l1tlb_ecc_corrected_err_count_array);
}

View File

@@ -3426,6 +3426,10 @@ static inline u32 gr_gpcs_gpccs_gpc_exception_en_gpccs_f(u32 v)
{
return (v & 0x1) << 14;
}
/* Field builder: places bit 0 of v at bit 15 (exception_en gpcmmu field). */
static inline u32 gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(u32 v)
{
	const u32 enable = v & 0x1U;

	return enable << 15U;
}
static inline u32 gr_gpc0_gpccs_gpc_exception_r(void)
{
return 0x00502c90;
@@ -3442,6 +3446,30 @@ static inline u32 gr_gpc0_gpccs_gpc_exception_tpc_0_pending_v(void)
{
return 0x00000001;
}
/* Field builder: places bit 0 of v at bit 14 (gpc_exception gpccs field). */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 14U;
}
/* Mask of the single-bit gpccs field (bit 14) of gpc_exception. */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_m(void)
{
	return 0x1U << 14U;
}
/* Field value for "gpccs pending": 0x4000, i.e. bit 14 set. */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_pending_f(void)
{
	const u32 pending = 0x00004000U;

	return pending;
}
/* Field builder: places bit 0 of v at bit 15 (gpc_exception gpcmmu field). */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpcmmu_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 15U;
}
/* Mask of the single-bit gpcmmu field (bit 15) of gpc_exception. */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpcmmu_m(void)
{
	return 0x1U << 15U;
}
/* Field value for "gpcmmu pending": 0x8000, i.e. bit 15 set. */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpcmmu_pending_f(void)
{
	const u32 pending = 0x00008000U;

	return pending;
}
static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_r(void)
{
return 0x00501048;
@@ -3498,18 +3526,6 @@ static inline u32 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_v(u32 r)
{
return (r >> 0) & 0xffff;
}
/* Field builder: places bit 0 of v at bit 14 (gpc_exception gpccs field). */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 14U;
}
/* Mask of the single-bit gpccs field (bit 14) of gpc_exception. */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_m(void)
{
	return 0x1U << 14U;
}
/* Field value for "gpccs pending": 0x4000, i.e. bit 14 set. */
static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_pending_f(void)
{
	const u32 pending = 0x00004000U;

	return pending;
}
static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_r(void)
{
return 0x00504508;
@@ -4014,6 +4030,182 @@ static inline u32 gr_gpcs_tc_debug0_limit_coalesce_buffer_size_m(void)
{
return 0x1ff << 0;
}
/* Register offset of the GPC0 GPCMMU global ESR: 0x00500324. */
static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_r(void)
{
	const u32 reg_offset = 0x00500324U;

	return reg_offset;
}
/* Field builder: ecc_corrected occupies bit 0, so only masking is needed. */
static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_f(u32 v)
{
	return v & 0x1U;
}
/* Mask of the single-bit ecc_corrected field (bit 0). */
static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m(void)
{
	return 0x1U;
}
/* Field builder: places bit 0 of v at bit 1 (ecc_uncorrected field). */
static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 1U;
}
/* Mask of the single-bit ecc_uncorrected field (bit 1). */
static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m(void)
{
	return 0x1U << 1U;
}
/* Register offset of the GPC0 MMU L1 TLB ECC status register: 0x00500314. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_r(void)
{
	const u32 reg_offset = 0x00500314U;

	return reg_offset;
}
/* Field builder: corrected_err_l1tlb_sa_data occupies bit 0; mask only. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_f(u32 v)
{
	return v & 0x1U;
}
/* Mask of the single-bit corrected_err_l1tlb_sa_data field (bit 0). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m(void)
{
	return 0x1U;
}
/* Field builder: places bit 0 of v at bit 2 (corrected_err_l1tlb_fa_data). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 2U;
}
/* Mask of the single-bit corrected_err_l1tlb_fa_data field (bit 2). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m(void)
{
	return 0x1U << 2U;
}
/* Field builder: places bit 0 of v at bit 1 (uncorrected_err_l1tlb_sa_data). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 1U;
}
/* Mask of the single-bit uncorrected_err_l1tlb_sa_data field (bit 1). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m(void)
{
	return 0x1U << 1U;
}
/* Field builder: places bit 0 of v at bit 3 (uncorrected_err_l1tlb_fa_data). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 3U;
}
/* Mask of the single-bit uncorrected_err_l1tlb_fa_data field (bit 3). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m(void)
{
	return 0x1U << 3U;
}
/* Field builder: places bit 0 of v at bit 18 (uncorrected total overflow). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 18U;
}
/* Mask of the single-bit uncorrected total-counter overflow flag (bit 18). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m(void)
{
	return 0x1U << 18U;
}
/* Field builder: places bit 0 of v at bit 16 (corrected total overflow). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 16U;
}
/* Mask of the single-bit corrected total-counter overflow flag (bit 16). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m(void)
{
	return 0x1U << 16U;
}
/* Field builder: places bit 0 of v at bit 19 (uncorrected unique overflow). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_unique_counter_overflow_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 19U;
}
/* Mask of the single-bit uncorrected unique-counter overflow flag (bit 19). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_unique_counter_overflow_m(void)
{
	return 0x1U << 19U;
}
/* Field builder: places bit 0 of v at bit 17 (corrected unique overflow). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_unique_counter_overflow_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 17U;
}
/* Mask of the single-bit corrected unique-counter overflow flag (bit 17). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_unique_counter_overflow_m(void)
{
	return 0x1U << 17U;
}
/* Field builder: places bit 0 of v at bit 30 (status reset field). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_reset_f(u32 v)
{
	const u32 flag = v & 0x1U;

	return flag << 30U;
}
/* Field value for the reset "task": 0x40000000, i.e. bit 30 set. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f(void)
{
	const u32 reset_task = 0x40000000U;

	return reset_task;
}
/* Register offset of the GPC0 MMU L1 TLB ECC address register: 0x00500320. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_address_r(void)
{
	const u32 reg_offset = 0x00500320U;

	return reg_offset;
}
/* Field builder: the index field spans all 32 bits starting at bit 0. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_address_index_f(u32 v)
{
	const u32 index_mask = 0xffffffffU;

	return v & index_mask;
}
/* Register offset of the L1 TLB corrected-error count register: 0x00500318. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r(void)
{
	const u32 reg_offset = 0x00500318U;

	return reg_offset;
}
/* Size in bits of the corrected-error "total" field: 16. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s(void)
{
	const u32 width_bits = 16U;

	return width_bits;
}
/* Field builder: "total" is the low 16 bits, so only masking is needed. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_f(u32 v)
{
	return v & 0xffffU;
}
/* Mask of the corrected-error "total" field (bits 15:0). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_m(void)
{
	return 0xffffU;
}
/* Extracts the corrected-error "total" field (bits 15:0) from register value r. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* Size in bits of the corrected-error "unique_total" field: 16. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_s(void)
{
	const u32 width_bits = 16U;

	return width_bits;
}
/* Field builder: places the low 16 bits of v at bits 31:16 (unique_total). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_f(u32 v)
{
	const u32 count = v & 0xffffU;

	return count << 16U;
}
/*
 * Mask of the corrected-error "unique_total" field (bits 31:16).
 *
 * The literal is unsigned on purpose: 0xffff is an int, and shifting it
 * left by 16 yields a value that does not fit in a 32-bit signed int,
 * which is undefined behaviour in C.
 */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_m(void)
{
	return 0xffffU << 16;
}
/* Extracts the corrected-error "unique_total" field (bits 31:16) from r. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_v(u32 r)
{
	const u32 upper_half = r >> 16U;

	return upper_half & 0xffffU;
}
/* Register offset of the L1 TLB uncorrected-error count register: 0x0050031c. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r(void)
{
	const u32 reg_offset = 0x0050031cU;

	return reg_offset;
}
/* Size in bits of the uncorrected-error "total" field: 16. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s(void)
{
	const u32 width_bits = 16U;

	return width_bits;
}
/* Field builder: "total" is the low 16 bits, so only masking is needed. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_f(u32 v)
{
	return v & 0xffffU;
}
/* Mask of the uncorrected-error "total" field (bits 15:0). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_m(void)
{
	return 0xffffU;
}
/* Extracts the uncorrected-error "total" field (bits 15:0) from r. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(u32 r)
{
	return r & 0xffffU;
}
/* Size in bits of the uncorrected-error "unique_total" field: 16. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_s(void)
{
	const u32 width_bits = 16U;

	return width_bits;
}
/* Field builder: places the low 16 bits of v at bits 31:16 (unique_total). */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_f(u32 v)
{
	const u32 count = v & 0xffffU;

	return count << 16U;
}
/*
 * Mask of the uncorrected-error "unique_total" field (bits 31:16).
 *
 * The literal is unsigned on purpose: 0xffff is an int, and shifting it
 * left by 16 yields a value that does not fit in a 32-bit signed int,
 * which is undefined behaviour in C.
 */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_m(void)
{
	return 0xffffU << 16;
}
/* Extracts the uncorrected-error "unique_total" field (bits 31:16) from r. */
static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_v(u32 r)
{
	const u32 upper_half = r >> 16U;

	return upper_half & 0xffffU;
}
static inline u32 gr_gpc0_gpccs_hww_esr_r(void)
{
return 0x00502c98;