gpu: nvgpu: gv11b: Add L1 tags parity support

This CL covers the following parity support (corrected + uncorrected), 1) SM's L1 tags 2) SM's S2R's pixel PRF buffer 3) SM's L1 D-cache miss latency FIFOs Volta Resiliency Id - Volta-720, Volta-721, Volta-637 JIRA GPUT19X-85 JIRA GPUT19X-104 JIRA GPUT19X-100 JIRA GPUT19X-103 Bug 1825948 Bug 1825962 Bug 1775457 Change-Id: I53d7231a36b2c7c252395eca27b349eca80dec63 Signed-off-by: Lakshmanan M <lm@nvidia.com> Reviewed-on: http://git-master/r/1478881 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 18:42:29 +03:00 · 2017-05-10 12:38:08 +05:30
parent 808af68d96
commit ffc37e50fa
4 changed files with 225 additions and 1 deletions
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -108,6 +108,89 @@ static bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num)
 	return valid;
 }

+static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
+			bool *post_event, struct channel_gk20a *fault_ch,
+			u32 *hww_global_esr)
+{
+	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
+	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
+	u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
+	u32 l1_tag_ecc_status, l1_tag_ecc_corrected_err_status = 0;
+	u32 l1_tag_ecc_uncorrected_err_status = 0;
+	u32 l1_tag_corrected_err_count_delta = 0;
+	u32 l1_tag_uncorrected_err_count_delta = 0;
+	bool is_l1_tag_ecc_corrected_total_err_overflow = 0;
+	bool is_l1_tag_ecc_uncorrected_total_err_overflow = 0;
+
+	/* Check for L1 tag ECC errors. */
+	l1_tag_ecc_status = gk20a_readl(g,
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset);
+	l1_tag_ecc_corrected_err_status = l1_tag_ecc_status &
+		(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
+		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m() |
+		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() |
+		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m());
+	l1_tag_ecc_uncorrected_err_status = l1_tag_ecc_status &
+		(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
+		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m() |
+		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m() |
+		 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m());
+
+	if ((l1_tag_ecc_corrected_err_status == 0) && (l1_tag_ecc_uncorrected_err_status == 0))
+		return 0;
+
+	l1_tag_corrected_err_count_delta =
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(
+			gk20a_readl(g,
+				gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() +
+				offset));
+	l1_tag_uncorrected_err_count_delta =
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(
+			gk20a_readl(g,
+				gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() +
+				offset));
+	is_l1_tag_ecc_corrected_total_err_overflow =
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(l1_tag_ecc_status);
+	is_l1_tag_ecc_uncorrected_total_err_overflow =
+		gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(l1_tag_ecc_status);
+
+	if ((l1_tag_corrected_err_count_delta > 0) || is_l1_tag_ecc_corrected_total_err_overflow) {
+		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
+			"corrected error (SBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]",
+			l1_tag_ecc_corrected_err_status, is_l1_tag_ecc_corrected_total_err_overflow);
+
+		/* HW uses 16-bits counter */
+		l1_tag_corrected_err_count_delta +=
+			(is_l1_tag_ecc_corrected_total_err_overflow <<
+			 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s());
+		g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters[tpc] +=
+							l1_tag_corrected_err_count_delta;
+		gk20a_writel(g,
+			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
+			0);
+	}
+	if ((l1_tag_uncorrected_err_count_delta > 0) || is_l1_tag_ecc_uncorrected_total_err_overflow) {
+		gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
+			"Uncorrected error (DBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]",
+			l1_tag_ecc_uncorrected_err_status, is_l1_tag_ecc_uncorrected_total_err_overflow);
+
+		/* HW uses 16-bits counter */
+		l1_tag_uncorrected_err_count_delta +=
+			(is_l1_tag_ecc_uncorrected_total_err_overflow <<
+			 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s());
+		g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count.counters[tpc] +=
+							l1_tag_uncorrected_err_count_delta;
+		gk20a_writel(g,
+			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
+			0);
+	}
+
+	gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset,
+			gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f());
+
+	return 0;
+
+}

 static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
 			bool *post_event, struct channel_gk20a *fault_ch,
@@ -118,7 +201,8 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
 			proj_tpc_in_gpc_stride_v() * tpc;
 	u32 lrf_ecc_status;

-	gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
+	/* Check for L1 tag ECC errors. */
+	gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);

 	/* Check for LRF ECC errors. */
 	lrf_ecc_status = gk20a_readl(g,
@@ -1692,6 +1776,7 @@ void gv11b_init_gr(struct gpu_ops *gops)
 	gops->gr.pre_process_sm_exception =
 		gr_gv11b_pre_process_sm_exception;
 	gops->gr.handle_fecs_error = gr_gv11b_handle_fecs_error;
+	gops->gr.create_gr_sysfs = gr_gv11b_create_sysfs;
 	gops->gr.setup_rop_mapping = gr_gv11b_setup_rop_mapping;
 	gops->gr.init_sw_veid_bundle = gr_gv11b_init_sw_veid_bundle;
 	gops->gr.program_zcull_mapping = gr_gv11b_program_zcull_mapping;
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -35,6 +35,13 @@ enum {
 	VOLTA_DMA_COPY_A        = 0xC3B5,
 };

+struct gr_t19x {
+	struct {
+		struct gr_gp10b_ecc_stat sm_l1_tag_corrected_err_count;
+		struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count;
+	} ecc_stats;
+};
+
 #define NVC397_SET_SHADER_EXCEPTIONS		0x1528
 #define NVC397_SET_CIRCULAR_BUFFER_SIZE 	0x1280
 #define NVC397_SET_ALPHA_CIRCULAR_BUFFER_SIZE 	0x02dc
@@ -48,4 +55,5 @@ int gr_gv11b_alloc_buffer(struct vm_gk20a *vm, size_t size,
 /*zcull*/
 void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
 					u32 *zcull_map_tiles);
+void gr_gv11b_create_sysfs(struct device *dev);
 #endif
--- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c
+++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c
@@ -27,11 +27,13 @@
 #include "tegra/linux/clk.h"

 #include "gp10b/platform_gp10b.h"
+#include "tegra/linux/platform_gp10b_tegra.h"

 #include "tegra/linux/platform_gk20a_tegra.h"
 #include "gr_gv11b.h"
 #include "nvgpu_gpuid_t19x.h"

+static void gr_gv11b_remove_sysfs(struct device *dev);

 static int gv11b_tegra_probe(struct device *dev)
 {
@@ -57,6 +59,15 @@ static int gv11b_tegra_probe(struct device *dev)
 	return 0;
 }

+static int gv11b_tegra_remove(struct device *dev)
+{
+	gp10b_tegra_remove(dev);
+
+	gr_gv11b_remove_sysfs(dev);
+
+	return 0;
+}
+
 static bool gv11b_tegra_is_railgated(struct device *dev)
 {
 	bool ret = false;
@@ -89,6 +100,7 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
 	.ptimer_src_freq	= 31250000,

 	.probe = gv11b_tegra_probe,
+	.remove = gv11b_tegra_remove,

 	/* power management callbacks */
 	.suspend = gv11b_tegra_suspend,
@@ -110,3 +122,50 @@ struct gk20a_platform t19x_gpu_tegra_platform = {
 	.reset_assert = gp10b_tegra_reset_assert,
 	.reset_deassert = gp10b_tegra_reset_deassert,
 };
+
+static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array;
+static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array;
+
+void gr_gv11b_create_sysfs(struct device *dev)
+{
+	struct gk20a *g = get_gk20a(dev);
+	int error = 0;
+	/* This stat creation function is called on GR init. GR can get
+       initialized multiple times but we only need to create the ECC
+       stats once. Therefore, add the following check to avoid
+       creating duplicate stat sysfs nodes. */
+	if (g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters != NULL)
+		return;
+
+	gr_gp10b_create_sysfs(dev);
+
+	error |= gr_gp10b_ecc_stat_create(dev,
+				0,
+				"sm_l1_tag_ecc_corrected_err_count",
+				&g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count,
+				dev_attr_sm_l1_tag_ecc_corrected_err_count_array);
+
+	error |= gr_gp10b_ecc_stat_create(dev,
+				0,
+				"sm_l1_tag_ecc_uncorrected_err_count",
+				&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
+				dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
+
+	if (error)
+		dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
+}
+
+static void gr_gv11b_remove_sysfs(struct device *dev)
+{
+	struct gk20a *g = get_gk20a(dev);
+
+	gr_gp10b_ecc_stat_remove(dev,
+			0,
+			&g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count,
+			dev_attr_sm_l1_tag_ecc_corrected_err_count_array);
+
+	gr_gp10b_ecc_stat_remove(dev,
+			0,
+			&g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count,
+			dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
+}
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
@@ -482,6 +482,78 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(void)
 {
 	return 0x00504358;
 }
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void)
+{
+	return 0x00504624;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m(void)
+{
+	return 0x1 << 0;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m(void)
+{
+	return 0x1 << 1;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m(void)
+{
+	return 0x1 << 2;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m(void)
+{
+	return 0x1 << 3;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m(void)
+{
+	return 0x1 << 4;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m(void)
+{
+	return 0x1 << 5;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m(void)
+{
+	return 0x1 << 6;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m(void)
+{
+	return 0x1 << 7;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(u32 r)
+{
+	return (r >> 8) & 0x1;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r)
+{
+	return (r >> 10) & 0x1;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f(void)
+{
+	return 0x40000000;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(void)
+{
+	return 0x00504628;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s(void)
+{
+	return 16;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(u32 r)
+{
+	return (r >> 0) & 0xffff;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(void)
+{
+	return 0x0050462c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s(void)
+{
+	return 16;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(u32 r)
+{
+	return (r >> 0) & 0xffff;
+}
 static inline u32 gr_pri_gpc0_tpc0_tex_m_routing_r(void)
 {
 	return 0x005042c4;