gpu: nvgpu: gv11b: enable more gr exceptions

-Enable pd, scc, ds, ssync, mme and sked exceptions.
 This will be useful for debugging.
-Handle the newly enabled interrupts in gk20a_gr_isr().
-Add a gr op to handle ssync hww. For legacy
 chips, the ssync hww_esr register is gpcs_ppcs_ssync_hww_esr.
 Since ssync hww is not enabled on legacy chips, ssync
 hww exception handling is added for Volta only.
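
A minimal sketch of the resulting dispatch, mirroring the gk20a_gr_isr()
hunk below; chips that never install the op only get a log line:

	if (exception & gr_exception_ssync_m()) {
		if (g->ops.gr.handle_ssync_hww)
			need_reset |= g->ops.gr.handle_ssync_hww(g);
		else
			nvgpu_err(g, "unhandled ssync exception");
	}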

Change-Id: I63ba2eb51fa82e74832df26ee4cf3546458e5669
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1644751
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Seema Khowala, 2018-01-23 12:16:40 -08:00
committed by mobile promotions
parent 9beefc4551, commit 791ce6bd54
8 changed files with 203 additions and 19 deletions

View File

@@ -442,6 +442,7 @@ struct gpu_ops {
void (*dump_ctxsw_stats)(struct gk20a *g, struct vm_gk20a *vm,
struct nvgpu_gr_ctx *gr_ctx);
void (*fecs_host_int_enable)(struct gk20a *g);
+int (*handle_ssync_hww)(struct gk20a *g);
} gr;
struct {
void (*init_hw)(struct gk20a *g);
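
The op is a plain function pointer, so only the Volta hal tables assign it
(see the gv100 and gv11b hunks below). A hypothetical wiring fragment, with
"volta_ops" purely illustrative: members never named in a designated
initializer stay NULL, so a legacy table that omits .handle_ssync_hww makes
the common isr log instead of calling through a bad pointer.

	/* Hypothetical fragment; "volta_ops" is illustrative only. */
	static const struct gpu_ops volta_ops = {
		.gr = {
			.handle_ssync_hww = gr_gv11b_handle_ssync_hww,
		},
	};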

View File

@@ -5895,7 +5895,10 @@ int gk20a_gr_isr(struct gk20a *g)
if (exception & gr_exception_fe_m()) {
u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
nvgpu_err(g, "fe warning %08x", fe);
u32 info = gk20a_readl(g, gr_fe_hww_esr_info_r());
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
fe, info);
gk20a_writel(g, gr_fe_hww_esr_r(),
gr_fe_hww_esr_reset_active_f());
need_reset |= -EFAULT;
@@ -5903,12 +5906,67 @@ int gk20a_gr_isr(struct gk20a *g)
if (exception & gr_exception_memfmt_m()) {
u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r());
nvgpu_err(g, "memfmt exception %08x", memfmt);
nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
gk20a_writel(g, gr_memfmt_hww_esr_r(),
gr_memfmt_hww_esr_reset_active_f());
need_reset |= -EFAULT;
}
+if (exception & gr_exception_pd_m()) {
+u32 pd = gk20a_readl(g, gr_pd_hww_esr_r());
+nvgpu_err(g, "pd exception: esr 0x%08x", pd);
+gk20a_writel(g, gr_pd_hww_esr_r(),
+gr_pd_hww_esr_reset_active_f());
+need_reset |= -EFAULT;
+}
+if (exception & gr_exception_scc_m()) {
+u32 scc = gk20a_readl(g, gr_scc_hww_esr_r());
+nvgpu_err(g, "scc exception: esr 0x%08x", scc);
+gk20a_writel(g, gr_scc_hww_esr_r(),
+gr_scc_hww_esr_reset_active_f());
+need_reset |= -EFAULT;
+}
+if (exception & gr_exception_ds_m()) {
+u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
+nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
+gk20a_writel(g, gr_ds_hww_esr_r(),
+gr_ds_hww_esr_reset_task_f());
+need_reset |= -EFAULT;
+}
+if (exception & gr_exception_ssync_m()) {
+if (g->ops.gr.handle_ssync_hww)
+need_reset |= g->ops.gr.handle_ssync_hww(g);
+else
+nvgpu_err(g, "unhandled ssync exception");
+}
+if (exception & gr_exception_mme_m()) {
+u32 mme = gk20a_readl(g, gr_mme_hww_esr_r());
+u32 info = gk20a_readl(g, gr_mme_hww_esr_info_r());
+nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
+mme, info);
+gk20a_writel(g, gr_mme_hww_esr_r(),
+gr_mme_hww_esr_reset_active_f());
+need_reset |= -EFAULT;
+}
+if (exception & gr_exception_sked_m()) {
+u32 sked = gk20a_readl(g, gr_sked_hww_esr_r());
+nvgpu_err(g, "sked exception: esr 0x%08x", sked);
+gk20a_writel(g, gr_sked_hww_esr_r(),
+gr_sked_hww_esr_reset_active_f());
+need_reset |= -EFAULT;
+}
/* check if a gpc exception has occurred */
if (exception & gr_exception_gpc_m() && need_reset == 0) {
bool post_event = false;
@@ -5931,22 +5989,6 @@ int gk20a_gr_isr(struct gk20a *g)
}
}
-if (exception & gr_exception_ds_m()) {
-u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
-nvgpu_err(g, "ds exception %08x", ds);
-gk20a_writel(g, gr_ds_hww_esr_r(),
-gr_ds_hww_esr_reset_task_f());
-need_reset |= -EFAULT;
-}
-if (exception & gr_exception_sked_m()) {
-u32 sked = gk20a_readl(g, gr_sked_hww_esr_r());
-nvgpu_err(g, "sked exception %08x", sked);
-gk20a_writel(g, gr_sked_hww_esr_r(),
-gr_sked_hww_esr_reset_active_f());
-}
gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
gr_intr &= ~gr_intr_exception_pending_f();
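
Each new unit block above repeats the same four steps. A hypothetical
helper, not part of this change, states the pattern once:

	/* Hypothetical helper (not in the driver): read the unit's hww_esr,
	 * log it, write the reset field back to clear it, and return -EFAULT.
	 * Callers fold the result into need_reset with |=, which stays
	 * nonzero once any unit has faulted.
	 */
	static int handle_unit_hww(struct gk20a *g, const char *unit,
				   u32 esr_r, u32 reset_f)
	{
		u32 esr = gk20a_readl(g, esr_r);

		nvgpu_err(g, "%s exception: esr 0x%08x", unit, esr);
		gk20a_writel(g, esr_r, reset_f);
		return -EFAULT;
	}

With it, the pd block would reduce to a single line:
need_reset |= handle_unit_hww(g, "pd", gr_pd_hww_esr_r(), gr_pd_hww_esr_reset_active_f());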

View File

@@ -426,6 +426,7 @@ static const struct gpu_ops gv100_ops = {
gr_gv11b_handle_tpc_sm_ecc_exception,
.decode_egpc_addr = gv11b_gr_decode_egpc_addr,
.fecs_host_int_enable = gr_gv11b_fecs_host_int_enable,
+.handle_ssync_hww = gr_gv11b_handle_ssync_hww,
},
.fb = {
.reset = gv100_fb_reset,

View File

@@ -313,12 +313,32 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
void gr_gv11b_enable_hww_exceptions(struct gk20a *g)
{
/* enable exceptions */
gk20a_writel(g, gr_fe_hww_esr_r(),
gr_fe_hww_esr_en_enable_f() |
gr_fe_hww_esr_reset_active_f());
gk20a_writel(g, gr_memfmt_hww_esr_r(),
gr_memfmt_hww_esr_en_enable_f() |
gr_memfmt_hww_esr_reset_active_f());
+gk20a_writel(g, gr_pd_hww_esr_r(),
+gr_pd_hww_esr_en_enable_f() |
+gr_pd_hww_esr_reset_active_f());
+gk20a_writel(g, gr_scc_hww_esr_r(),
+gr_scc_hww_esr_en_enable_f() |
+gr_scc_hww_esr_reset_active_f());
+gk20a_writel(g, gr_ds_hww_esr_r(),
+gr_ds_hww_esr_en_enabled_f() |
+gr_ds_hww_esr_reset_task_f());
+gk20a_writel(g, gr_ssync_hww_esr_r(),
+gr_ssync_hww_esr_en_enable_f() |
+gr_ssync_hww_esr_reset_active_f());
+gk20a_writel(g, gr_mme_hww_esr_r(),
+gr_mme_hww_esr_en_enable_f() |
+gr_mme_hww_esr_reset_active_f());
+/* For now leave POR values */
+nvgpu_log(g, gpu_dbg_info, "gr_sked_hww_esr_en_r 0x%08x",
+gk20a_readl(g, gr_sked_hww_esr_en_r()));
}
void gr_gv11b_fecs_host_int_enable(struct gk20a *g)
@@ -351,8 +371,16 @@ void gr_gv11b_enable_exceptions(struct gk20a *g)
reg_val = gr_exception_en_fe_enabled_f() |
gr_exception_en_memfmt_enabled_f() |
+gr_exception_en_pd_enabled_f() |
+gr_exception_en_scc_enabled_f() |
gr_exception_en_ds_enabled_f() |
+gr_exception_en_ssync_enabled_f() |
+gr_exception_en_mme_enabled_f() |
+gr_exception_en_sked_enabled_f() |
gr_exception_en_gpc_enabled_f();
nvgpu_log(g, gpu_dbg_info, "gr_exception_en 0x%08x", reg_val);
gk20a_writel(g, gr_exception_en_r(), reg_val);
}
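
The *_enabled_f() values OR'd into reg_val sit at the same bit positions as
the gr_exception_*_m() masks tested in the isr. A standalone check of the
constants added in the hw headers below; builds with any C compiler:

	#include <assert.h>

	int main(void)
	{
		assert(0x4u   == (0x1u << 2));  /* pd    */
		assert(0x8u   == (0x1u << 3));  /* scc   */
		assert(0x20u  == (0x1u << 5));  /* ssync */
		assert(0x80u  == (0x1u << 7));  /* mme   */
		assert(0x100u == (0x1u << 8)); /* sked  */
		return 0;
	}
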
@@ -4246,3 +4274,13 @@ u32 gr_gv11b_get_gpcs_swdx_dss_zbc_z_format_reg(struct gk20a *g)
{
return gr_gpcs_swdx_dss_zbc_z_01_to_04_format_r();
}
+int gr_gv11b_handle_ssync_hww(struct gk20a *g)
+{
+u32 ssync = gk20a_readl(g, gr_ssync_hww_esr_r());
+nvgpu_err(g, "ssync exception: esr 0x%08x", ssync);
+gk20a_writel(g, gr_ssync_hww_esr_r(),
+gr_ssync_hww_esr_reset_active_f());
+return -EFAULT;
+}
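
For contrast with the commit message: a legacy-chip counterpart would have
to target gpcs_ppcs_ssync_hww_esr instead. A purely hypothetical sketch,
with accessor names assumed from that register name rather than taken from
a real header; it is deliberately not added, since ssync hww stays disabled
on those chips:

	static int gr_legacy_handle_ssync_hww(struct gk20a *g)
	{
		/* gr_gpcs_ppcs_ssync_hww_esr_* names are assumed, not real */
		u32 ssync = gk20a_readl(g, gr_gpcs_ppcs_ssync_hww_esr_r());

		nvgpu_err(g, "ssync exception: esr 0x%08x", ssync);
		gk20a_writel(g, gr_gpcs_ppcs_ssync_hww_esr_r(),
			     gr_gpcs_ppcs_ssync_hww_esr_reset_active_f());
		return -EFAULT;
	}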

View File

@@ -227,5 +227,6 @@ int gr_gv11b_set_ctxsw_preemption_mode(struct gk20a *g,
void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g,
struct channel_gk20a *ch_ctx,
struct nvgpu_mem *mem);
+int gr_gv11b_handle_ssync_hww(struct gk20a *g);
#endif

View File

@@ -399,6 +399,7 @@ static const struct gpu_ops gv11b_ops = {
.ecc_init_scrub_reg = gr_gv11b_ecc_init_scrub_reg,
.dump_ctxsw_stats = gr_gp10b_dump_ctxsw_stats,
.fecs_host_int_enable = gr_gv11b_fecs_host_int_enable,
+.handle_ssync_hww = gr_gv11b_handle_ssync_hww,
},
.fb = {
.reset = gv11b_fb_reset,

View File

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -196,6 +196,22 @@ static inline u32 gr_exception_sked_m(void)
{
return 0x1U << 8U;
}
+static inline u32 gr_exception_pd_m(void)
+{
+return 0x1U << 2U;
+}
+static inline u32 gr_exception_scc_m(void)
+{
+return 0x1U << 3U;
+}
+static inline u32 gr_exception_ssync_m(void)
+{
+return 0x1U << 5U;
+}
+static inline u32 gr_exception_mme_m(void)
+{
+return 0x1U << 7U;
+}
static inline u32 gr_exception1_r(void)
{
return 0x00400118U;
@@ -544,6 +560,10 @@ static inline u32 gr_fe_hww_esr_en_enable_f(void)
{
return 0x80000000U;
}
+static inline u32 gr_fe_hww_esr_info_r(void)
+{
+return 0x004041b0U;
+}
static inline u32 gr_fe_go_idle_timeout_r(void)
{
return 0x00404154U;
@@ -592,6 +612,10 @@ static inline u32 gr_mme_hww_esr_en_enable_f(void)
{
return 0x80000000U;
}
+static inline u32 gr_mme_hww_esr_info_r(void)
+{
+return 0x00404494U;
+}
static inline u32 gr_memfmt_hww_esr_r(void)
{
return 0x00404600U;
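
These generated accessors follow the nvgpu hw-header convention: _r()
returns a register offset, _m() a bit mask within the register, and _f() a
field value ready to be written. A usage sketch; gr_exception_r() is
assumed from the full header and is not part of this diff:

	u32 exception = gk20a_readl(g, gr_exception_r());

	if (exception & gr_exception_mme_m()) {	/* bit 7 */
		/* 0x00404494, the new info register */
		u32 info = gk20a_readl(g, gr_mme_hww_esr_info_r());

		nvgpu_err(g, "mme hww esr info 0x%08x", info);
	}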

View File

@@ -184,6 +184,22 @@ static inline u32 gr_exception_sked_m(void)
{
return 0x1U << 8U;
}
+static inline u32 gr_exception_pd_m(void)
+{
+return 0x1U << 2U;
+}
+static inline u32 gr_exception_scc_m(void)
+{
+return 0x1U << 3U;
+}
+static inline u32 gr_exception_ssync_m(void)
+{
+return 0x1U << 5U;
+}
+static inline u32 gr_exception_mme_m(void)
+{
+return 0x1U << 7U;
+}
static inline u32 gr_exception1_r(void)
{
return 0x00400118U;
@@ -232,6 +248,46 @@ static inline u32 gr_exception_en_ds_enabled_f(void)
{
return 0x10U;
}
+static inline u32 gr_exception_en_pd_m(void)
+{
+return 0x1U << 2U;
+}
+static inline u32 gr_exception_en_pd_enabled_f(void)
+{
+return 0x4U;
+}
+static inline u32 gr_exception_en_scc_m(void)
+{
+return 0x1U << 3U;
+}
+static inline u32 gr_exception_en_scc_enabled_f(void)
+{
+return 0x8U;
+}
+static inline u32 gr_exception_en_ssync_m(void)
+{
+return 0x1U << 5U;
+}
+static inline u32 gr_exception_en_ssync_enabled_f(void)
+{
+return 0x20U;
+}
+static inline u32 gr_exception_en_mme_m(void)
+{
+return 0x1U << 7U;
+}
+static inline u32 gr_exception_en_mme_enabled_f(void)
+{
+return 0x80U;
+}
+static inline u32 gr_exception_en_sked_m(void)
+{
+return 0x1U << 8U;
+}
+static inline u32 gr_exception_en_sked_enabled_f(void)
+{
+return 0x100U;
+}
static inline u32 gr_exception1_en_r(void)
{
return 0x00400130U;
@@ -1408,6 +1464,10 @@ static inline u32 gr_fe_hww_esr_en_enable_f(void)
{
return 0x80000000U;
}
+static inline u32 gr_fe_hww_esr_info_r(void)
+{
+return 0x004041b0U;
+}
static inline u32 gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r(void)
{
return 0x00419eacU;
@@ -1536,6 +1596,10 @@ static inline u32 gr_mme_hww_esr_en_enable_f(void)
{
return 0x80000000U;
}
+static inline u32 gr_mme_hww_esr_info_r(void)
+{
+return 0x00404494U;
+}
static inline u32 gr_memfmt_hww_esr_r(void)
{
return 0x00404600U;
@@ -2980,6 +3044,18 @@ static inline u32 gr_scc_hww_esr_en_enable_f(void)
{
return 0x80000000U;
}
+static inline u32 gr_ssync_hww_esr_r(void)
+{
+return 0x00405a14U;
+}
+static inline u32 gr_ssync_hww_esr_reset_active_f(void)
+{
+return 0x40000000U;
+}
+static inline u32 gr_ssync_hww_esr_en_enable_f(void)
+{
+return 0x80000000U;
+}
static inline u32 gr_sked_hww_esr_r(void)
{
return 0x00407020U;