From 672e6bc31e0e679d0bfc9d34b9a94134df4655a8 Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Mon, 28 Jan 2019 13:19:49 -0800
Subject: [PATCH] gpu: nvgpu: disable elpg before ctxsw_disable

if fecs is sent stop_ctxsw method, elpg entry/exit cannot happen
and may timeout. It could manifest as different error signatures
depending on when stop_ctxsw fecs method gets sent with respect
to pmu elpg sequence. It could come as pmu halt or abort or
maybe ext error too.

If ctxsw failed to disable, do not read engine info and just
abort tsg.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I5f3ba07663bcafd3f0083d44c603420b0ccf6945
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/2014914
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/fifo/tsg.c | 44 +++++++++++++++++++++++------
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c  | 42 +++++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c
index 984a7d23f..793bf94d3 100644
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -185,18 +185,45 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch)
 void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
 		bool verbose, u32 rc_type)
 {
-	u32 engines;
+	u32 engines_mask = 0U;
+	int err;
 
-	/* stop context switching to prevent engine assignments from
-	   changing until TSG is recovered */
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-	gr_gk20a_disable_ctxsw(g);
 
-	engines = g->ops.fifo.get_engines_mask_on_id(g, tsg->tsgid, true);
+	/* disable tsg so that it does not get scheduled again */
+	g->ops.fifo.disable_tsg(tsg);
 
-	if (engines != 0U) {
-		gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
-				rc_type);
+	/*
+	 * stop context switching to prevent engine assignments from
+	 * changing until engine status is checked to make sure tsg
+	 * being recovered is not loaded on the engines
+	 */
+	err = gr_gk20a_disable_ctxsw(g);
+
+	if (err != 0) {
+		/* if failed to disable ctxsw, just abort tsg */
+		nvgpu_err(g, "failed to disable ctxsw");
+	} else {
+		/* recover engines if tsg is loaded on the engines */
+		engines_mask = g->ops.fifo.get_engines_mask_on_id(g,
+				tsg->tsgid, true);
+
+		/*
+		 * it is ok to enable ctxsw before tsg is recovered. If engines
+		 * is 0, no engine recovery is needed and if it is non zero,
+		 * gk20a_fifo_recover will call get_engines_mask_on_id again.
+		 * By that time if tsg is not on the engine, engine need not
+		 * be reset.
+		 */
+		err = gr_gk20a_enable_ctxsw(g);
+		if (err != 0) {
+			nvgpu_err(g, "failed to enable ctxsw");
+		}
+	}
+
+	if (engines_mask != 0U) {
+		gk20a_fifo_recover(g, engines_mask, tsg->tsgid, true, true,
+				verbose, rc_type);
 	} else {
 		if (nvgpu_tsg_mark_error(g, tsg) && verbose) {
 			gk20a_debug_dump(g);
@@ -205,7 +232,6 @@ void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
 		gk20a_fifo_abort_tsg(g, tsg, false);
 	}
 
-	gr_gk20a_enable_ctxsw(g);
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 108155222..cc00c5fc7 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -528,7 +528,14 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
 		      .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
 }
 
-/* Stop processing (stall) context switches at FECS. */
+/**
+ * Stop processing (stall) context switches at FECS:-
+ * If fecs is sent stop_ctxsw method, elpg entry/exit cannot happen
+ * and may timeout. It could manifest as different error signatures
+ * depending on when stop_ctxsw fecs method gets sent with respect
+ * to pmu elpg sequence. It could come as pmu halt or abort or
+ * maybe ext error too.
+*/
 int gr_gk20a_disable_ctxsw(struct gk20a *g)
 {
 	int err = 0;
@@ -538,8 +545,24 @@ int gr_gk20a_disable_ctxsw(struct gk20a *g)
 	nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
 	g->ctxsw_disable_count++;
 	if (g->ctxsw_disable_count == 1) {
-		err = gr_gk20a_ctrl_ctxsw(g,
+		err = nvgpu_pg_elpg_disable(g);
+		if (err != 0) {
+			nvgpu_err(g, "failed to disable elpg. not safe to "
+					"stop_ctxsw");
+			/* stop ctxsw command is not sent */
+			g->ctxsw_disable_count--;
+		} else {
+			err = gr_gk20a_ctrl_ctxsw(g,
 				gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
+			if (err != 0) {
+				nvgpu_err(g, "failed to stop fecs ctxsw");
+				/* stop ctxsw failed */
+				g->ctxsw_disable_count--;
+			}
+		}
+	} else {
+		nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
+			g->ctxsw_disable_count);
 	}
 	nvgpu_mutex_release(&g->ctxsw_disable_lock);
 
@@ -554,12 +577,27 @@ int gr_gk20a_enable_ctxsw(struct gk20a *g)
 	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
 
 	nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
+	if (g->ctxsw_disable_count == 0) {
+		goto ctxsw_already_enabled;
+	}
 	g->ctxsw_disable_count--;
 	WARN_ON(g->ctxsw_disable_count < 0);
 	if (g->ctxsw_disable_count == 0) {
 		err = gr_gk20a_ctrl_ctxsw(g,
 				gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
+		if (err != 0) {
+			nvgpu_err(g, "failed to start fecs ctxsw");
+		} else {
+			if (nvgpu_pg_elpg_enable(g) != 0) {
+				nvgpu_err(g, "failed to enable elpg "
+					"after start_ctxsw");
+			}
+		}
+	} else {
+		nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
+			g->ctxsw_disable_count);
 	}
+ctxsw_already_enabled:
 	nvgpu_mutex_release(&g->ctxsw_disable_lock);
 
 	return err;
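
For readers tracing the control flow, the standalone C sketch below condenses the two orderings this patch enforces: ELPG is turned off before FECS is sent the stop_ctxsw method, turned back on only after start_ctxsw succeeds, and the recovery path reads engine state only when the stop itself worked (otherwise the TSG is simply aborted). All names in the sketch (elpg_disable, fecs_stop_ctxsw, engines_loaded_with_tsg, recover_tsg) are hypothetical stand-ins rather than the nvgpu API, and the dbg_sessions_lock / ctxsw_disable_lock mutexes and the real recovery and abort work are deliberately left out.

/*
 * Standalone sketch of the orderings the patch enforces; every function
 * below is a simplified stand-in invented for illustration, not the
 * nvgpu API, and all locking is omitted.
 */
#include <stdio.h>

static int ctxsw_disable_count;

/* Stand-in "hardware" hooks: return 0 on success. */
static int elpg_disable(void)     { puts("elpg off");      return 0; }
static int elpg_enable(void)      { puts("elpg on");       return 0; }
static int fecs_stop_ctxsw(void)  { puts("ctxsw stopped"); return 0; }
static int fecs_start_ctxsw(void) { puts("ctxsw started"); return 0; }
static unsigned int engines_loaded_with_tsg(void) { return 0x1U; }

/* Ordering #1: ELPG must be off before stop_ctxsw is sent to FECS. */
static int disable_ctxsw(void)
{
	int err;

	if (++ctxsw_disable_count != 1)
		return 0;               /* already stopped by another caller */

	err = elpg_disable();           /* must precede stop_ctxsw */
	if (err != 0) {
		ctxsw_disable_count--;  /* stop_ctxsw was never sent */
		return err;
	}

	err = fecs_stop_ctxsw();
	if (err != 0)
		ctxsw_disable_count--;  /* roll back on failure, as the patch does */
	return err;
}

/* ELPG comes back only after start_ctxsw has succeeded. */
static int enable_ctxsw(void)
{
	int err = 0;

	if (ctxsw_disable_count == 0)
		return 0;               /* nothing to undo */
	if (--ctxsw_disable_count != 0)
		return 0;               /* still held disabled by other callers */

	err = fecs_start_ctxsw();
	if (err == 0)
		err = elpg_enable();
	return err;
}

/*
 * Ordering #2: the recovery path queries engine state only while ctxsw
 * is actually stopped, and falls back to a plain abort if the stop failed.
 */
static void recover_tsg(void)
{
	unsigned int engines_mask = 0U;

	if (disable_ctxsw() != 0) {
		puts("ctxsw disable failed: skip engine query, abort tsg");
	} else {
		engines_mask = engines_loaded_with_tsg();
		enable_ctxsw();         /* safe before recovery proper */
	}

	if (engines_mask != 0U)
		puts("recover engines holding the tsg");
	else
		puts("abort tsg");
}

int main(void)
{
	recover_tsg();
	return 0;
}

As in the patch itself, the disable count is rolled back whenever a step fails so that a later enable call cannot unbalance the refcount or re-enable ELPG when FECS never actually stopped context switching.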