gpu: nvgpu: disable elpg before ctxsw_disable

If FECS is sent the stop_ctxsw method, ELPG entry/exit cannot happen
and may time out. This could manifest as different error signatures
depending on when the stop_ctxsw FECS method gets sent with respect
to the PMU ELPG sequence. It could come as a PMU halt, an abort, or
possibly an ext error too.

If ctxsw fails to disable, do not read the engine info and just abort the TSG.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I5f3ba07663bcafd3f0083d44c603420b0ccf6945
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2014914
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Seema Khowala
2019-01-28 13:19:49 -08:00
committed by mobile promotions
parent 59bf4b39ff
commit 672e6bc31e
2 changed files with 75 additions and 11 deletions

View File

@@ -185,18 +185,45 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch)
/*
 * Recover a faulted TSG.
 *
 * ELPG is disabled before FECS is sent the stop_ctxsw method (inside
 * gr_gk20a_disable_ctxsw()): with stop_ctxsw pending, ELPG entry/exit
 * cannot complete and may time out, showing up as a PMU halt/abort or
 * ext error depending on timing.
 *
 * @g:       gk20a device state
 * @tsg:     TSG being recovered
 * @verbose: if true, dump debug state when marking the TSG error
 * @rc_type: recovery type forwarded to gk20a_fifo_recover()
 */
void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
			bool verbose, u32 rc_type)
{
	u32 engines_mask = 0U;
	int err;

	nvgpu_mutex_acquire(&g->dbg_sessions_lock);

	/* disable tsg so that it does not get scheduled again */
	g->ops.fifo.disable_tsg(tsg);

	/*
	 * stop context switching to prevent engine assignments from
	 * changing until engine status is checked to make sure tsg
	 * being recovered is not loaded on the engines
	 */
	err = gr_gk20a_disable_ctxsw(g);

	if (err != 0) {
		/* if failed to disable ctxsw, just abort tsg */
		nvgpu_err(g, "failed to disable ctxsw");
	} else {
		/* recover engines if tsg is loaded on the engines */
		engines_mask = g->ops.fifo.get_engines_mask_on_id(g,
				tsg->tsgid, true);

		/*
		 * it is ok to enable ctxsw before tsg is recovered. If
		 * engines_mask is 0, no engine recovery is needed and if it
		 * is non zero, gk20a_fifo_recover will call
		 * get_engines_mask_on_id again. By that time if tsg is not
		 * on the engine, engine need not be reset.
		 */
		err = gr_gk20a_enable_ctxsw(g);
		if (err != 0) {
			nvgpu_err(g, "failed to enable ctxsw");
		}
	}

	if (engines_mask != 0U) {
		gk20a_fifo_recover(g, engines_mask, tsg->tsgid, true, true,
			verbose, rc_type);
	} else {
		if (nvgpu_tsg_mark_error(g, tsg) && verbose) {
			gk20a_debug_dump(g);
		}

		gk20a_fifo_abort_tsg(g, tsg, false);
	}

	nvgpu_mutex_release(&g->dbg_sessions_lock);
}

View File

@@ -528,7 +528,14 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
.cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
}
/* Stop processing (stall) context switches at FECS. */
/**
* Stop processing (stall) context switches at FECS:-
* If fecs is sent stop_ctxsw method, elpg entry/exit cannot happen
* and may timeout. It could manifest as different error signatures
* depending on when stop_ctxsw fecs method gets sent with respect
* to pmu elpg sequence. It could come as pmu halt or abort or
* maybe ext error too.
*/
int gr_gk20a_disable_ctxsw(struct gk20a *g)
{
int err = 0;
@@ -538,8 +545,24 @@ int gr_gk20a_disable_ctxsw(struct gk20a *g)
nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
g->ctxsw_disable_count++;
if (g->ctxsw_disable_count == 1) {
err = gr_gk20a_ctrl_ctxsw(g,
err = nvgpu_pg_elpg_disable(g);
if (err != 0) {
nvgpu_err(g, "failed to disable elpg. not safe to "
"stop_ctxsw");
/* stop ctxsw command is not sent */
g->ctxsw_disable_count--;
} else {
err = gr_gk20a_ctrl_ctxsw(g,
gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
if (err != 0) {
nvgpu_err(g, "failed to stop fecs ctxsw");
/* stop ctxsw failed */
g->ctxsw_disable_count--;
}
}
} else {
nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
g->ctxsw_disable_count);
}
nvgpu_mutex_release(&g->ctxsw_disable_lock);
@@ -554,12 +577,27 @@ int gr_gk20a_enable_ctxsw(struct gk20a *g)
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
if (g->ctxsw_disable_count == 0) {
goto ctxsw_already_enabled;
}
g->ctxsw_disable_count--;
WARN_ON(g->ctxsw_disable_count < 0);
if (g->ctxsw_disable_count == 0) {
err = gr_gk20a_ctrl_ctxsw(g,
gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
if (err != 0) {
nvgpu_err(g, "failed to start fecs ctxsw");
} else {
if (nvgpu_pg_elpg_enable(g) != 0) {
nvgpu_err(g, "failed to enable elpg "
"after start_ctxsw");
}
}
} else {
nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
g->ctxsw_disable_count);
}
ctxsw_already_enabled:
nvgpu_mutex_release(&g->ctxsw_disable_lock);
return err;