gpu: nvgpu: add refcounting for ctxsw disable/enable

ctxsw disable could be called recursively for the RM server. Suspend
contexts disables ctxsw at the beginning, then calls tsg disable and
preempt. If the preempt times out, it goes to the recovery path, which
will try to disable ctxsw again. More details in Bug 200331110.

Jira VQRM-2982

Change-Id: I4659c842ae73ed59be51ae65b25366f24abcaf22
Signed-off-by: Shashank Singh <shashsingh@nvidia.com>
Signed-off-by: Richard Zhao <rizhao@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1671716
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Sourab Gupta <sourabg@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Shashank Singh
2017-12-28 11:52:14 +05:30
committed by mobile promotions
parent 02956adcd3
commit db089a73a5
4 changed files with 29 additions and 7 deletions

View File

@@ -62,6 +62,7 @@ static void nvgpu_init_vars(struct gk20a *g)
nvgpu_mutex_init(&g->client_lock);
nvgpu_mutex_init(&g->poweron_lock);
nvgpu_mutex_init(&g->poweroff_lock);
nvgpu_mutex_init(&g->ctxsw_disable_lock);
l->regs_saved = l->regs;
l->bar1_saved = l->bar1;

View File

@@ -76,6 +76,7 @@ static void vgpu_init_vars(struct gk20a *g, struct gk20a_platform *platform)
nvgpu_mutex_init(&g->poweron_lock);
nvgpu_mutex_init(&g->poweroff_lock);
nvgpu_mutex_init(&g->ctxsw_disable_lock);
l->regs_saved = l->regs;
l->bar1_saved = l->bar1;

View File

@@ -1191,6 +1191,9 @@ struct gk20a {
nvgpu_atomic_t usage_count;
struct nvgpu_mutex ctxsw_disable_lock;
int ctxsw_disable_count;
struct nvgpu_ref refcount;
const char *name;

View File

@@ -616,22 +616,39 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
.cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
}
/* Stop processing (stall) context switches at FECS.
* The caller must hold the dbg_sessions_lock, else if mutliple stop methods
* are sent to the ucode in sequence, it can get into an undefined state. */
/* Stop processing (stall) context switches at FECS. */
int gr_gk20a_disable_ctxsw(struct gk20a *g)
{
int err = 0;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
return gr_gk20a_ctrl_ctxsw(g,
nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
g->ctxsw_disable_count++;
if (g->ctxsw_disable_count == 1)
err = gr_gk20a_ctrl_ctxsw(g,
gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
nvgpu_mutex_release(&g->ctxsw_disable_lock);
return err;
}
/* Start processing (continue) context switches at FECS */
int gr_gk20a_enable_ctxsw(struct gk20a *g)
{
int err = 0;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
return gr_gk20a_ctrl_ctxsw(g,
nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
g->ctxsw_disable_count--;
WARN_ON(g->ctxsw_disable_count < 0);
if (g->ctxsw_disable_count == 0)
err = gr_gk20a_ctrl_ctxsw(g,
gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
nvgpu_mutex_release(&g->ctxsw_disable_lock);
return err;
}
int gr_gk20a_halt_pipe(struct gk20a *g)