gpu: nvgpu: update SW quiesce

Update SW quiesce as follows:
- After waking up sw_quiesce_thread, nvgpu_sw_quiesce
  masks interrupts, then disables and preempts runlists
  without lock. There could still be a concurrent thread
  that would re-enable the runlist by accident. This is
  very unlikely and would mean we are not in mission mode
  anyway.
- In sw_quiesce_thread, wait NVGPU_SW_QUIESCE_TIMEOUT_MS,
  to leave some time for interrupt handler to set error
  notifier (in case of HW error interrupt). Then disable
  and preempt runlists, and set error notifier for remaining
  channels before exiting the process.

Also modified nvgpu_can_busy to return false in case
SW quiesce is pending. This will make subsequent
devctl calls fail.

Jira NVGPU-4512

Change-Id: I36dd554485f3b9b08f740f352f737ac4baa28746
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2266389
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Thomas Fleury
2019-12-19 11:11:56 -05:00
committed by Alex Waterman
parent fbafc9e05c
commit e0a6000456
8 changed files with 67 additions and 57 deletions

View File

@@ -1,7 +1,7 @@
/* /*
* FIFO * FIFO
* *
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -282,20 +282,11 @@ int nvgpu_fifo_suspend(struct gk20a *g)
#ifndef CONFIG_NVGPU_RECOVERY #ifndef CONFIG_NVGPU_RECOVERY
void nvgpu_fifo_sw_quiesce(struct gk20a *g) void nvgpu_fifo_sw_quiesce(struct gk20a *g)
{ {
u32 runlist_mask; u32 runlist_mask = U32_MAX;
nvgpu_runlist_lock_active_runlists(g);
/* Disable all runlists */
runlist_mask = nvgpu_runlist_get_runlists_mask(g,
0U, ID_TYPE_UNKNOWN, 0U, 0U);
g->ops.runlist.write_state(g, runlist_mask, RUNLIST_DISABLED); g->ops.runlist.write_state(g, runlist_mask, RUNLIST_DISABLED);
/* Preempt all runlists */ /* Preempt all runlists */
g->ops.fifo.preempt_runlists_for_rc(g, runlist_mask); g->ops.fifo.preempt_runlists_for_rc(g, runlist_mask);
nvgpu_channel_sw_quiesce(g);
nvgpu_runlist_unlock_active_runlists(g);
} }
#endif #endif

View File

@@ -72,15 +72,14 @@ static void gk20a_mask_interrupts(struct gk20a *g)
} }
#ifndef CONFIG_NVGPU_RECOVERY #ifndef CONFIG_NVGPU_RECOVERY
#define NVGPU_SW_QUIESCE_TIMEOUT_MS 50
static int nvgpu_sw_quiesce_thread(void *data) static int nvgpu_sw_quiesce_thread(void *data)
{ {
struct gk20a *g = data; struct gk20a *g = data;
int err = 0;
g->sw_quiesce_init_done = true; /* wait until SW quiesce is requested */
nvgpu_cond_signal(&g->sw_quiesce_cond);
/* wait until all SW quiesce is requested */
NVGPU_COND_WAIT(&g->sw_quiesce_cond, NVGPU_COND_WAIT(&g->sw_quiesce_cond,
g->sw_quiesce_pending || g->sw_quiesce_pending ||
nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U); nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U);
@@ -88,29 +87,17 @@ static int nvgpu_sw_quiesce_thread(void *data)
if (nvgpu_thread_should_stop(&g->sw_quiesce_thread)) { if (nvgpu_thread_should_stop(&g->sw_quiesce_thread)) {
goto done; goto done;
} }
nvgpu_wait_for_deferred_interrupts(g);
nvgpu_err(g, "sw quiesce in progress"); nvgpu_err(g, "SW quiesce thread running");
nvgpu_msleep(NVGPU_SW_QUIESCE_TIMEOUT_MS);
nvgpu_mutex_acquire(&g->power_lock);
if (nvgpu_is_powered_off(g) || g->is_virtual) {
err = -EINVAL;
goto idle;
}
nvgpu_start_gpu_idle(g);
nvgpu_disable_irqs(g);
gk20a_mask_interrupts(g);
nvgpu_fifo_sw_quiesce(g); nvgpu_fifo_sw_quiesce(g);
nvgpu_channel_sw_quiesce(g);
idle: nvgpu_bug_exit(1);
nvgpu_mutex_release(&g->power_lock);
nvgpu_err(g, "sw quiesce done, err=%d", err);
done: done:
nvgpu_log_info(g, "done"); nvgpu_log_info(g, "done");
return err; return 0;
} }
#endif #endif
@@ -142,8 +129,7 @@ static int nvgpu_sw_quiesce_init_support(struct gk20a *g)
return err; return err;
} }
/* wait until thread actually starts */ g->sw_quiesce_init_done = true;
NVGPU_COND_WAIT(&g->sw_quiesce_cond, g->sw_quiesce_init_done, 0U);
#endif #endif
return 0; return 0;
@@ -162,10 +148,23 @@ void nvgpu_sw_quiesce_remove_support(struct gk20a *g)
void nvgpu_sw_quiesce(struct gk20a *g) void nvgpu_sw_quiesce(struct gk20a *g)
{ {
#ifndef CONFIG_NVGPU_RECOVERY #ifdef CONFIG_NVGPU_RECOVERY
nvgpu_err(g, "SW quiesce not supported");
#else
if (g->is_virtual || (g->enabled_flags == NULL) || if (g->is_virtual || (g->enabled_flags == NULL) ||
nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) { nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) {
goto fail; nvgpu_err(g, "SW quiesce not supported");
return;
}
if (!g->sw_quiesce_init_done) {
nvgpu_err(g, "SW quiesce not initialized");
return;
}
if (g->sw_quiesce_pending) {
nvgpu_err(g, "SW quiesce already pending");
return;
} }
nvgpu_err(g, "SW quiesce requested"); nvgpu_err(g, "SW quiesce requested");
@@ -177,13 +176,10 @@ void nvgpu_sw_quiesce(struct gk20a *g)
*/ */
g->sw_quiesce_pending = true; g->sw_quiesce_pending = true;
nvgpu_cond_signal(&g->sw_quiesce_cond); nvgpu_cond_broadcast(&g->sw_quiesce_cond);
gk20a_mask_interrupts(g);
return; nvgpu_fifo_sw_quiesce(g);
fail:
#endif #endif
nvgpu_err(g, "sw quiesce not supported");
} }
/* init interface layer support for all falcons */ /* init interface layer support for all falcons */
@@ -709,6 +705,13 @@ int nvgpu_can_busy(struct gk20a *g)
/* Can't do anything if the system is rebooting/shutting down /* Can't do anything if the system is rebooting/shutting down
* or the driver is restarting * or the driver is restarting
*/ */
#ifndef CONFIG_NVGPU_RECOVERY
if (g->sw_quiesce_pending) {
return 0;
}
#endif
if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) || if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) ||
nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) { nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
return 0; return 0;

View File

@@ -109,7 +109,7 @@ nvgpu_bug_cb_from_node(struct nvgpu_list_node *node)
}; };
#ifdef __KERNEL__ #ifdef __KERNEL__
static inline void nvgpu_bug_exit(void) { } static inline void nvgpu_bug_exit(int status) { }
static inline void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) { } static inline void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) { }
static inline void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb) { } static inline void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb) { }
#endif #endif

View File

@@ -1,7 +1,7 @@
/* /*
* FIFO common definitions. * FIFO common definitions.
* *
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -511,15 +511,12 @@ int nvgpu_fifo_suspend(struct gk20a *g);
* *
* @param g [in] The GPU driver struct. * @param g [in] The GPU driver struct.
* *
* Gracefully put FIFO into a non-functioning state to ensure that no corrupted * Put FIFO into a non-functioning state to ensure that no corrupted
* work is completed because of the fault. This is because the freedom * work is completed because of the fault. This is because the freedom
* from interference may not always be shown between the faulted and * from interference may not always be shown between the faulted and
* the non-faulted TSG contexts. * the non-faulted TSG contexts.
* - Disable all runlists * - Disable all runlists
* - Preempt all runlists * - Preempt all runlists
* - Quiesce all channels
*
* @see nvgpu_channel_sw_quiesce
*/ */
void nvgpu_fifo_sw_quiesce(struct gk20a *g); void nvgpu_fifo_sw_quiesce(struct gk20a *g);
#endif #endif

View File

@@ -124,6 +124,16 @@ void nvgpu_bug_cb_longjmp(void *arg);
struct nvgpu_bug_cb; struct nvgpu_bug_cb;
/**
* @brief Exit current process
*
* @param status [in] Status to return
*
* This function is used during BUG() handling to exit
* current process.
*/
void nvgpu_bug_exit(int status);
/** /**
* @brief Register callback to be invoked on BUG() * @brief Register callback to be invoked on BUG()
* *

View File

@@ -89,6 +89,13 @@ static void nvgpu_bug_init(void)
bug.in_use = true; bug.in_use = true;
} }
void nvgpu_bug_exit(int status)
{
#ifndef __NVGPU_UNIT_TEST__
exit(status);
#endif
}
void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb)
{ {
(void) pthread_once(&bug.once, nvgpu_bug_init); (void) pthread_once(&bug.once, nvgpu_bug_init);

View File

@@ -28,6 +28,7 @@
#include <nvgpu/posix/posix-fault-injection.h> #include <nvgpu/posix/posix-fault-injection.h>
#include <nvgpu/posix/dma.h> #include <nvgpu/posix/dma.h>
#include <nvgpu/io.h> #include <nvgpu/io.h>
#include <nvgpu/runlist.h>
#include "hal/init/hal_gv11b.h" #include "hal/init/hal_gv11b.h"
#include "nvgpu/hw/gk20a/hw_fifo_gk20a.h" #include "nvgpu/hw/gk20a/hw_fifo_gk20a.h"
@@ -166,6 +167,7 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args)
u32 reg_val; u32 reg_val;
int ret = UNIT_FAIL; int ret = UNIT_FAIL;
int err; int err;
u32 runlist_mask;
err = test_fifo_setup_gv11b_reg_space(m, g); err = test_fifo_setup_gv11b_reg_space(m, g);
unit_assert(err == 0, goto done); unit_assert(err == 0, goto done);
@@ -176,9 +178,12 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args)
unit_assert(err == 0, goto done); unit_assert(err == 0, goto done);
#ifndef CONFIG_NVGPU_RECOVERY #ifndef CONFIG_NVGPU_RECOVERY
runlist_mask = nvgpu_runlist_get_runlists_mask(g, 0U,
ID_TYPE_UNKNOWN, 0U, 0U);
unit_assert(runlist_mask != 0U, goto done);
nvgpu_fifo_sw_quiesce(g); nvgpu_fifo_sw_quiesce(g);
reg_val = nvgpu_readl(g, fifo_sched_disable_r()); reg_val = nvgpu_readl(g, fifo_sched_disable_r());
unit_assert(reg_val == 3U, goto done); unit_assert((reg_val & runlist_mask) == runlist_mask, goto done);
#endif #endif
ret = UNIT_SUCCESS; ret = UNIT_SUCCESS;

View File

@@ -719,7 +719,6 @@ int test_quiesce(struct unit_module *m, struct gk20a *g, void *args)
nvgpu_set_power_state(g, NVGPU_STATE_POWERED_ON); nvgpu_set_power_state(g, NVGPU_STATE_POWERED_ON);
/* make sure we simulate interrupts enabled */ /* make sure we simulate interrupts enabled */
g->mc.irqs_enabled = true;
intr_masked = false; intr_masked = false;
/* setup HAL for masking interrupts */ /* setup HAL for masking interrupts */
@@ -740,10 +739,8 @@ int test_quiesce(struct unit_module *m, struct gk20a *g, void *args)
/* wait for quiesce thread to complete */ /* wait for quiesce thread to complete */
nvgpu_thread_join(&g->sw_quiesce_thread); nvgpu_thread_join(&g->sw_quiesce_thread);
if (!intr_masked) {
unit_err(m, "quiesce failed to mask interrupts\n");
if (g->mc.irqs_enabled || !intr_masked) {
unit_err(m, "quiesce failed to disable interrupts\n");
ret = UNIT_FAIL; ret = UNIT_FAIL;
} }