gpu: nvgpu: fix race for nvgpu_thread_stop

The pmu init thread typically returns immediately without calling nvgpu_thread_should_stop(). pmu_pg_kill_task() checks if the thread is running, and if it is, calls nvgpu_thread_stop(). However, there's a race condition where the init thread could have exited between the time that kill_task() checked the running flag and the time we actually stop the thread, leading to a kernel crash. Fix this by making the running flag in the nvgpu_thread struct atomic. Both the thread proxy function and the thread_stop() function will set the flag to false. In the case of nvgpu_thread_proxy(), if the flag is already false, then nvgpu_thread_stop() has already reset it, at which point we just wait for nvgpu_thread_should_stop() to return true. In the case of nvgpu_thread_stop(), if the flag is already false, then the thread proxy function has already exited, and there is nothing more to do. Bug 2591298 Change-Id: I9ba6b63c30a5c3e1df11e790094836b44373122b Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2230358 GVS: Gerrit_Virtual_Submit Reviewed-by: Thomas Fleury <tfleury@nvidia.com> Reviewed-by: Alex Waterman <alexw@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2019-11-01 15:28:08 -07:00
parent 44a28012de
commit c58029ad24
2 changed files with 26 additions and 7 deletions
--- a/drivers/gpu/nvgpu/include/nvgpu/linux/thread.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/linux/thread.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -17,11 +17,13 @@
 #ifndef __NVGPU_THREAD_LINUX_H__
 #define __NVGPU_THREAD_LINUX_H__

+#include <nvgpu/atomic.h>
+
 struct task_struct;

 struct nvgpu_thread {
 	struct task_struct *task;
-	bool running;
+	nvgpu_atomic_t running;
 	int (*fn)(void *);
 	void *data;
 };
--- a/drivers/gpu/nvgpu/os/linux/thread.c
+++ b/drivers/gpu/nvgpu/os/linux/thread.c
@@ -23,8 +23,19 @@ int nvgpu_thread_proxy(void *threaddata)
 {
 	struct nvgpu_thread *thread = threaddata;
 	int ret = thread->fn(thread->data);
+	bool was_running;

-	thread->running = false;
+	was_running = nvgpu_atomic_xchg(&thread->running, false);
+
+	/* if the thread was not running, then nvgpu_thread_stop() was
+	 * called, so just wait until we get the notification that we should
+	 * stop.
+	 */
+	if (!was_running) {
+		while (!nvgpu_thread_should_stop(thread)) {
+			nvgpu_usleep_range(5000, 5100);
+		}
+	}
 	return ret;
 }

@@ -40,15 +51,21 @@ int nvgpu_thread_create(struct nvgpu_thread *thread,
 	thread->task = task;
 	thread->fn = threadfn;
 	thread->data = data;
-	thread->running = true;
+	nvgpu_atomic_set(&thread->running, true);
 	wake_up_process(task);
 	return 0;
 };

 void nvgpu_thread_stop(struct nvgpu_thread *thread)
 {
+	bool was_running;
+
 	if (thread->task) {
-		kthread_stop(thread->task);
+		was_running = nvgpu_atomic_xchg(&thread->running, false);
+
+		if (was_running) {
+			kthread_stop(thread->task);
+		}
 		thread->task = NULL;
 	}
 };
@@ -72,12 +89,12 @@ bool nvgpu_thread_should_stop(struct nvgpu_thread *thread)

 bool nvgpu_thread_is_running(struct nvgpu_thread *thread)
 {
-	return ACCESS_ONCE(thread->running);
+	return nvgpu_atomic_read(&thread->running);
 };

 void nvgpu_thread_join(struct nvgpu_thread *thread)
 {
-	while (ACCESS_ONCE(thread->running)) {
+	while (nvgpu_atomic_read(&thread->running)) {
 		nvgpu_msleep(10);
 	}
 };