gpu: nvgpu: Fix semaphore race condition

A race condition existed in gk20a_channel_semaphore_wait_fd(). In some instances the semaphore underlying the sync_fence being waited on would have already signaled. This would cause the subsequent sync_fence_wait_async() call to return 1 and do nothing. Normally, the sync_fence_wait_async() call would release the newly created semaphore, but in the above case that would not happen, hanging any channel waiting on that semaphore.

To fix this problem, if sync_fence_wait_async() returns 1, immediately release the newly created semaphore.

Bug 1604892

Change-Id: I1f5e811695bb099f71b7762835aba4a7e27362ec
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/935910
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
2025-12-24 10:34:43 +03:00 · 2016-01-21 14:50:23 -08:00
parent aa74098f29
commit f7d219dd1c
3 changed files with 19 additions and 6 deletions
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -456,7 +456,7 @@ static int gk20a_channel_semaphore_wait_fd(
 	struct priv_cmd_entry *wait_cmd = NULL;
 	struct wait_fence_work *w;
 	int written;
-	int err;
+	int err, ret;
 	u64 va;

 	sync_fence = gk20a_sync_fence_fdget(fd);
@@ -490,8 +490,18 @@ static int gk20a_channel_semaphore_wait_fd(
 	va = gk20a_semaphore_gpu_va(w->sema, c->vm);
 	/* GPU unblocked when when the semaphore value becomes 1. */
 	written = add_sema_cmd(wait_cmd->ptr, va, 1, true, false);
+
 	WARN_ON(written != wait_cmd->size);
-	sync_fence_wait_async(sync_fence, &w->waiter);
+	ret = sync_fence_wait_async(sync_fence, &w->waiter);
+
+	/*
+	 * If the sync_fence has already signaled then the above async_wait
+	 * will never trigger. This causes the semaphore release op to never
+	 * happen which, in turn, hangs the GPU. That's bad. So let's just
+	 * do the semaphore_release right now.
+	 */
+	if (ret == 1)
+		gk20a_semaphore_release(w->sema);

 	/* XXX - this fixes an actual bug, we need to hold a ref to this
 	   semaphore while the job is in flight. */
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -154,7 +154,9 @@ struct gk20a_fence *gk20a_fence_from_semaphore(

 #ifdef CONFIG_SYNC
 	sync_fence = gk20a_sync_fence_create(timeline, semaphore,
-					     dependency, "fence");
+					     dependency, "f-gk20a-0x%04llx",
+					     ((u64)(void *)semaphore->value) &
+					     0xffff);
 	if (!sync_fence)
 		return NULL;
 #endif
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
@@ -47,8 +47,9 @@ struct gk20a_sync_pt {
 	ktime_t				dep_timestamp;

 	/*
-	 * A spinlock is necessary since there are times when this lock
-	 * will be acquired in interrupt context.
+	 * Use a spin lock here since it will have better performance
+	 * than a mutex - there should be very little contention on this
+	 * lock.
 	 */
 	spinlock_t			lock;
 };