gpu: nvgpu: use inplace allocation in sync framework

This change is the first of a series of changes to
support the use of pre-allocated job tracking resources
in the submit path. With this change, we still maintain a
dynamically-allocated joblist, but make the necessary changes
in the channel_sync & fence frameworks to use in-place
allocations. Specifically, we:

1) Update channel sync framework routines to take in
pre-allocated priv_cmd_entry(s) & gk20a_fence(s) rather
than dynamically allocating them internally

2) Move allocation of priv_cmd_entry(s) & gk20a_fence(s)
to gk20a_submit_prepare_syncs

3) Modify fence framework to have separate allocation
and init APIs. We expose allocation as a separate API, so
the client can allocate the object before passing it into
the channel sync framework (a condensed sketch of the
resulting flow follows this list).

4) Fix clean_up logic in channel sync framework
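
To illustrate, a condensed sketch of the incr side of the new flow,
distilled from gk20a_submit_prepare_syncs() in the diff below. Error
paths are abbreviated and the surrounding function is hypothetical;
c and need_sync_fence are as in the real submit code:

    struct channel_gk20a_job *job;
    int err;

    /* job tracking struct is still dynamically allocated for now */
    job = channel_gk20a_alloc_job(c);
    if (!job)
            return -ENOMEM;

    /* pre-allocate tracking objects before calling into channel_sync */
    job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
    job->post_fence = gk20a_alloc_fence(c);  /* allocation only, no init */
    if (!job->incr_cmd || !job->post_fence) {
            err = -ENOMEM;
            goto clean_up;
    }

    /* channel_sync fills the pre-allocated entry & fence in place;
     * gk20a_fence_from_syncpt()/_from_semaphore() now init the given
     * fence and publish it via f->valid + wmb() instead of allocating */
    err = c->sync->incr(c->sync, job->incr_cmd, job->post_fence,
                        need_sync_fence);
    if (err)
            goto clean_up;

    return 0;

    clean_up:
    /* the caller owns the objects, so cleanup stays with the caller */
    gk20a_free_priv_cmdbuf(c, job->incr_cmd);
    gk20a_fence_put(job->post_fence);
    channel_gk20a_free_job(c, job);
    return err;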

Bug 1795076

Change-Id: I96db457683cd207fd029c31c45f548f98055e844
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1206725
(cherry picked from commit 9d196fd10db6c2f934c2a53b1fc0500eb4626624)
Reviewed-on: http://git-master/r/1223933
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

@@ -1376,16 +1376,20 @@ static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c)
/* allocate a cmd buffer with given size. size is number of u32 entries */
int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
struct priv_cmd_entry **entry)
struct priv_cmd_entry *e)
{
struct priv_cmd_queue *q = &c->priv_cmd_q;
struct priv_cmd_entry *e;
u32 free_count;
u32 size = orig_size;
gk20a_dbg_fn("size %d", orig_size);
*entry = NULL;
if (!e) {
gk20a_err(dev_from_gk20a(c->g),
"ch %d: priv cmd entry is null",
c->hw_chid);
return -EINVAL;
}
/* if free space in the end is less than requested, increase the size
* to make the real allocated space start from beginning. */
@@ -1400,14 +1404,6 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
if (size > free_count)
return -EAGAIN;
e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
if (!e) {
gk20a_err(dev_from_gk20a(c->g),
"ch %d: fail to allocate priv cmd entry",
c->hw_chid);
return -ENOMEM;
}
e->size = orig_size;
e->mem = &q->mem;
@@ -1426,8 +1422,10 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
/* we already handled q->put + size > q->size so BUG_ON this */
BUG_ON(q->put > q->size);
*entry = e;
/* commit the previous writes before making the entry valid */
wmb();
e->valid = true;
gk20a_dbg_fn("done");
return 0;
@@ -1441,6 +1439,21 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
kfree(e);
}
static struct channel_gk20a_job *channel_gk20a_alloc_job(
struct channel_gk20a *c)
{
struct channel_gk20a_job *job = NULL;
job = kzalloc(sizeof(*job), GFP_KERNEL);
return job;
}
static void channel_gk20a_free_job(struct channel_gk20a *c,
struct channel_gk20a_job *job)
{
kfree(job);
}
int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_alloc_gpfifo_args *args)
{
@@ -1818,10 +1831,15 @@ int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e)
if (!e)
return 0;
if ((q->get != e->off) && e->off != 0)
gk20a_err(d, "requests out-of-order, ch=%d\n", c->hw_chid);
if (e->valid) {
/* read the entry's valid flag before reading its contents */
rmb();
if ((q->get != e->off) && e->off != 0)
gk20a_err(d, "requests out-of-order, ch=%d\n",
c->hw_chid);
q->get = e->off + e->size;
}
q->get = e->off + e->size;
free_priv_cmdbuf(c, e);
return 0;
@@ -1854,14 +1872,10 @@ static void gk20a_channel_cancel_job_clean_up(struct channel_gk20a *c,
}
static int gk20a_channel_add_job(struct channel_gk20a *c,
struct gk20a_fence *pre_fence,
struct gk20a_fence *post_fence,
struct priv_cmd_entry *wait_cmd,
struct priv_cmd_entry *incr_cmd,
struct channel_gk20a_job *job,
bool skip_buffer_refcounting)
{
struct vm_gk20a *vm = c->vm;
struct channel_gk20a_job *job = NULL;
struct mapped_buffer_node **mapped_buffers = NULL;
int err = 0, num_mapped_buffers = 0;
@@ -1875,22 +1889,12 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
goto err_put_vm;
}
job = kzalloc(sizeof(*job), GFP_KERNEL);
if (!job) {
err = -ENOMEM;
goto err_put_buffers;
}
/* put() is done in gk20a_channel_update() when the job is done */
c = gk20a_channel_get(c);
if (c) {
job->num_mapped_buffers = num_mapped_buffers;
job->mapped_buffers = mapped_buffers;
job->pre_fence = pre_fence;
job->post_fence = post_fence;
job->wait_cmd = wait_cmd;
job->incr_cmd = incr_cmd;
gk20a_channel_timeout_start(c, job);
@@ -1899,13 +1903,11 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
spin_unlock(&c->jobs_lock);
} else {
err = -ETIMEDOUT;
goto err_free_job;
goto err_put_buffers;
}
return 0;
err_free_job:
kfree(job);
err_put_buffers:
gk20a_vm_put_buffers(vm, mapped_buffers, num_mapped_buffers);
err_put_vm:
@@ -2000,7 +2002,7 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
list_del_init(&job->list);
spin_unlock(&c->jobs_lock);
kfree(job);
channel_gk20a_free_job(c, job);
job_finished = 1;
gk20a_idle(g->dev);
}
@@ -2143,6 +2145,7 @@ out:
*/
static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
struct nvgpu_fence *fence,
struct channel_gk20a_job *job,
struct priv_cmd_entry **wait_cmd,
struct priv_cmd_entry **incr_cmd,
struct gk20a_fence **pre_fence,
@@ -2194,18 +2197,32 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
* this condition.
*/
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
job->wait_cmd = kzalloc(sizeof(struct priv_cmd_entry),
GFP_KERNEL);
job->pre_fence = gk20a_alloc_fence(c);
if (!job->wait_cmd || !job->pre_fence) {
err = -ENOMEM;
goto clean_up_pre_fence;
}
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
wait_fence_fd = fence->id;
err = c->sync->wait_fd(c->sync, wait_fence_fd,
wait_cmd, pre_fence);
job->wait_cmd, job->pre_fence);
} else {
err = c->sync->wait_syncpt(c->sync, fence->id,
fence->value, wait_cmd,
pre_fence);
fence->value, job->wait_cmd,
job->pre_fence);
}
if (!err) {
if (job->wait_cmd->valid)
*wait_cmd = job->wait_cmd;
*pre_fence = job->pre_fence;
} else
goto clean_up_pre_fence;
}
if (err)
goto fail;
if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
@@ -2216,22 +2233,41 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
* is used to keep track of method completion for idle railgating. The
* sync_pt/semaphore PB is added to the GPFIFO later on in submit.
*/
job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
job->post_fence = gk20a_alloc_fence(c);
if (!job->incr_cmd || !job->post_fence) {
err = -ENOMEM;
goto clean_up_post_fence;
}
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
err = c->sync->incr_user(c->sync, wait_fence_fd, incr_cmd,
post_fence, need_wfi, need_sync_fence);
err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
job->post_fence, need_wfi, need_sync_fence);
else
err = c->sync->incr(c->sync, incr_cmd,
post_fence, need_sync_fence);
if (err)
goto fail;
err = c->sync->incr(c->sync, job->incr_cmd,
job->post_fence, need_sync_fence);
if (!err) {
*incr_cmd = job->incr_cmd;
*post_fence = job->post_fence;
} else
goto clean_up_post_fence;
return 0;
clean_up_post_fence:
gk20a_free_priv_cmdbuf(c, job->incr_cmd);
gk20a_fence_put(job->post_fence);
job->incr_cmd = NULL;
job->post_fence = NULL;
clean_up_pre_fence:
gk20a_free_priv_cmdbuf(c, job->wait_cmd);
gk20a_fence_put(job->pre_fence);
job->wait_cmd = NULL;
job->pre_fence = NULL;
*wait_cmd = NULL;
*pre_fence = NULL;
fail:
/*
* Cleanup is handled by gk20a_submit_channel_gpfifo() since it is the
* real owner of the objects we make here.
*/
return err;
}
@@ -2250,6 +2286,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
struct priv_cmd_entry *incr_cmd = NULL;
struct gk20a_fence *pre_fence = NULL;
struct gk20a_fence *post_fence = NULL;
struct channel_gk20a_job *job = NULL;
/* we might need two extra gpfifo entries - one for pre fence
* and one for post fence. */
const int extra_entries = 2;
@@ -2351,11 +2388,18 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
}
if (need_job_tracking) {
err = gk20a_submit_prepare_syncs(c, fence, &wait_cmd, &incr_cmd,
job = channel_gk20a_alloc_job(c);
if (!job) {
err = -ENOMEM;
goto clean_up;
}
err = gk20a_submit_prepare_syncs(c, fence, job,
&wait_cmd, &incr_cmd,
&pre_fence, &post_fence,
force_need_sync_fence, flags);
if (err)
goto clean_up;
goto clean_up_job;
}
if (wait_cmd)
@@ -2365,7 +2409,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
num_entries);
if (err)
goto clean_up;
goto clean_up_job;
/*
* And here's where we add the incr_cmd we generated earlier. It should
@@ -2379,9 +2423,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
if (need_job_tracking)
/* TODO! Check for errors... */
gk20a_channel_add_job(c, pre_fence, post_fence,
wait_cmd, incr_cmd,
skip_buffer_refcounting);
gk20a_channel_add_job(c, job, skip_buffer_refcounting);
g->ops.fifo.userd_gp_put(g, c);
@@ -2398,10 +2440,10 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
gk20a_dbg_fn("done");
return err;
clean_up_job:
channel_gk20a_free_job(c, job);
clean_up:
gk20a_dbg_fn("fail");
free_priv_cmdbuf(c, wait_cmd);
free_priv_cmdbuf(c, incr_cmd);
gk20a_fence_put(pre_fence);
gk20a_fence_put(post_fence);
if (need_job_tracking)

@@ -218,7 +218,7 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch);
void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error);
void gk20a_channel_semaphore_wakeup(struct gk20a *g, bool post_events);
int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 size,
struct priv_cmd_entry **entry);
struct priv_cmd_entry *entry);
int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e);
int gk20a_enable_channel_tsg(struct gk20a *g, struct channel_gk20a *ch);

@@ -57,12 +57,11 @@ static void add_wait_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, u32 off,
}
static int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s,
u32 id, u32 thresh, struct priv_cmd_entry **entry,
struct gk20a_fence **fence)
u32 id, u32 thresh, struct priv_cmd_entry *wait_cmd,
struct gk20a_fence *fence)
{
struct gk20a_channel_syncpt *sp =
container_of(s, struct gk20a_channel_syncpt, ops);
struct priv_cmd_entry *wait_cmd = NULL;
struct channel_gk20a *c = sp->c;
int err = 0;
@@ -75,7 +74,7 @@ static int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s,
if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev, id, thresh))
return 0;
err = gk20a_channel_alloc_priv_cmdbuf(c, 4, &wait_cmd);
err = gk20a_channel_alloc_priv_cmdbuf(c, 4, wait_cmd);
if (err) {
gk20a_err(dev_from_gk20a(c->g),
"not enough priv cmd buffer space");
@@ -84,21 +83,18 @@ static int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s,
add_wait_cmd(c->g, wait_cmd, 0, id, thresh);
*entry = wait_cmd;
*fence = NULL;
return 0;
}
static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence)
struct priv_cmd_entry *wait_cmd,
struct gk20a_fence *fence)
{
#ifdef CONFIG_SYNC
int i;
int num_wait_cmds;
struct sync_fence *sync_fence;
struct sync_pt *pt;
struct priv_cmd_entry *wait_cmd = NULL;
struct gk20a_channel_syncpt *sp =
container_of(s, struct gk20a_channel_syncpt, ops);
struct channel_gk20a *c = sp->c;
@@ -134,7 +130,7 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
return 0;
}
err = gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, &wait_cmd);
err = gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, wait_cmd);
if (err) {
gk20a_err(dev_from_gk20a(c->g),
"not enough priv cmd buffer space");
@@ -172,8 +168,6 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
WARN_ON(i != num_wait_cmds);
sync_fence_put(sync_fence);
*entry = wait_cmd;
*fence = NULL;
return 0;
#else
return -ENODEV;
@@ -193,15 +187,14 @@ static void gk20a_channel_syncpt_update(void *priv, int nr_completed)
static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
bool wfi_cmd,
bool register_irq,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *incr_cmd,
struct gk20a_fence *fence,
bool need_sync_fence)
{
u32 thresh;
int incr_cmd_size;
int off;
int err;
struct priv_cmd_entry *incr_cmd = NULL;
struct gk20a_channel_syncpt *sp =
container_of(s, struct gk20a_channel_syncpt, ops);
struct channel_gk20a *c = sp->c;
@@ -210,7 +203,7 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
if (wfi_cmd)
incr_cmd_size += 2;
err = gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, &incr_cmd);
err = gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, incr_cmd);
if (err)
return err;
@@ -267,15 +260,21 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
}
}
*fence = gk20a_fence_from_syncpt(sp->host1x_pdev, sp->id, thresh,
err = gk20a_fence_from_syncpt(fence, sp->host1x_pdev, sp->id, thresh,
wfi_cmd, need_sync_fence);
*entry = incr_cmd;
if (err)
goto clean_up_priv_cmd;
return 0;
clean_up_priv_cmd:
gk20a_free_priv_cmdbuf(c, incr_cmd);
return err;
}
static int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence)
struct priv_cmd_entry *entry,
struct gk20a_fence *fence)
{
return __gk20a_channel_syncpt_incr(s,
true /* wfi */,
@@ -284,8 +283,8 @@ static int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
}
static int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *entry,
struct gk20a_fence *fence,
bool need_sync_fence)
{
/* Don't put wfi cmd to this one since we're not returning
@@ -298,8 +297,8 @@ static int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
static int gk20a_channel_syncpt_incr_user(struct gk20a_channel_sync *s,
int wait_fence_fd,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *entry,
struct gk20a_fence *fence,
bool wfi,
bool need_sync_fence)
{
@@ -500,8 +499,8 @@ static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
static int gk20a_channel_semaphore_wait_syncpt(
struct gk20a_channel_sync *s, u32 id,
u32 thresh, struct priv_cmd_entry **entry,
struct gk20a_fence **fence)
u32 thresh, struct priv_cmd_entry *entry,
struct gk20a_fence *fence)
{
struct gk20a_channel_semaphore *sema =
container_of(s, struct gk20a_channel_semaphore, ops);
@@ -525,7 +524,7 @@ static int gk20a_channel_semaphore_wait_syncpt(
*/
static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
struct sync_fence *fence,
struct priv_cmd_entry **wait_cmd,
struct priv_cmd_entry *wait_cmd,
struct gk20a_semaphore **fp_sema)
{
struct gk20a_semaphore *sema;
@@ -551,7 +550,7 @@ static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
gk20a_semaphore_get(sema);
BUG_ON(!atomic_read(&sema->value));
add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);
/*
* Make sure that gk20a_channel_semaphore_wait_fd() can create another
@@ -565,8 +564,8 @@ static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
static int gk20a_channel_semaphore_wait_fd(
struct gk20a_channel_sync *s, int fd,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence)
struct priv_cmd_entry *entry,
struct gk20a_fence *fence)
{
struct gk20a_channel_semaphore *sema =
container_of(s, struct gk20a_channel_semaphore, ops);
@@ -574,7 +573,7 @@ static int gk20a_channel_semaphore_wait_fd(
#ifdef CONFIG_SYNC
struct gk20a_semaphore *fp_sema;
struct sync_fence *sync_fence;
struct priv_cmd_entry *wait_cmd = NULL;
struct priv_cmd_entry *wait_cmd = entry;
struct wait_fence_work *w = NULL;
int err, ret, status;
@@ -582,19 +581,24 @@ static int gk20a_channel_semaphore_wait_fd(
if (!sync_fence)
return -EINVAL;
ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
if (ret == 0) {
if (fp_sema)
*fence = gk20a_fence_from_semaphore(sema->timeline,
fp_sema,
&c->semaphore_wq,
NULL, false, false);
else
if (fp_sema) {
err = gk20a_fence_from_semaphore(fence,
sema->timeline,
fp_sema,
&c->semaphore_wq,
NULL, false, false);
if (err) {
gk20a_semaphore_put(fp_sema);
goto clean_up_priv_cmd;
}
} else
/*
* Allocate an empty fence. It will instantly return
* Init an empty fence. It will instantly return
* from gk20a_fence_wait().
*/
*fence = gk20a_alloc_fence(NULL, NULL, false);
gk20a_init_fence(fence, NULL, NULL, false);
sync_fence_put(sync_fence);
goto skip_slow_path;
@@ -611,18 +615,17 @@ static int gk20a_channel_semaphore_wait_fd(
goto skip_slow_path;
}
err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
if (err) {
gk20a_err(dev_from_gk20a(c->g),
"not enough priv cmd buffer space");
sync_fence_put(sync_fence);
return -ENOMEM;
goto clean_up_sync_fence;
}
w = kzalloc(sizeof(*w), GFP_KERNEL);
if (!w) {
err = -ENOMEM;
goto fail_free_cmdbuf;
goto clean_up_priv_cmd;
}
sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
@@ -631,7 +634,7 @@ static int gk20a_channel_semaphore_wait_fd(
if (!w->sema) {
gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
err = -ENOMEM;
goto fail_free_worker;
goto clean_up_worker;
}
/* worker takes one reference */
@@ -641,6 +644,16 @@ static int gk20a_channel_semaphore_wait_fd(
/* GPU unblocked when the semaphore value increments. */
add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
/*
* We need to create the fence before adding the waiter to ensure
* that we properly clean up in the event the sync_fence has
* already signaled
*/
err = gk20a_fence_from_semaphore(fence, sema->timeline, w->sema,
&c->semaphore_wq, NULL, false, false);
if (err)
goto clean_up_sema;
ret = sync_fence_wait_async(sync_fence, &w->waiter);
/*
@@ -655,24 +668,22 @@ static int gk20a_channel_semaphore_wait_fd(
gk20a_semaphore_put(w->sema);
}
/* XXX - this fixes an actual bug, we need to hold a ref to this
semaphore while the job is in flight. */
*fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
&c->semaphore_wq,
NULL, false, false);
skip_slow_path:
*entry = wait_cmd;
return 0;
fail_free_worker:
if (w && w->sema)
gk20a_semaphore_put(w->sema);
clean_up_sema:
/*
* Release the refs to the semaphore, including
* the one for the worker since it will never run.
*/
gk20a_semaphore_put(w->sema);
gk20a_semaphore_put(w->sema);
clean_up_worker:
kfree(w);
clean_up_priv_cmd:
gk20a_free_priv_cmdbuf(c, entry);
clean_up_sync_fence:
sync_fence_put(sync_fence);
fail_free_cmdbuf:
if (wait_cmd)
gk20a_free_priv_cmdbuf(c, wait_cmd);
return err;
#else
gk20a_err(dev_from_gk20a(c->g),
@@ -684,12 +695,11 @@ fail_free_cmdbuf:
static int __gk20a_channel_semaphore_incr(
struct gk20a_channel_sync *s, bool wfi_cmd,
struct sync_fence *dependency,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *incr_cmd,
struct gk20a_fence *fence,
bool need_sync_fence)
{
int incr_cmd_size;
struct priv_cmd_entry *incr_cmd = NULL;
struct gk20a_channel_semaphore *sp =
container_of(s, struct gk20a_channel_semaphore, ops);
struct channel_gk20a *c = sp->c;
@@ -704,29 +714,37 @@ static int __gk20a_channel_semaphore_incr(
}
incr_cmd_size = 10;
err = gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, &incr_cmd);
err = gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, incr_cmd);
if (err) {
gk20a_err(dev_from_gk20a(c->g),
"not enough priv cmd buffer space");
gk20a_semaphore_put(semaphore);
return err;
goto clean_up_sema;
}
/* Release the completion semaphore. */
add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
*fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
&c->semaphore_wq,
dependency, wfi_cmd,
need_sync_fence);
*entry = incr_cmd;
err = gk20a_fence_from_semaphore(fence,
sp->timeline, semaphore,
&c->semaphore_wq,
dependency, wfi_cmd,
need_sync_fence);
if (err)
goto clean_up_priv_cmd;
return 0;
clean_up_priv_cmd:
gk20a_free_priv_cmdbuf(c, incr_cmd);
clean_up_sema:
gk20a_semaphore_put(semaphore);
return err;
}
static int gk20a_channel_semaphore_incr_wfi(
struct gk20a_channel_sync *s,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence)
struct priv_cmd_entry *entry,
struct gk20a_fence *fence)
{
return __gk20a_channel_semaphore_incr(s,
true /* wfi */,
@@ -736,8 +754,8 @@ static int gk20a_channel_semaphore_incr_wfi(
static int gk20a_channel_semaphore_incr(
struct gk20a_channel_sync *s,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *entry,
struct gk20a_fence *fence,
bool need_sync_fence)
{
/* Don't put wfi cmd to this one since we're not returning
@@ -751,8 +769,8 @@ static int gk20a_channel_semaphore_incr(
static int gk20a_channel_semaphore_incr_user(
struct gk20a_channel_sync *s,
int wait_fence_fd,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *entry,
struct gk20a_fence *fence,
bool wfi,
bool need_sync_fence)
{

@@ -36,8 +36,8 @@ struct gk20a_channel_sync {
* cmdbuf is executed.
*/
int (*wait_syncpt)(struct gk20a_channel_sync *s, u32 id, u32 thresh,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence);
struct priv_cmd_entry *entry,
struct gk20a_fence *fence);
/* Generate a gpu wait cmdbuf from sync fd.
* Returns
@@ -46,8 +46,8 @@ struct gk20a_channel_sync {
* cmdbuf is executed.
*/
int (*wait_fd)(struct gk20a_channel_sync *s, int fd,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence);
struct priv_cmd_entry *entry,
struct gk20a_fence *fence);
/* Increment syncpoint/semaphore.
* Returns
@@ -55,8 +55,8 @@ struct gk20a_channel_sync {
* - a fence that can be passed to wait_cpu() and is_expired().
*/
int (*incr)(struct gk20a_channel_sync *s,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *entry,
struct gk20a_fence *fence,
bool need_sync_fence);
/* Increment syncpoint/semaphore, preceded by a wfi.
@@ -65,8 +65,8 @@ struct gk20a_channel_sync {
* - a fence that can be passed to wait_cpu() and is_expired().
*/
int (*incr_wfi)(struct gk20a_channel_sync *s,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence);
struct priv_cmd_entry *entry,
struct gk20a_fence *fence);
/* Increment syncpoint/semaphore, so that the returned fence represents
* work completion (may need wfi) and can be returned to user space.
@@ -77,8 +77,8 @@ struct gk20a_channel_sync {
*/
int (*incr_user)(struct gk20a_channel_sync *s,
int wait_fence_fd,
struct priv_cmd_entry **entry,
struct gk20a_fence **fence,
struct priv_cmd_entry *entry,
struct gk20a_fence *fence,
bool wfi,
bool need_sync_fence);

@@ -63,16 +63,27 @@ struct gk20a_fence *gk20a_fence_get(struct gk20a_fence *f)
return f;
}
static inline bool gk20a_fence_is_valid(struct gk20a_fence *f)
{
bool valid = f->valid;
rmb();
return valid;
}
int gk20a_fence_wait(struct gk20a_fence *f, int timeout)
{
if (!tegra_platform_is_silicon())
timeout = (u32)MAX_SCHEDULE_TIMEOUT;
return f->ops->wait(f, timeout);
if (f && gk20a_fence_is_valid(f)) {
if (!tegra_platform_is_silicon())
timeout = (u32)MAX_SCHEDULE_TIMEOUT;
return f->ops->wait(f, timeout);
}
return 0;
}
bool gk20a_fence_is_expired(struct gk20a_fence *f)
{
if (f && f->ops)
if (f && gk20a_fence_is_valid(f) && f->ops)
return f->ops->is_expired(f);
else
return true;
@@ -83,7 +94,7 @@ int gk20a_fence_install_fd(struct gk20a_fence *f)
#ifdef CONFIG_SYNC
int fd;
if (!f->sync_fence)
if (!f || !gk20a_fence_is_valid(f) || !f->sync_fence)
return -EINVAL;
fd = get_unused_fd_flags(O_RDWR);
@@ -98,18 +109,28 @@ int gk20a_fence_install_fd(struct gk20a_fence *f)
#endif
}
struct gk20a_fence *gk20a_alloc_fence(const struct gk20a_fence_ops *ops,
struct sync_fence *sync_fence, bool wfi)
struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c)
{
struct gk20a_fence *f = kzalloc(sizeof(*f), GFP_KERNEL);
if (!f)
struct gk20a_fence *fence;
fence = kzalloc(sizeof(struct gk20a_fence), GFP_KERNEL);
if (!fence)
return NULL;
kref_init(&f->ref);
kref_init(&fence->ref);
return fence;
}
void gk20a_init_fence(struct gk20a_fence *f,
const struct gk20a_fence_ops *ops,
struct sync_fence *sync_fence, bool wfi)
{
if (!f)
return;
f->ops = ops;
f->sync_fence = sync_fence;
f->wfi = wfi;
f->syncpt_id = -1;
return f;
}
/* Fences that are backed by GPU semaphores: */
@@ -143,14 +164,15 @@ static const struct gk20a_fence_ops gk20a_semaphore_fence_ops = {
};
/* This function takes ownership of the semaphore */
struct gk20a_fence *gk20a_fence_from_semaphore(
int gk20a_fence_from_semaphore(
struct gk20a_fence *fence_out,
struct sync_timeline *timeline,
struct gk20a_semaphore *semaphore,
wait_queue_head_t *semaphore_wq,
struct sync_fence *dependency,
bool wfi, bool need_sync_fence)
{
struct gk20a_fence *f;
struct gk20a_fence *f = fence_out;
struct sync_fence *sync_fence = NULL;
#ifdef CONFIG_SYNC
@@ -159,21 +181,26 @@ struct gk20a_fence *gk20a_fence_from_semaphore(
dependency, "f-gk20a-0x%04x",
gk20a_semaphore_gpu_ro_va(semaphore));
if (!sync_fence)
return NULL;
return -1;
}
#endif
f = gk20a_alloc_fence(&gk20a_semaphore_fence_ops, sync_fence, wfi);
gk20a_init_fence(f, &gk20a_semaphore_fence_ops, sync_fence, wfi);
if (!f) {
#ifdef CONFIG_SYNC
sync_fence_put(sync_fence);
#endif
return NULL;
return -EINVAL;
}
f->semaphore = semaphore;
f->semaphore_wq = semaphore_wq;
return f;
/* commit previous writes before setting the valid flag */
wmb();
f->valid = true;
return 0;
}
#ifdef CONFIG_TEGRA_GK20A
@@ -197,11 +224,13 @@ static const struct gk20a_fence_ops gk20a_syncpt_fence_ops = {
.is_expired = &gk20a_syncpt_fence_is_expired,
};
struct gk20a_fence *gk20a_fence_from_syncpt(struct platform_device *host1x_pdev,
u32 id, u32 value, bool wfi,
bool need_sync_fence)
int gk20a_fence_from_syncpt(
struct gk20a_fence *fence_out,
struct platform_device *host1x_pdev,
u32 id, u32 value, bool wfi,
bool need_sync_fence)
{
struct gk20a_fence *f;
struct gk20a_fence *f = fence_out;
struct sync_fence *sync_fence = NULL;
#ifdef CONFIG_SYNC
@@ -214,27 +243,32 @@ struct gk20a_fence *gk20a_fence_from_syncpt(struct platform_device *host1x_pdev,
sync_fence = nvhost_sync_create_fence(host1x_pdev, &pt, 1,
"fence");
if (IS_ERR(sync_fence))
return NULL;
return -1;
}
#endif
f = gk20a_alloc_fence(&gk20a_syncpt_fence_ops, sync_fence, wfi);
gk20a_init_fence(f, &gk20a_syncpt_fence_ops, sync_fence, wfi);
if (!f) {
#ifdef CONFIG_SYNC
if (sync_fence)
sync_fence_put(sync_fence);
#endif
return NULL;
return -EINVAL;
}
f->host1x_pdev = host1x_pdev;
f->syncpt_id = id;
f->syncpt_value = value;
return f;
/* commit previous writes before setting the valid flag */
wmb();
f->valid = true;
return 0;
}
#else
struct gk20a_fence *gk20a_fence_from_syncpt(struct platform_device *host1x_pdev,
int gk20a_fence_from_syncpt(struct platform_device *host1x_pdev,
u32 id, u32 value, bool wfi)
{
return NULL;
return -EINVAL;
}
#endif

@@ -31,6 +31,7 @@ struct gk20a_fence_ops;
struct gk20a_fence {
/* Valid for all fence types: */
bool valid;
struct kref ref;
bool wfi;
struct sync_fence *sync_fence;
@@ -47,21 +48,25 @@ struct gk20a_fence {
};
/* Fences can be created from semaphores or syncpoint (id, value) pairs */
struct gk20a_fence *gk20a_fence_from_semaphore(
int gk20a_fence_from_semaphore(
struct gk20a_fence *fence_out,
struct sync_timeline *timeline,
struct gk20a_semaphore *semaphore,
wait_queue_head_t *semaphore_wq,
struct sync_fence *dependency,
bool wfi, bool need_sync_fence);
struct gk20a_fence *gk20a_fence_from_syncpt(
int gk20a_fence_from_syncpt(
struct gk20a_fence *fence_out,
struct platform_device *host1x_pdev,
u32 id, u32 value, bool wfi,
bool need_sync_fence);
struct gk20a_fence *gk20a_alloc_fence(const struct gk20a_fence_ops *ops,
struct sync_fence *sync_fence,
bool wfi);
struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c);
void gk20a_init_fence(struct gk20a_fence *f,
const struct gk20a_fence_ops *ops,
struct sync_fence *sync_fence, bool wfi);
/* Fence operations */
void gk20a_fence_put(struct gk20a_fence *f);

@@ -198,6 +198,7 @@ struct priv_cmd_queue {
};
struct priv_cmd_entry {
bool valid;
struct mem_desc *mem;
u32 off; /* offset in mem, in u32 entries */
u64 gva;