diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 68a303924..651ea08c7 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -33,6 +33,7 @@ #include "gk20a.h" #include "dbg_gpu_gk20a.h" +#include "semaphore_gk20a.h" #include "hw_ram_gk20a.h" #include "hw_fifo_gk20a.h" @@ -340,7 +341,7 @@ static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a) * resource at this point * if not, then it will be destroyed at channel_free() */ - if (ch_gk20a->sync && ch_gk20a->sync->syncpt_aggressive_destroy) { + if (ch_gk20a->sync && ch_gk20a->sync->aggressive_destroy) { ch_gk20a->sync->destroy(ch_gk20a->sync); ch_gk20a->sync = NULL; } @@ -657,6 +658,8 @@ unbind: ch->vpr = false; ch->vm = NULL; + gk20a_channel_fence_close(&ch->last_submit.pre_fence); + gk20a_channel_fence_close(&ch->last_submit.post_fence); if (ch->sync) { ch->sync->destroy(ch->sync); ch->sync = NULL; @@ -1089,7 +1092,8 @@ static int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, ch_vm = c->vm; c->cmds_pending = false; - c->last_submit_fence.valid = false; + gk20a_channel_fence_close(&c->last_submit.pre_fence); + gk20a_channel_fence_close(&c->last_submit.post_fence); c->ramfc.offset = 0; c->ramfc.size = ram_in_ramfc_s() / 8; @@ -1272,13 +1276,16 @@ static int gk20a_channel_submit_wfi(struct channel_gk20a *c) } } - err = c->sync->incr_wfi(c->sync, &cmd, &c->last_submit_fence); + gk20a_channel_fence_close(&c->last_submit.pre_fence); + gk20a_channel_fence_close(&c->last_submit.post_fence); + + err = c->sync->incr_wfi(c->sync, &cmd, &c->last_submit.post_fence); if (unlikely(err)) { mutex_unlock(&c->submit_lock); return err; } - WARN_ON(!c->last_submit_fence.wfi); + WARN_ON(!c->last_submit.post_fence.wfi); c->gpfifo.cpu_va[c->gpfifo.put].entry0 = u64_lo32(cmd->gva); c->gpfifo.cpu_va[c->gpfifo.put].entry1 = u64_hi32(cmd->gva) | @@ -1344,7 +1351,8 @@ static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g) } static int gk20a_channel_add_job(struct channel_gk20a *c, - struct gk20a_channel_fence *fence) + struct gk20a_channel_fence *pre_fence, + struct gk20a_channel_fence *post_fence) { struct vm_gk20a *vm = c->vm; struct channel_gk20a_job *job = NULL; @@ -1369,7 +1377,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, job->num_mapped_buffers = num_mapped_buffers; job->mapped_buffers = mapped_buffers; - job->fence = *fence; + gk20a_channel_fence_dup(pre_fence, &job->pre_fence); + gk20a_channel_fence_dup(post_fence, &job->post_fence); mutex_lock(&c->jobs_lock); list_add_tail(&job->list, &c->jobs); @@ -1391,13 +1400,18 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) mutex_lock(&c->jobs_lock); list_for_each_entry_safe(job, n, &c->jobs, list) { bool completed = WARN_ON(!c->sync) || - c->sync->is_expired(c->sync, &job->fence); + c->sync->is_expired(c->sync, &job->post_fence); if (!completed) break; gk20a_vm_put_buffers(vm, job->mapped_buffers, job->num_mapped_buffers); + /* Close the fences (this will unref the semaphores and release + * them to the pool). */ + gk20a_channel_fence_close(&job->pre_fence); + gk20a_channel_fence_close(&job->post_fence); + /* job is done. 
release its reference to vm */ gk20a_vm_put(vm); @@ -1413,8 +1427,8 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) * the sync resource */ if (list_empty(&c->jobs)) { - if (c->sync && c->sync->syncpt_aggressive_destroy && - c->sync->is_expired(c->sync, &c->last_submit_fence)) { + if (c->sync && c->sync->aggressive_destroy && + c->sync->is_expired(c->sync, &c->last_submit.post_fence)) { c->sync->destroy(c->sync); c->sync = NULL; } @@ -1448,8 +1462,11 @@ static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, struct device *d = dev_from_gk20a(g); int err = 0; int i; + int wait_fence_fd = -1; struct priv_cmd_entry *wait_cmd = NULL; struct priv_cmd_entry *incr_cmd = NULL; + struct gk20a_channel_fence pre_fence = { 0 }; + struct gk20a_channel_fence post_fence = { 0 }; /* we might need two extra gpfifo entries - one for pre fence * and one for post fence. */ const int extra_entries = 2; @@ -1534,12 +1551,14 @@ static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, * keep running some tests which trigger this condition */ if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) { - if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) - err = c->sync->wait_fd(c->sync, fence->syncpt_id, - &wait_cmd); - else + if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) { + wait_fence_fd = fence->syncpt_id; + err = c->sync->wait_fd(c->sync, wait_fence_fd, + &wait_cmd, &pre_fence); + } else { err = c->sync->wait_syncpt(c->sync, fence->syncpt_id, - fence->value, &wait_cmd); + fence->value, &wait_cmd, &pre_fence); + } } if (err) { mutex_unlock(&c->submit_lock); @@ -1551,19 +1570,19 @@ static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, to keep track of method completion for idle railgating */ if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET && flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) - err = c->sync->incr_user_fd(c->sync, &incr_cmd, - &c->last_submit_fence, + err = c->sync->incr_user_fd(c->sync, wait_fence_fd, &incr_cmd, + &post_fence, need_wfi, &fence->syncpt_id); else if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET) err = c->sync->incr_user_syncpt(c->sync, &incr_cmd, - &c->last_submit_fence, + &post_fence, need_wfi, &fence->syncpt_id, &fence->value); else err = c->sync->incr(c->sync, &incr_cmd, - &c->last_submit_fence); + &post_fence); if (err) { mutex_unlock(&c->submit_lock); goto clean_up; @@ -1611,8 +1630,13 @@ static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, incr_cmd->gp_put = c->gpfifo.put; } + gk20a_channel_fence_close(&c->last_submit.pre_fence); + gk20a_channel_fence_close(&c->last_submit.post_fence); + c->last_submit.pre_fence = pre_fence; + c->last_submit.post_fence = post_fence; + /* TODO! Check for errors... 
*/ - gk20a_channel_add_job(c, &c->last_submit_fence); + gk20a_channel_add_job(c, &pre_fence, &post_fence); c->cmds_pending = true; gk20a_bar1_writel(g, @@ -1637,6 +1661,8 @@ clean_up: gk20a_err(d, "fail"); free_priv_cmdbuf(c, wait_cmd); free_priv_cmdbuf(c, incr_cmd); + gk20a_channel_fence_close(&pre_fence); + gk20a_channel_fence_close(&post_fence); gk20a_idle(g->dev); return err; } @@ -1669,6 +1695,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) int gk20a_channel_finish(struct channel_gk20a *ch, unsigned long timeout) { int err = 0; + struct gk20a_channel_fence *fence = &ch->last_submit.post_fence; if (!ch->cmds_pending) return 0; @@ -1677,21 +1704,20 @@ int gk20a_channel_finish(struct channel_gk20a *ch, unsigned long timeout) if (ch->has_timedout) return -ETIMEDOUT; - if (!(ch->last_submit_fence.valid && ch->last_submit_fence.wfi)) { + if (!(fence->valid && fence->wfi)) { gk20a_dbg_fn("issuing wfi, incr to finish the channel"); err = gk20a_channel_submit_wfi(ch); } if (err) return err; - BUG_ON(!(ch->last_submit_fence.valid && ch->last_submit_fence.wfi)); + BUG_ON(!(fence->valid && fence->wfi)); - gk20a_dbg_fn("waiting for channel to finish thresh:%d", - ch->last_submit_fence.thresh); + gk20a_dbg_fn("waiting for channel to finish thresh:%d sema:%p", + fence->thresh, fence->semaphore); if (ch->sync) { - err = ch->sync->wait_cpu(ch->sync, &ch->last_submit_fence, - timeout); + err = ch->sync->wait_cpu(ch->sync, fence, timeout); if (WARN_ON(err)) dev_warn(dev_from_gk20a(ch->g), "timed out waiting for gk20a channel to finish"); @@ -1900,7 +1926,8 @@ int gk20a_channel_suspend(struct gk20a *g) if (c->sync) c->sync->wait_cpu(c->sync, - &c->last_submit_fence, 500000); + &c->last_submit.post_fence, + 500000); break; } } diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index dd0197d6f..84983cc65 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -68,7 +68,8 @@ struct channel_ctx_gk20a { struct channel_gk20a_job { struct mapped_buffer_node **mapped_buffers; int num_mapped_buffers; - struct gk20a_channel_fence fence; + struct gk20a_channel_fence pre_fence; + struct gk20a_channel_fence post_fence; struct list_head list; }; @@ -112,7 +113,10 @@ struct channel_gk20a { u32 timeout_gpfifo_get; bool cmds_pending; - struct gk20a_channel_fence last_submit_fence; + struct { + struct gk20a_channel_fence pre_fence; + struct gk20a_channel_fence post_fence; + } last_submit; void (*remove_support)(struct channel_gk20a *); #if defined(CONFIG_GK20A_CYCLE_STATS) diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index f91dd52d6..677c4b493 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c @@ -19,6 +19,9 @@ #include "channel_sync_gk20a.h" #include "gk20a.h" +#include "semaphore_gk20a.h" +#include "sync_gk20a.h" +#include "mm_gk20a.h" #ifdef CONFIG_SYNC #include "../../../staging/android/sync.h" @@ -74,7 +77,8 @@ bool gk20a_channel_syncpt_is_expired(struct gk20a_channel_sync *s, } int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s, u32 id, - u32 thresh, struct priv_cmd_entry **entry) + u32 thresh, struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence) { struct gk20a_channel_syncpt *sp = container_of(s, struct gk20a_channel_syncpt, ops); @@ -99,11 +103,13 @@ int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s, u32 id, add_wait_cmd(&wait_cmd->ptr[0], id, 
thresh); *entry = wait_cmd; + fence->valid = false; return 0; } int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd, - struct priv_cmd_entry **entry) + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence) { #ifdef CONFIG_SYNC int i; @@ -158,6 +164,7 @@ int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd, sync_fence_put(sync_fence); *entry = wait_cmd; + fence->valid = false; return 0; #else return -ENODEV; @@ -301,6 +308,7 @@ int gk20a_channel_syncpt_incr_user_syncpt(struct gk20a_channel_sync *s, } int gk20a_channel_syncpt_incr_user_fd(struct gk20a_channel_sync *s, + int wait_fence_fd, struct priv_cmd_entry **entry, struct gk20a_channel_fence *fence, bool wfi, @@ -366,18 +374,424 @@ gk20a_channel_syncpt_create(struct channel_gk20a *c) sp->ops.set_min_eq_max = gk20a_channel_syncpt_set_min_eq_max; sp->ops.destroy = gk20a_channel_syncpt_destroy; - sp->ops.syncpt_aggressive_destroy = true; + sp->ops.aggressive_destroy = true; return &sp->ops; } #endif /* CONFIG_TEGRA_GK20A */ +struct gk20a_channel_semaphore { + struct gk20a_channel_sync ops; + struct channel_gk20a *c; + + /* A semaphore pool owned by this channel. */ + struct gk20a_semaphore_pool *pool; + + /* A sync timeline that advances when gpu completes work. */ + struct sync_timeline *timeline; +}; + +#ifdef CONFIG_SYNC +struct wait_fence_work { + struct sync_fence_waiter waiter; + struct channel_gk20a *ch; + struct gk20a_semaphore *sema; +}; + +static void gk20a_channel_semaphore_launcher( + struct sync_fence *fence, + struct sync_fence_waiter *waiter) +{ + int err; + struct wait_fence_work *w = + container_of(waiter, struct wait_fence_work, waiter); + struct gk20a *g = w->ch->g; + + gk20a_dbg_info("waiting for pre fence %p '%s'", + fence, fence->name); + err = sync_fence_wait(fence, -1); + if (err < 0) + dev_err(&g->dev->dev, "error waiting pre-fence: %d\n", err); + + gk20a_dbg_info( + "wait completed (%d) for fence %p '%s', triggering gpu work", + err, fence, fence->name); + sync_fence_put(fence); + gk20a_semaphore_release(w->sema); + gk20a_semaphore_put(w->sema); + kfree(w); +} +#endif + +static int add_sema_cmd(u32 *ptr, u64 sema, u32 payload, + bool acquire, bool wfi) +{ + int i = 0; + /* semaphore_a */ + ptr[i++] = 0x20010004; + /* offset_upper */ + ptr[i++] = (sema >> 32) & 0xff; + /* semaphore_b */ + ptr[i++] = 0x20010005; + /* offset */ + ptr[i++] = sema & 0xffffffff; + /* semaphore_c */ + ptr[i++] = 0x20010006; + /* payload */ + ptr[i++] = payload; + if (acquire) { + /* semaphore_d */ + ptr[i++] = 0x20010007; + /* operation: acq_geq, switch_en */ + ptr[i++] = 0x4 | (0x1 << 12); + } else { + /* semaphore_d */ + ptr[i++] = 0x20010007; + /* operation: release, wfi */ + ptr[i++] = 0x2 | ((wfi ? 
0x0 : 0x1) << 20);
+		/* non_stall_int */
+		ptr[i++] = 0x20010008;
+		/* ignored */
+		ptr[i++] = 0;
+	}
+	return i;
+}
+
+static int gk20a_channel_semaphore_wait_cpu(
+		struct gk20a_channel_sync *s,
+		struct gk20a_channel_fence *fence,
+		int timeout)
+{
+	int remain;
+	struct gk20a_channel_semaphore *sp =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	if (!fence->valid || WARN_ON(!fence->semaphore))
+		return 0;
+
+	remain = wait_event_interruptible_timeout(
+			sp->c->semaphore_wq,
+			!gk20a_semaphore_is_acquired(fence->semaphore),
+			timeout);
+	if (remain == 0 && gk20a_semaphore_is_acquired(fence->semaphore))
+		return -ETIMEDOUT;
+	else if (remain < 0)
+		return remain;
+	return 0;
+}
+
+static bool gk20a_channel_semaphore_is_expired(
+		struct gk20a_channel_sync *s,
+		struct gk20a_channel_fence *fence)
+{
+	bool expired;
+	struct gk20a_channel_semaphore *sp =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	if (!fence->valid || WARN_ON(!fence->semaphore))
+		return true;
+
+	expired = !gk20a_semaphore_is_acquired(fence->semaphore);
+	if (expired)
+		gk20a_sync_timeline_signal(sp->timeline);
+	return expired;
+}
+
+static int gk20a_channel_semaphore_wait_syncpt(
+		struct gk20a_channel_sync *s, u32 id,
+		u32 thresh, struct priv_cmd_entry **entry,
+		struct gk20a_channel_fence *fence)
+{
+	struct gk20a_channel_semaphore *sema =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	struct device *dev = dev_from_gk20a(sema->c->g);
+	gk20a_err(dev, "trying to use syncpoint synchronization");
+	return -ENODEV;
+}
+
+static int gk20a_channel_semaphore_wait_fd(
+		struct gk20a_channel_sync *s, int fd,
+		struct priv_cmd_entry **entry,
+		struct gk20a_channel_fence *fence)
+{
+	struct gk20a_channel_semaphore *sema =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	struct channel_gk20a *c = sema->c;
+#ifdef CONFIG_SYNC
+	struct sync_fence *sync_fence;
+	struct priv_cmd_entry *wait_cmd = NULL;
+	struct wait_fence_work *w;
+	int written;
+	int err;
+	u64 va;
+
+	sync_fence = gk20a_sync_fence_fdget(fd);
+	if (!sync_fence)
+		return -EINVAL;
+
+	w = kzalloc(sizeof(*w), GFP_KERNEL);
+	if (!w) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
+	w->ch = c;
+	w->sema = gk20a_semaphore_alloc(sema->pool);
+	if (!w->sema) {
+		gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
+		err = -EAGAIN;
+		goto fail;
+	}
+
+	gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
+	if (wait_cmd == NULL) {
+		gk20a_err(dev_from_gk20a(c->g),
+			"not enough priv cmd buffer space");
+		err = -EAGAIN;
+		goto fail;
+	}
+
+	va = gk20a_semaphore_gpu_va(w->sema, c->vm);
+	/* GPU unblocked when the semaphore value becomes 1. 
*/ + written = add_sema_cmd(wait_cmd->ptr, va, 1, true, false); + WARN_ON(written != wait_cmd->size); + sync_fence_wait_async(sync_fence, &w->waiter); + + *entry = wait_cmd; + return 0; +fail: + if (w && w->sema) + gk20a_semaphore_put(w->sema); + kfree(w); + sync_fence_put(sync_fence); + return err; +#else + gk20a_err(dev_from_gk20a(c->g), + "trying to use sync fds with CONFIG_SYNC disabled"); + return -ENODEV; +#endif +} + +static int __gk20a_channel_semaphore_incr( + struct gk20a_channel_sync *s, bool wfi_cmd, + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence) +{ + u64 va; + int incr_cmd_size; + int written; + struct priv_cmd_entry *incr_cmd = NULL; + struct gk20a_channel_semaphore *sp = + container_of(s, struct gk20a_channel_semaphore, ops); + struct channel_gk20a *c = sp->c; + struct gk20a_semaphore *semaphore; + + semaphore = gk20a_semaphore_alloc(sp->pool); + if (!semaphore) { + gk20a_err(dev_from_gk20a(c->g), + "ran out of semaphores"); + return -EAGAIN; + } + + incr_cmd_size = 10; + gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, &incr_cmd); + if (incr_cmd == NULL) { + gk20a_err(dev_from_gk20a(c->g), + "not enough priv cmd buffer space"); + gk20a_semaphore_put(semaphore); + return -EAGAIN; + } + + /* Release the completion semaphore. */ + va = gk20a_semaphore_gpu_va(semaphore, c->vm); + written = add_sema_cmd(incr_cmd->ptr, va, 1, false, wfi_cmd); + WARN_ON(written != incr_cmd_size); + + fence->valid = true; + fence->wfi = wfi_cmd; + fence->semaphore = semaphore; + *entry = incr_cmd; + return 0; +} + +static int gk20a_channel_semaphore_incr_wfi( + struct gk20a_channel_sync *s, + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence) +{ + return __gk20a_channel_semaphore_incr(s, + true /* wfi */, + entry, fence); +} + +static int gk20a_channel_semaphore_incr( + struct gk20a_channel_sync *s, + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence) +{ + /* Don't put wfi cmd to this one since we're not returning + * a fence to user space. */ + return __gk20a_channel_semaphore_incr(s, false /* no wfi */, + entry, fence); +} + +static int gk20a_channel_semaphore_incr_user_syncpt( + struct gk20a_channel_sync *s, + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence, + bool wfi, + u32 *id, u32 *thresh) +{ + struct gk20a_channel_semaphore *sema = + container_of(s, struct gk20a_channel_semaphore, ops); + struct device *dev = dev_from_gk20a(sema->c->g); + gk20a_err(dev, "trying to use syncpoint synchronization"); + return -ENODEV; +} + +static int gk20a_channel_semaphore_incr_user_fd( + struct gk20a_channel_sync *s, + int wait_fence_fd, + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence, + bool wfi, + int *fd) +{ + struct gk20a_channel_semaphore *sema = + container_of(s, struct gk20a_channel_semaphore, ops); +#ifdef CONFIG_SYNC + struct sync_fence *dependency = NULL; + int err; + + err = __gk20a_channel_semaphore_incr(s, wfi, + entry, fence); + if (err) + return err; + + if (wait_fence_fd >= 0) { + dependency = gk20a_sync_fence_fdget(wait_fence_fd); + if (!dependency) + return -EINVAL; + } + + *fd = gk20a_sync_fence_create(sema->timeline, fence->semaphore, + dependency, "fence"); + if (*fd < 0) { + if (dependency) + sync_fence_put(dependency); + return *fd; + } + return 0; +#else + gk20a_err(dev_from_gk20a(sema->c->g), + "trying to use sync fds with CONFIG_SYNC disabled"); + return -ENODEV; +#endif +} + +static void gk20a_channel_semaphore_set_min_eq_max(struct gk20a_channel_sync *s) +{ + /* Nothing to do. 
*/ +} + +static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s) +{ + struct gk20a_channel_semaphore *sema = + container_of(s, struct gk20a_channel_semaphore, ops); + if (sema->timeline) + gk20a_sync_timeline_destroy(sema->timeline); + if (sema->pool) { + gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm); + gk20a_semaphore_pool_put(sema->pool); + } + kfree(sema); +} + +static struct gk20a_channel_sync * +gk20a_channel_semaphore_create(struct channel_gk20a *c) +{ + int err; + int asid = -1; + struct gk20a_channel_semaphore *sema; + char pool_name[20]; + + if (WARN_ON(!c->vm)) + return NULL; + + sema = kzalloc(sizeof(*sema), GFP_KERNEL); + if (!sema) + return NULL; + sema->c = c; + + if (c->vm->as_share) + asid = c->vm->as_share->id; + + /* A pool of 256 semaphores fits into one 4k page. */ + sprintf(pool_name, "semaphore_pool-%d", c->hw_chid); + sema->pool = gk20a_semaphore_pool_alloc(dev_from_gk20a(c->g), + pool_name, 256); + if (!sema->pool) + goto clean_up; + + /* Map the semaphore pool to the channel vm. Map as read-write to the + * owner channel (all other channels should map as read only!). */ + err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none); + if (err) + goto clean_up; + +#ifdef CONFIG_SYNC + sema->timeline = gk20a_sync_timeline_create( + "gk20a_ch%d_as%d", c->hw_chid, asid); + if (!sema->timeline) + goto clean_up; +#endif + sema->ops.wait_cpu = gk20a_channel_semaphore_wait_cpu; + sema->ops.is_expired = gk20a_channel_semaphore_is_expired; + sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt; + sema->ops.wait_fd = gk20a_channel_semaphore_wait_fd; + sema->ops.incr = gk20a_channel_semaphore_incr; + sema->ops.incr_wfi = gk20a_channel_semaphore_incr_wfi; + sema->ops.incr_user_syncpt = gk20a_channel_semaphore_incr_user_syncpt; + sema->ops.incr_user_fd = gk20a_channel_semaphore_incr_user_fd; + sema->ops.set_min_eq_max = gk20a_channel_semaphore_set_min_eq_max; + sema->ops.destroy = gk20a_channel_semaphore_destroy; + + /* Aggressively destroying the semaphore sync would cause overhead + * since the pool needs to be mapped to GMMU. 
*/ + sema->ops.aggressive_destroy = false; + + return &sema->ops; +clean_up: + gk20a_channel_semaphore_destroy(&sema->ops); + return NULL; +} + struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c) { #ifdef CONFIG_TEGRA_GK20A if (gk20a_platform_has_syncpoints(c->g->dev)) return gk20a_channel_syncpt_create(c); #endif - WARN_ON(1); - return NULL; + return gk20a_channel_semaphore_create(c); +} + +static inline bool gk20a_channel_fence_is_closed(struct gk20a_channel_fence *f) +{ + if (f->valid || f->semaphore) + return false; + return true; +} + +void gk20a_channel_fence_close(struct gk20a_channel_fence *f) +{ + if (f->semaphore) + gk20a_semaphore_put(f->semaphore); + memset(f, 0, sizeof(*f)); +} + +void gk20a_channel_fence_dup(struct gk20a_channel_fence *from, + struct gk20a_channel_fence *to) +{ + WARN_ON(!gk20a_channel_fence_is_closed(to)); + *to = *from; + if (to->semaphore) + gk20a_semaphore_get(to->semaphore); } diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h index 90b61bfd7..baa4a151a 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h @@ -23,11 +23,13 @@ struct gk20a_channel_sync; struct priv_cmd_entry; struct channel_gk20a; +struct gk20a_semaphore; struct gk20a_channel_fence { bool valid; bool wfi; /* was issued with preceding wfi */ - u32 thresh; /* either semaphore or syncpoint value */ + u32 thresh; /* syncpoint fences only */ + struct gk20a_semaphore *semaphore; /* semaphore fences only */ }; struct gk20a_channel_sync { @@ -43,11 +45,13 @@ struct gk20a_channel_sync { /* Generate a gpu wait cmdbuf from syncpoint. */ int (*wait_syncpt)(struct gk20a_channel_sync *s, u32 id, u32 thresh, - struct priv_cmd_entry **entry); + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence); /* Generate a gpu wait cmdbuf from sync fd. */ int (*wait_fd)(struct gk20a_channel_sync *s, int fd, - struct priv_cmd_entry **entry); + struct priv_cmd_entry **entry, + struct gk20a_channel_fence *fence); /* Increment syncpoint/semaphore. * Returns @@ -88,6 +92,7 @@ struct gk20a_channel_sync { * - a sync fd that can be returned to user space. */ int (*incr_user_fd)(struct gk20a_channel_sync *s, + int wait_fence_fd, struct priv_cmd_entry **entry, struct gk20a_channel_fence *fence, bool wfi, @@ -96,12 +101,16 @@ struct gk20a_channel_sync { /* Reset the channel syncpoint/semaphore. */ void (*set_min_eq_max)(struct gk20a_channel_sync *s); - /* flag to set syncpt destroy aggressiveness */ - bool syncpt_aggressive_destroy; + /* flag to set sync destroy aggressiveness */ + bool aggressive_destroy; /* Free the resources allocated by gk20a_channel_sync_create. */ void (*destroy)(struct gk20a_channel_sync *s); }; struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c); + +void gk20a_channel_fence_close(struct gk20a_channel_fence *f); +void gk20a_channel_fence_dup(struct gk20a_channel_fence *from, + struct gk20a_channel_fence *to); #endif
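
A note for reviewers on the fence lifecycle this patch introduces: each submit now produces a pre_fence/post_fence pair whose semaphore reference is shared between the per-channel job list (via gk20a_channel_fence_dup(), which takes an extra semaphore reference) and channel->last_submit (which takes over the submit path's original references without a dup). gk20a_channel_update() drops the job's references once the post fence expires, and the next submit (or channel teardown) drops last_submit's. The program below is a minimal sketch of that refcounting discipline only; it is not driver code, the toy_* names are invented for illustration, and it assumes gk20a_semaphore_alloc() hands back exactly one reference that the pool reclaims when the count reaches zero.

/* Toy model of the gk20a_channel_fence ownership rules in the patch
 * above. The semaphore pool is replaced by a plain reference counter;
 * comments name the driver calls each step stands in for. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct toy_semaphore { int refcount; };

struct toy_fence {
	bool valid;
	struct toy_semaphore *semaphore;
};

/* Stands in for gk20a_channel_fence_close(): drop the semaphore
 * reference (gk20a_semaphore_put) and invalidate the fence. */
static void toy_fence_close(struct toy_fence *f)
{
	if (f->semaphore)
		f->semaphore->refcount--;
	memset(f, 0, sizeof(*f));
}

/* Stands in for gk20a_channel_fence_dup(): the destination must be
 * closed, and the copy takes its own reference (gk20a_semaphore_get). */
static void toy_fence_dup(struct toy_fence *from, struct toy_fence *to)
{
	assert(!to->valid && !to->semaphore);
	*to = *from;
	if (to->semaphore)
		to->semaphore->refcount++;
}

int main(void)
{
	struct toy_semaphore sema = { .refcount = 1 };	/* pool alloc */
	struct toy_fence post = { .valid = true, .semaphore = &sema };
	struct toy_fence job_copy = { 0 };
	struct toy_fence last_submit = { 0 };

	/* Submit path: the job gets a dup'd copy, last_submit takes
	 * over the original reference (plain assignment, no dup). */
	toy_fence_dup(&post, &job_copy);
	last_submit = post;

	/* gk20a_channel_update(): post fence expired, close the job's
	 * copy; the next submit (or teardown) closes last_submit's. */
	toy_fence_close(&job_copy);
	toy_fence_close(&last_submit);

	printf("refcount back to %d\n", sema.refcount);
	return 0;
}

If the references balance, the program prints "refcount back to 0", mirroring the point at which the real semaphore would be released back to its pool.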