diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 3f9b04327..6c7ff551f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -1002,6 +1002,9 @@ unbind: mutex_unlock(&g->dbg_sessions_lock); + /* Make sure that when the ch is re-opened it will get a new HW sema. */ + ch->hw_sema = NULL; + /* make sure we catch accesses of unopened channels in case * there's non-refcounted channel pointers hanging around */ ch->g = NULL; diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index acd272b47..c5a1bd24f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -108,6 +108,8 @@ struct channel_gk20a { atomic_t ref_count; wait_queue_head_t ref_count_dec_wq; + struct gk20a_semaphore_int *hw_sema; + int hw_chid; bool wdt_enabled; bool bound; diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index d2d8c0947..9c8911e96 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c @@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher( } #endif -static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, - u64 sema, u32 payload, bool acquire, bool wfi) +static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c, + struct gk20a_semaphore *s, struct priv_cmd_entry *cmd, + int cmd_size, bool acquire, bool wfi) { u32 off = cmd->off; + u64 va; + + /* + * RO for acquire (since we just need to read the mem) and RW for + * release since we will need to write back to the semaphore memory. + */ + va = acquire ? gk20a_semaphore_gpu_ro_va(s) : + gk20a_semaphore_gpu_rw_va(s); + + /* + * If the op is not an acquire (so therefor a release) we should + * incr the underlying sema next_value. + */ + if (!acquire) + gk20a_semaphore_incr(s); + /* semaphore_a */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004); /* offset_upper */ - gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff); + gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff); /* semaphore_b */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005); /* offset */ - gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff); - /* semaphore_c */ - gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006); - /* payload */ - gk20a_mem_wr32(g, cmd->mem, off++, payload); + gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff); + if (acquire) { + /* semaphore_c */ + gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006); + /* payload */ + gk20a_mem_wr32(g, cmd->mem, off++, + gk20a_semaphore_get_value(s)); /* semaphore_d */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); /* operation: acq_geq, switch_en */ gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12)); } else { + /* semaphore_c */ + gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006); + /* payload */ + gk20a_mem_wr32(g, cmd->mem, off++, + gk20a_semaphore_get_value(s)); /* semaphore_d */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); /* operation: release, wfi */ @@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, /* ignored */ gk20a_mem_wr32(g, cmd->mem, off++, 0); } - return off - cmd->off; } static int gk20a_channel_semaphore_wait_syncpt( @@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt( return -ENODEV; } +/* + * UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18. 
+ * But since there's no API for getting the underlying sync_pts we have to do + * some conditional compilation. + */ +#ifdef CONFIG_SYNC +static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) + struct sync_pt *pt; + + pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list); + return gk20a_sync_pt_inst_get_sema(pt); +#else + return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt); +#endif +} + +/* + * Attempt a fast path for waiting on a sync_fence. Basically if the passed + * sync_fence is backed by a gk20a_semaphore then there's no reason to go + * through the rigmarole of setting up a separate semaphore which waits on an + * interrupt from the GPU and then triggers a worker thread to execute a SW + * based semaphore release. Instead just have the GPU wait on the same semaphore + * that is going to be incremented by the GPU. + * + * This function returns 2 possible values: -ENODEV or 0 on success. In the case + * of -ENODEV the fastpath cannot be taken due to the fence not being backed by + * a GPU semaphore. + */ +static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c, + struct sync_fence *fence, + struct priv_cmd_entry **wait_cmd, + struct gk20a_semaphore **fp_sema) +{ + struct gk20a_semaphore *sema; + int err; + + if (!gk20a_is_sema_backed_sync_fence(fence)) + return -ENODEV; + + sema = sema_from_sync_fence(fence); + + /* + * If there's no underlying sema then that means the underlying sema has + * already signaled. + */ + if (!sema) { + *fp_sema = NULL; + return 0; + } + + err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd); + if (err) + return err; + + gk20a_semaphore_get(sema); + BUG_ON(!atomic_read(&sema->value)); + add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false); + + /* + * Make sure that gk20a_channel_semaphore_wait_fd() can create another + * fence with the underlying semaphore. + */ + *fp_sema = sema; + + return 0; +} +#endif + static int gk20a_channel_semaphore_wait_fd( struct gk20a_channel_sync *s, int fd, struct priv_cmd_entry **entry, @@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd( container_of(s, struct gk20a_channel_semaphore, ops); struct channel_gk20a *c = sema->c; #ifdef CONFIG_SYNC + struct gk20a_semaphore *fp_sema; struct sync_fence *sync_fence; struct priv_cmd_entry *wait_cmd = NULL; - struct wait_fence_work *w; - int written; - int err, ret; - u64 va; + struct wait_fence_work *w = NULL; + int err, ret, status; sync_fence = gk20a_sync_fence_fdget(fd); if (!sync_fence) return -EINVAL; - w = kzalloc(sizeof(*w), GFP_KERNEL); - if (!w) { - err = -ENOMEM; - goto fail; - } - sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); - w->ch = c; - w->sema = gk20a_semaphore_alloc(sema->pool); - if (!w->sema) { - gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); - err = -ENOMEM; - goto fail; + ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema); + if (ret == 0) { + if (fp_sema) + *fence = gk20a_fence_from_semaphore(sema->timeline, + fp_sema, + &c->semaphore_wq, + NULL, false); + else + /* + * Allocate an empty fence. It will instantly return + * from gk20a_fence_wait(). + */ + *fence = gk20a_alloc_fence(NULL, NULL, false); + + sync_fence_put(sync_fence); + goto skip_slow_path; } - /* worker takes one reference */ - gk20a_semaphore_get(w->sema); + /* If the fence has signaled there is no reason to wait on it. 
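+	 * (sync_fence::status became an atomic_t on newer kernels, hence the version-dependent read below.)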
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) + status = sync_fence->status; +#else + status = atomic_read(&sync_fence->status); +#endif + if (status) { + sync_fence_put(sync_fence); + goto skip_slow_path; + } err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd); if (err) { gk20a_err(dev_from_gk20a(c->g), "not enough priv cmd buffer space"); - goto fail; + sync_fence_put(sync_fence); + return -ENOMEM; } - va = gk20a_semaphore_gpu_va(w->sema, c->vm); - /* GPU unblocked when when the semaphore value becomes 1. */ - written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false); + w = kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) { + err = -ENOMEM; + goto fail_free_cmdbuf; + } + + sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); + w->ch = c; + w->sema = gk20a_semaphore_alloc(c); + if (!w->sema) { + gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); + err = -ENOMEM; + goto fail_free_worker; + } + + /* worker takes one reference */ + gk20a_semaphore_get(w->sema); + gk20a_semaphore_incr(w->sema); + + /* GPU unblocked when the semaphore value increments. */ + add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false); - WARN_ON(written != wait_cmd->size); ret = sync_fence_wait_async(sync_fence, &w->waiter); /* * If the sync_fence has already signaled then the above async_wait * will never trigger. This causes the semaphore release op to never * happen which, in turn, hangs the GPU. That's bad. So let's just - * do the semaphore_release right now. + * do the gk20a_semaphore_release() right now. */ - if (ret == 1) + if (ret == 1) { + sync_fence_put(sync_fence); gk20a_semaphore_release(w->sema); + gk20a_semaphore_put(w->sema); + } /* XXX - this fixes an actual bug, we need to hold a ref to this semaphore while the job is in flight. */ *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema, &c->semaphore_wq, NULL, false); + +skip_slow_path: *entry = wait_cmd; return 0; -fail: + +fail_free_worker: if (w && w->sema) gk20a_semaphore_put(w->sema); kfree(w); sync_fence_put(sync_fence); +fail_free_cmdbuf: + if (wait_cmd) + gk20a_free_priv_cmdbuf(c, wait_cmd); return err; #else gk20a_err(dev_from_gk20a(c->g), @@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr( struct gk20a_fence **fence, bool need_sync_fence) { - u64 va; int incr_cmd_size; - int written; struct priv_cmd_entry *incr_cmd = NULL; struct gk20a_channel_semaphore *sp = container_of(s, struct gk20a_channel_semaphore, ops); @@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr( struct gk20a_semaphore *semaphore; int err = 0; - semaphore = gk20a_semaphore_alloc(sp->pool); + semaphore = gk20a_semaphore_alloc(c); if (!semaphore) { gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); @@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr( } /* Release the completion semaphore. */ - va = gk20a_semaphore_gpu_va(semaphore, c->vm); - written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd); - WARN_ON(written != incr_cmd_size); + add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd); *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore, &c->semaphore_wq, @@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr( { /* Don't put wfi cmd to this one since we're not returning * a fence to user space. 
*/ - return __gk20a_channel_semaphore_incr(s, false /* no wfi */, - NULL, entry, fence, need_sync_fence); + return __gk20a_channel_semaphore_incr(s, + false /* no wfi */, + NULL, + entry, fence, need_sync_fence); } static int gk20a_channel_semaphore_incr_user( @@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s) container_of(s, struct gk20a_channel_semaphore, ops); if (sema->timeline) gk20a_sync_timeline_destroy(sema->timeline); - if (sema->pool) { - gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm); - gk20a_semaphore_pool_put(sema->pool); - } + + /* The sema pool is cleaned up by the VM destroy. */ + sema->pool = NULL; + kfree(sema); } static struct gk20a_channel_sync * gk20a_channel_semaphore_create(struct channel_gk20a *c) { - int err; int asid = -1; struct gk20a_channel_semaphore *sema; char pool_name[20]; @@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c) asid = c->vm->as_share->id; sprintf(pool_name, "semaphore_pool-%d", c->hw_chid); - sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024); - if (!sema->pool) - goto clean_up; - - /* Map the semaphore pool to the channel vm. Map as read-write to the - * owner channel (all other channels should map as read only!). */ - err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none); - if (err) - goto clean_up; + sema->pool = c->vm->sema_pool; #ifdef CONFIG_SYNC sema->timeline = gk20a_sync_timeline_create( "gk20a_ch%d_as%d", c->hw_chid, asid); - if (!sema->timeline) - goto clean_up; + if (!sema->timeline) { + gk20a_channel_semaphore_destroy(&sema->ops); + return NULL; + } #endif atomic_set(&sema->ops.refcount, 0); sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt; @@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c) sema->ops.destroy = gk20a_channel_semaphore_destroy; return &sema->ops; -clean_up: - gk20a_channel_semaphore_destroy(&sema->ops); - return NULL; } void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync) diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c index 23522882f..fbbaa2a7a 100644 --- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c @@ -155,8 +155,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore( #ifdef CONFIG_SYNC sync_fence = gk20a_sync_fence_create(timeline, semaphore, - dependency, "f-gk20a-0x%04x", - semaphore->offset & 0xffff); + dependency, "f-gk20a-0x%04x", + gk20a_semaphore_gpu_ro_va(semaphore)); if (!sync_fence) return NULL; #endif diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 5ab09ac38..7bd9775e4 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -738,6 +738,11 @@ struct gk20a { #endif struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; + /* + * A group of semaphore pools. One for each channel. 
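+	 * (More precisely, one pool per address space: channels that share a VM share its pool, each with its own HW semaphore slot in it.)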
+ */ + struct gk20a_semaphore_sea *sema_sea; + /* held while manipulating # of debug/profiler sessions present */ /* also prevents debug sessions from attaching until released */ struct mutex dbg_sessions_lock; diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 3b21e8432..9299266fa 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm) struct rb_node *node; gk20a_dbg_fn(""); + + /* + * Do this outside of the update_gmmu_lock since unmapping the semaphore + * pool involves unmapping a GMMU mapping which means aquiring the + * update_gmmu_lock. + */ + if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) { + gk20a_semaphore_pool_unmap(vm->sema_pool, vm); + gk20a_semaphore_pool_put(vm->sema_pool); + } + mutex_lock(&vm->update_gmmu_lock); /* TBD: add a flag here for the unmap code to recognize teardown @@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = { {.update_entry = NULL} }; +/* + * Initialize a semaphore pool. Just return successfully if we do not need + * semaphores (i.e when sync-pts are active). + */ +int gk20a_init_sema_pool(struct vm_gk20a *vm) +{ + struct gk20a_semaphore_sea *sema_sea; + struct mm_gk20a *mm = vm->mm; + struct gk20a *g = mm->g; + int err; + + /* + * Don't waste the memory on semaphores if we don't need them. + */ + if (gk20a_platform_has_syncpoints(g->dev)) + return 0; + + if (vm->sema_pool) + return 0; + + sema_sea = gk20a_semaphore_sea_create(g); + if (!sema_sea) + return -ENOMEM; + + vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea); + if (!vm->sema_pool) { + gk20a_vm_put(vm); + return -ENOMEM; + } + + /* + * Allocate a chunk of GPU VA space for mapping the semaphores. We will + * do a fixed alloc in the kernel VM so that all channels have the same + * RO address range for the semaphores. + * + * !!! TODO: cleanup. + */ + sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel], + vm->va_limit - + mm->channel.kernel_size, + 512 * PAGE_SIZE); + if (!sema_sea->gpu_va) { + gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va); + gk20a_vm_put(vm); + return -ENOMEM; + } + + err = gk20a_semaphore_pool_map(vm->sema_pool, vm); + if (err) { + gk20a_semaphore_pool_unmap(vm->sema_pool, vm); + gk20a_bfree(&vm->vma[gmmu_page_size_small], + vm->sema_pool->gpu_va); + gk20a_vm_put(vm); + } + + return 0; +} + int gk20a_init_vm(struct mm_gk20a *mm, struct vm_gk20a *vm, u32 big_page_size, @@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm, vm->big_pages = big_pages; vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; - vm->userspace_managed = userspace_managed; - vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g, vm->big_page_size); @@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm, kref_init(&vm->ref); INIT_LIST_HEAD(&vm->reserved_va_list); + /* + * This is only necessary for channel address spaces. The best way to + * distinguish channel address spaces from other address spaces is by + * size - if the address space is 4GB or less, it's not a channel. 
+ */ + if (vm->va_limit > SZ_4G) { + err = gk20a_init_sema_pool(vm); + if (err) + goto clean_up_big_allocator; + } + return 0; clean_up_big_allocator: diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index db74a5ca6..7bb4d011c 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -287,6 +287,11 @@ struct vm_gk20a { /* if non-NULL, kref_put will use this batch when unmapping. Must hold vm->update_gmmu_lock. */ struct vm_gk20a_mapping_batch *kref_put_batch; + + /* + * Each address space needs to have a semaphore pool. + */ + struct gk20a_semaphore_pool *sema_pool; }; struct gk20a; diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c index 3b17bfcb2..aa375b245 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c @@ -15,63 +15,284 @@ * more details. */ -#include "semaphore_gk20a.h" +#define pr_fmt(fmt) "gpu_sema: " fmt + #include +#include #include + +#include + #include "gk20a.h" #include "mm_gk20a.h" +#include "semaphore_gk20a.h" -static const int SEMAPHORE_SIZE = 16; +#define __lock_sema_sea(s) \ + do { \ + mutex_lock(&s->sea_lock); \ + } while (0) -struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g, - const char *unique_name, size_t capacity) +#define __unlock_sema_sea(s) \ + do { \ + mutex_unlock(&s->sea_lock); \ + } while (0) + +/* + * Return the sema_sea pointer. + */ +struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g) { - struct gk20a_semaphore_pool *p; - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) + return g->sema_sea; +} + +static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea) +{ + int ret = 0; + struct gk20a *gk20a = sea->gk20a; + + __lock_sema_sea(sea); + + ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING, + PAGE_SIZE * SEMAPHORE_POOL_COUNT, + &sea->sea_mem); + if (ret) + goto out; + + sea->ro_sg_table = sea->sea_mem.sgt; + sea->size = SEMAPHORE_POOL_COUNT; + sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE; + +out: + __unlock_sema_sea(sea); + return ret; +} + +/* + * Create the semaphore sea. Only create it once - subsequent calls to this will + * return the originally created sea pointer. + */ +struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g) +{ + if (g->sema_sea) + return g->sema_sea; + + g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL); + if (!g->sema_sea) return NULL; - kref_init(&p->ref); - INIT_LIST_HEAD(&p->maps); - mutex_init(&p->maps_mutex); - p->g = g; + g->sema_sea->size = 0; + g->sema_sea->page_count = 0; + g->sema_sea->gk20a = g; + INIT_LIST_HEAD(&g->sema_sea->pool_list); + mutex_init(&g->sema_sea->sea_lock); - /* Alloc one 4k page of semaphore per channel. */ - if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE), - &p->mem)) - goto clean_up; + if (__gk20a_semaphore_sea_grow(g->sema_sea)) + goto cleanup; - /* Sacrifice one semaphore in the name of returning error codes. 
*/ - if (gk20a_allocator_init(&p->alloc, unique_name, - SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE, - SEMAPHORE_SIZE)) - goto clean_up; + return g->sema_sea; - gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va, - (u64)sg_dma_address(p->mem.sgt->sgl), - (u64)sg_phys(p->mem.sgt->sgl)); - return p; - -clean_up: - if (p->mem.size) - gk20a_gmmu_free(p->g, &p->mem); - kfree(p); +cleanup: + kfree(g->sema_sea); + g->sema_sea = NULL; return NULL; } +static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len) +{ + unsigned long idx = find_first_zero_bit(bitmap, len); + + if (idx == len) + return -ENOSPC; + + set_bit(idx, bitmap); + + return (int)idx; +} + +/* + * Allocate a pool from the sea. + */ +struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc( + struct gk20a_semaphore_sea *sea) +{ + struct gk20a_semaphore_pool *p; + unsigned long page_idx; + int err = 0; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + __lock_sema_sea(sea); + + page_idx = __semaphore_bitmap_alloc(sea->pools_alloced, + SEMAPHORE_POOL_COUNT); + if (page_idx < 0) { + err = page_idx; + goto fail; + } + + p->page = sea->sea_mem.pages[page_idx]; + p->ro_sg_table = sea->ro_sg_table; + p->page_idx = page_idx; + p->sema_sea = sea; + INIT_LIST_HEAD(&p->hw_semas); + kref_init(&p->ref); + mutex_init(&p->pool_lock); + + sea->page_count++; + list_add(&p->pool_list_entry, &sea->pool_list); + __unlock_sema_sea(sea); + + return p; + +fail: + __unlock_sema_sea(sea); + kfree(p); + return ERR_PTR(err); +} + +/* + * Map a pool into the passed vm's address space. This handles both the fixed + * global RO mapping and the non-fixed private RW mapping. + */ +int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, + struct vm_gk20a *vm) +{ + int ents, err = 0; + u64 addr; + + p->cpu_va = vmap(&p->page, 1, 0, + pgprot_writecombine(PAGE_KERNEL)); + + /* First do the RW mapping. */ + p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL); + if (!p->rw_sg_table) + return -ENOMEM; + + err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0, + PAGE_SIZE, GFP_KERNEL); + if (err) { + err = -ENOMEM; + goto fail; + } + + /* Add IOMMU mapping... */ + ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1, + DMA_BIDIRECTIONAL); + if (ents != 1) { + err = -ENOMEM; + goto fail_free_sgt; + } + + /* Map into the GPU... Doesn't need to be fixed. */ + p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE, + 0, gk20a_mem_flag_none, false); + if (!p->gpu_va) { + err = -ENOMEM; + goto fail_unmap_sgt; + } + + /* + * And now the global mapping. Take the sea lock so that we don't race + * with a concurrent remap. + */ + __lock_sema_sea(p->sema_sea); + + BUG_ON(p->mapped); + addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table, + p->sema_sea->gpu_va, p->sema_sea->map_size, + 0, + gk20a_mem_flag_read_only, + false); + if (!addr) { + err = -ENOMEM; + BUG(); + goto fail_unlock; + } + p->gpu_va_ro = addr; + p->mapped = 1; + + __unlock_sema_sea(p->sema_sea); + + return 0; + +fail_unlock: + __unlock_sema_sea(p->sema_sea); +fail_unmap_sgt: + dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1, + DMA_BIDIRECTIONAL); +fail_free_sgt: + sg_free_table(p->rw_sg_table); +fail: + kfree(p->rw_sg_table); + p->rw_sg_table = NULL; + return err; +} + +/* + * Unmap a semaphore_pool. + */ +void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, + struct vm_gk20a *vm) +{ + struct gk20a_semaphore_int *hw_sema; + + kunmap(p->cpu_va); + + /* First the global RO mapping... 
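+	 * (done under the sea lock, since the RO sg_table is shared with the sea and could otherwise be remapped concurrently)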
*/ + __lock_sema_sea(p->sema_sea); + gk20a_gmmu_unmap(vm, p->gpu_va_ro, + p->sema_sea->map_size, gk20a_mem_flag_none); + p->ro_sg_table = NULL; + __unlock_sema_sea(p->sema_sea); + + /* And now the private RW mapping. */ + gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none); + p->gpu_va = 0; + + dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1, + DMA_BIDIRECTIONAL); + + sg_free_table(p->rw_sg_table); + kfree(p->rw_sg_table); + p->rw_sg_table = NULL; + + gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx); + list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list) + /* + * Make sure the mem addresses are all NULL so if this gets + * reused we will fault. + */ + hw_sema->value = NULL; +} + +/* + * Completely free a sempahore_pool. You should make sure this pool is not + * mapped otherwise there's going to be a memory leak. + */ static void gk20a_semaphore_pool_free(struct kref *ref) { struct gk20a_semaphore_pool *p = container_of(ref, struct gk20a_semaphore_pool, ref); - mutex_lock(&p->maps_mutex); - WARN_ON(!list_empty(&p->maps)); - mutex_unlock(&p->maps_mutex); - gk20a_gmmu_free(p->g, &p->mem); - gk20a_allocator_destroy(&p->alloc); + struct gk20a_semaphore_sea *s = p->sema_sea; + struct gk20a_semaphore_int *hw_sema, *tmp; + + WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table); + + __lock_sema_sea(s); + list_del(&p->pool_list_entry); + clear_bit(p->page_idx, s->pools_alloced); + s->page_count--; + __unlock_sema_sea(s); + + list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list) + kfree(hw_sema); + kfree(p); } -static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) +void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) { kref_get(&p->ref); } @@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p) kref_put(&p->ref, gk20a_semaphore_pool_free); } -static struct gk20a_semaphore_pool_map * -gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm) +/* + * Get the address for a semaphore_pool - if global is true then return the + * global RO address instead of the RW address owned by the semaphore's VM. + */ +u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global) { - struct gk20a_semaphore_pool_map *map, *found = NULL; - list_for_each_entry(map, &p->maps, list) { - if (map->vm == vm) { - found = map; - break; - } - } - return found; + if (!global) + return p->gpu_va; + + return p->gpu_va_ro + (PAGE_SIZE * p->page_idx); } -int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm, - enum gk20a_mem_rw_flag rw_flag) +static int __gk20a_init_hw_sema(struct channel_gk20a *ch) { - struct gk20a_semaphore_pool_map *map; + int hw_sema_idx; + int ret = 0; + struct gk20a_semaphore_int *hw_sema; + struct gk20a_semaphore_pool *p = ch->vm->sema_pool; - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) - return -ENOMEM; - map->vm = vm; - map->rw_flag = rw_flag; - map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size, - 0/*uncached*/, rw_flag, - false); - if (!map->gpu_va) { - kfree(map); - return -ENOMEM; + BUG_ON(!p); + + mutex_lock(&p->pool_lock); + + /* Find an available HW semaphore. 
*/ + hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced, + PAGE_SIZE / SEMAPHORE_SIZE); + if (hw_sema_idx < 0) { + ret = hw_sema_idx; + goto fail; } - gk20a_vm_get(vm); - mutex_lock(&p->maps_mutex); - WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm)); - list_add(&map->list, &p->maps); - mutex_unlock(&p->maps_mutex); + hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL); + if (!hw_sema) { + ret = -ENOMEM; + goto fail_free_idx; + } + + ch->hw_sema = hw_sema; + hw_sema->ch = ch; + hw_sema->p = p; + hw_sema->idx = hw_sema_idx; + hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx; + atomic_set(&hw_sema->next_value, 0); + hw_sema->value = p->cpu_va + hw_sema->offset; + writel(0, hw_sema->value); + + list_add(&hw_sema->hw_sema_list, &p->hw_semas); + + mutex_unlock(&p->pool_lock); + return 0; + +fail_free_idx: + clear_bit(hw_sema_idx, p->semas_alloced); +fail: + mutex_unlock(&p->pool_lock); + return ret; } -void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm) -{ - struct gk20a_semaphore_pool_map *map; - WARN_ON(!vm); - - mutex_lock(&p->maps_mutex); - map = gk20a_semaphore_pool_find_map_locked(p, vm); - if (map) { - gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag); - gk20a_vm_put(vm); - list_del(&map->list); - kfree(map); - } - mutex_unlock(&p->maps_mutex); -} - -u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm) -{ - struct gk20a_semaphore_pool_map *map; - u64 gpu_va = 0; - - mutex_lock(&p->maps_mutex); - map = gk20a_semaphore_pool_find_map_locked(p, vm); - if (map) - gpu_va = map->gpu_va; - mutex_unlock(&p->maps_mutex); - - return gpu_va; -} - -struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) +/* + * Allocate a semaphore from the passed pool. + * + * Since semaphores are ref-counted there's no explicit free for external code + * to use. When the ref-count hits 0 the internal free will happen. + */ +struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch) { struct gk20a_semaphore *s; + int ret; + + if (!ch->hw_sema) { + ret = __gk20a_init_hw_sema(ch); + if (ret) + return ERR_PTR(ret); + } s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return NULL; - s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); - if (!s->offset) { - gk20a_err(dev_from_gk20a(pool->g), - "failed to allocate semaphore"); - kfree(s); - return NULL; - } - - gk20a_semaphore_pool_get(pool); - s->pool = pool; - kref_init(&s->ref); - /* Initially acquired. */ - gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0); - gk20a_dbg_info("created semaphore offset=%d, value=%d", - s->offset, - gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset)); + s->hw_sema = ch->hw_sema; + atomic_set(&s->value, 0); + + /* + * Take a ref on the pool so that we can keep this pool alive for + * as long as this semaphore is alive. 
+ */ + gk20a_semaphore_pool_get(s->hw_sema->p); + return s; } @@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref) struct gk20a_semaphore *s = container_of(ref, struct gk20a_semaphore, ref); - gk20a_bfree(&s->pool->alloc, s->offset); - gk20a_semaphore_pool_put(s->pool); + gk20a_semaphore_pool_put(s->hw_sema->p); + kfree(s); } diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h index 1f12e262b..58081b56f 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h @@ -15,17 +15,128 @@ #define SEMAPHORE_GK20A_H #include -#include "gk20a_allocator.h" -#include "mm_gk20a.h" +#include +#include + +#include "gk20a.h" +#include "mm_gk20a.h" +#include "channel_gk20a.h" + +/* + * Max number of channels that can be used is 512. This of course needs to be + * fixed to be dynamic but still fast. + */ +#define SEMAPHORE_POOL_COUNT 512 +#define SEMAPHORE_SIZE 16 +#define SEMAPHORE_SEA_GROWTH_RATE 32 + +struct gk20a_semaphore_sea; + +/* + * Underlying semaphore data structure. This semaphore can be shared amongst + * other semaphore instances. + */ +struct gk20a_semaphore_int { + int idx; /* Semaphore index. */ + u32 offset; /* Offset into the pool. */ + atomic_t next_value; /* Next available value. */ + u32 *value; /* Current value (access w/ readl()). */ + u32 nr_incrs; /* Number of increments programmed. */ + struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */ + struct channel_gk20a *ch; /* Channel that owns this sema. */ + struct list_head hw_sema_list; /* List of HW semaphores. */ +}; + +/* + * A semaphore which the rest of the driver actually uses. This consists of a + * pointer to a real semaphore and a value to wait for. This allows one physical + * semaphore to be shared among an essentially infinite number of submits. + */ +struct gk20a_semaphore { + struct gk20a_semaphore_int *hw_sema; + + atomic_t value; + int incremented; -/* A memory pool for holding semaphores. */ -struct gk20a_semaphore_pool { - struct mem_desc mem; - struct gk20a *g; - struct list_head maps; - struct mutex maps_mutex; struct kref ref; - struct gk20a_allocator alloc; +}; + +/* + * A semaphore pool. Each address space will own exactly one of these. + */ +struct gk20a_semaphore_pool { + struct page *page; /* This pool's page of memory */ + struct list_head pool_list_entry; /* Node for list of pools. */ + void *cpu_va; /* CPU access to the pool. */ + u64 gpu_va; /* GPU access to the pool. */ + u64 gpu_va_ro; /* GPU access to the pool. */ + int page_idx; /* Index into sea bitmap. */ + + struct list_head hw_semas; /* List of HW semas. */ + DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE); + + struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */ + + struct mutex pool_lock; + + /* + * This is the address spaces's personal RW table. Other channels will + * ultimately map this page as RO. + */ + struct sg_table *rw_sg_table; + + /* + * This is to keep track of whether the pool has had its sg_table + * updated during sea resizing. + */ + struct sg_table *ro_sg_table; + + int mapped; + + /* + * Sometimes a channel can be released before other channels are + * done waiting on it. This ref count ensures that the pool doesn't + * go away until all semaphores using this pool are cleaned up first. + */ + struct kref ref; +}; + +/* + * A sea of semaphores pools. Each pool is owned by a single VM. Since multiple + * channels can share a VM each channel gets it's own HW semaphore from the + * pool. 
Channels then allocate regular semaphores - basically just a value that + * signifies when a particular job is done. + */ +struct gk20a_semaphore_sea { + struct list_head pool_list; /* List of pools in this sea. */ + struct gk20a *gk20a; + + size_t size; /* Number of pages available. */ + u64 gpu_va; /* GPU virtual address of sema sea. */ + u64 map_size; /* Size of the mapping. */ + + /* + * TODO: + * List of pages that we use to back the pools. The number of pages + * can grow dynamically since allocating 512 pages for all channels at + * once would be a tremendous waste. + */ + int page_count; /* Pages allocated to pools. */ + + struct sg_table *ro_sg_table; + /* + struct page *pages[SEMAPHORE_POOL_COUNT]; + */ + + struct mem_desc sea_mem; + + /* + * Can't use a regular allocator here since the full range of pools are + * not always allocated. Instead just use a bitmap. + */ + DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT); + + struct mutex sea_lock; /* Lock alloc/free calls. */ }; enum gk20a_mem_rw_flag { @@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag { gk20a_mem_flag_write_only = 2, }; -/* A semaphore pool can be mapped to multiple GPU address spaces. */ -struct gk20a_semaphore_pool_map { - u64 gpu_va; - enum gk20a_mem_rw_flag rw_flag; - struct vm_gk20a *vm; - struct list_head list; -}; +/* + * Semaphore sea functions. + */ +struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a); +int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea, + struct vm_gk20a *vm); +void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea, + struct vm_gk20a *vm); +struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g); -/* A semaphore that lives inside a semaphore pool. */ -struct gk20a_semaphore { - struct gk20a_semaphore_pool *pool; - /* - * value exists within the pool's memory at the specified offset. - * 0=acquired, 1=released. - */ - u32 offset; /* byte offset within pool */ - struct kref ref; -}; +/* + * Semaphore pool functions. + */ +struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc( + struct gk20a_semaphore_sea *sea); +int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool, + struct vm_gk20a *vm); +void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool, + struct vm_gk20a *vm); +u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global); +void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p); +void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p); -/* Create a semaphore pool that can hold at most 'capacity' semaphores. */ -struct gk20a_semaphore_pool * -gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name, - size_t capacity); -void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *); -int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *, - struct vm_gk20a *, - enum gk20a_mem_rw_flag); -void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *, - struct vm_gk20a *); -u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *, - struct vm_gk20a *); +/* + * Semaphore functions. + */ +struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch); +void gk20a_semaphore_put(struct gk20a_semaphore *s); +void gk20a_semaphore_get(struct gk20a_semaphore *s); -/* Allocate a semaphore from the semaphore pool. The newly allocated - * semaphore will be in acquired state (value=0). 
*/ -struct gk20a_semaphore * -gk20a_semaphore_alloc(struct gk20a_semaphore_pool *); -void gk20a_semaphore_put(struct gk20a_semaphore *); -void gk20a_semaphore_get(struct gk20a_semaphore *); - -static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s, - struct vm_gk20a *vm) +/* + * Return the address of a specific semaphore. + * + * Don't call this on a semaphore you don't own - the VA returned will make no + * sense in your specific channel's VM. + */ +static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s) { - return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset; + return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) + + s->hw_sema->offset; +} + +/* + * Get the global RO address for the semaphore. Can be called on any semaphore + * regardless of whether you own it. + */ +static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s) +{ + return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) + + s->hw_sema->offset; +} + +static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema) +{ + return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) + + hw_sema->offset; +} + +/* + * TODO: handle wrap around... Hmm, how to do this? + */ +static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s) +{ + u32 sema_val = readl(s->hw_sema->value); + + /* + * If the underlying semaphore value is greater than or equal to + * the value of the semaphore then the semaphore has been signaled + * (a.k.a. released). + */ + return sema_val >= atomic_read(&s->value); } static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s) { - u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset); - - /* When often block on value reaching a certain threshold. We must make - * sure that if we get unblocked, we haven't read anything too early. */ - smp_rmb(); - return v == 0; + return !gk20a_semaphore_is_released(s); } +/* + * Read the underlying value from a semaphore. + */ +static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s) +{ + return readl(s->hw_sema->value); +} + +static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s) +{ + return atomic_read(&s->value); +} + +static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s) +{ + return atomic_read(&s->hw_sema->next_value); +} + +/* + * Note - if you call this then any prior semaphores will also be released. + */ static inline void gk20a_semaphore_release(struct gk20a_semaphore *s) { - smp_wmb(); - gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1); + u32 current_val; + u32 val = gk20a_semaphore_get_value(s); + int attempts = 0; + + /* + * Wait until the sema value is 1 less than the write value. That + * way this function is essentially an increment. + * + * TODO: tune the wait a little better. + */ + while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) { + msleep(100); + attempts += 1; + if (attempts > 100) { + WARN(1, "Stall on sema release!"); + return; + } + } + + /* + * If the semaphore has already passed the value we would write then + * this is really just a NO-OP. + */ + if (current_val >= val) + return; + + writel(val, s->hw_sema->value); +} + +/* + * Configure a software based increment on this semaphore. This is useful for + * when we want the GPU to wait on a SW event before processing a channel. + * Another way to describe this is when the GPU needs to wait on a SW pre-fence. + * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which + * then allows the GPU to continue. 
+ *
+ * Also used to prep a semaphore for an INCR by the GPU.
+ */
+static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
+{
+	BUG_ON(s->incremented);
+
+	atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
+	s->incremented = 1;
+}
 #endif
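For reference, the intended CPU-side lifecycle of the new semaphores is sketched below. This is an illustrative example only, not part of the patch: the helper name is made up, error handling is abbreviated, and it assumes ch is a bound channel whose VM already has a sema_pool (i.e. gk20a_init_sema_pool() ran for that address space).

/*
 * Illustrative sketch only - not part of the patch.
 */
static int example_sema_lifecycle(struct channel_gk20a *ch)
{
	struct gk20a_semaphore *s;

	/* Lazily creates ch->hw_sema the first time this channel allocates. */
	s = gk20a_semaphore_alloc(ch);
	if (IS_ERR_OR_NULL(s))
		return s ? PTR_ERR(s) : -ENOMEM;

	/* Reserve the next value on the underlying HW semaphore. */
	gk20a_semaphore_incr(s);

	/*
	 * SW release: waits for any earlier releases to land, then writes
	 * the reserved value into the HW semaphore slot.
	 */
	gk20a_semaphore_release(s);

	WARN_ON(!gk20a_semaphore_is_released(s));

	/* Drop our ref; the pool stays pinned while any sema still uses it. */
	gk20a_semaphore_put(s);
	return 0;
}

A GPU-side release follows the same pattern, except that add_sema_cmd() performs the gk20a_semaphore_incr() reservation itself and emits the release method into the pushbuffer instead of calling gk20a_semaphore_release() from the CPU.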