mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 10:34:43 +03:00
gpu: nvgpu: Revamp semaphore support
Revamp the support the nvgpu driver has for semaphores. The original problem with nvgpu's semaphore support is that it required a SW based wait for every semaphore release. This was because for every fence that gk20a_channel_semaphore_wait_fd() waited on a new semaphore was created. This semaphore would then get released by SW when the fence signaled. This meant that for every release there was necessarily a sync_fence_wait_async() call which could block. The latency of this SW wait was enough to cause massive degradation in performance. To fix this a fast path was implemented. When a fence is passed to gk20a_channel_semaphore_wait_fd() that is backed by a GPU semaphore a semaphore acquire is directly used to block the GPU. No longer is a sync_fence_wait_async() performed nor is there an extra semaphore created. To implement this fast path the semaphore memory had to be shared between channels. Previously since a new semaphore was created every time through gk20a_channel_semaphore_wait_fd() what address space a semaphore was mapped into was irrelevant. However, when using the fast path a semaphore may be released on one address space but acquired in another. Sharing the semaphore memory was done by making a fixed GPU mapping in all channels. This mapping points to the semaphore memory (the so called semaphore sea). This global fixed mapping is read-only to make sure no semaphores can be incremented (i.e released) by a malicious channel. Each channel then gets a RW mapping of its own semaphore. This way a channel may only acquire other channel's semaphores but may both acquire and release its own semaphore. The gk20a fence code was updated to allow introspection of the GPU backed fences. This allows detection of when the fast path can be taken. If the fast path cannot be used (for example when a fence is sync-pt backed) the original slow path is still present.
This gets used when the GPU needs to wait on an event from something which only understands how to use sync-pts. Bug 1732449 JIRA DNVGPU-12 Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de Signed-off-by: Alex Waterman <alexw@nvidia.com> Reviewed-on: http://git-master/r/1133792 Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
committed by
Terje Bergstrom
parent
b30990ea6d
commit
dfd5ec53fc
@@ -1002,6 +1002,9 @@ unbind:
|
||||
|
||||
mutex_unlock(&g->dbg_sessions_lock);
|
||||
|
||||
/* Make sure that when the ch is re-opened it will get a new HW sema. */
|
||||
ch->hw_sema = NULL;
|
||||
|
||||
/* make sure we catch accesses of unopened channels in case
|
||||
* there's non-refcounted channel pointers hanging around */
|
||||
ch->g = NULL;
|
||||
|
||||
@@ -108,6 +108,8 @@ struct channel_gk20a {
|
||||
atomic_t ref_count;
|
||||
wait_queue_head_t ref_count_dec_wq;
|
||||
|
||||
struct gk20a_semaphore_int *hw_sema;
|
||||
|
||||
int hw_chid;
|
||||
bool wdt_enabled;
|
||||
bool bound;
|
||||
|
||||
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
|
||||
}
|
||||
#endif
|
||||
|
||||
static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
|
||||
u64 sema, u32 payload, bool acquire, bool wfi)
|
||||
static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
|
||||
struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
|
||||
int cmd_size, bool acquire, bool wfi)
|
||||
{
|
||||
u32 off = cmd->off;
|
||||
u64 va;
|
||||
|
||||
/*
|
||||
* RO for acquire (since we just need to read the mem) and RW for
|
||||
* release since we will need to write back to the semaphore memory.
|
||||
*/
|
||||
va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
|
||||
gk20a_semaphore_gpu_rw_va(s);
|
||||
|
||||
/*
|
||||
 * If the op is not an acquire (so therefore a release) we should
|
||||
* incr the underlying sema next_value.
|
||||
*/
|
||||
if (!acquire)
|
||||
gk20a_semaphore_incr(s);
|
||||
|
||||
/* semaphore_a */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
|
||||
/* offset_upper */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff);
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
|
||||
/* semaphore_b */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
|
||||
/* offset */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff);
|
||||
/* semaphore_c */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
|
||||
/* payload */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, payload);
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
|
||||
|
||||
if (acquire) {
|
||||
/* semaphore_c */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
|
||||
/* payload */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++,
|
||||
gk20a_semaphore_get_value(s));
|
||||
/* semaphore_d */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
|
||||
/* operation: acq_geq, switch_en */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
|
||||
} else {
|
||||
/* semaphore_c */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
|
||||
/* payload */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++,
|
||||
gk20a_semaphore_get_value(s));
|
||||
/* semaphore_d */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
|
||||
/* operation: release, wfi */
|
||||
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
|
||||
/* ignored */
|
||||
gk20a_mem_wr32(g, cmd->mem, off++, 0);
|
||||
}
|
||||
return off - cmd->off;
|
||||
}
|
||||
|
||||
static int gk20a_channel_semaphore_wait_syncpt(
|
||||
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
/*
|
||||
* UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18.
|
||||
* But since there's no API for getting the underlying sync_pts we have to do
|
||||
* some conditional compilation.
|
||||
*/
|
||||
#ifdef CONFIG_SYNC
|
||||
static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
|
||||
{
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
|
||||
struct sync_pt *pt;
|
||||
|
||||
pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
|
||||
return gk20a_sync_pt_inst_get_sema(pt);
|
||||
#else
|
||||
return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt a fast path for waiting on a sync_fence. Basically if the passed
|
||||
* sync_fence is backed by a gk20a_semaphore then there's no reason to go
|
||||
* through the rigmarole of setting up a separate semaphore which waits on an
|
||||
* interrupt from the GPU and then triggers a worker thread to execute a SW
|
||||
* based semaphore release. Instead just have the GPU wait on the same semaphore
|
||||
* that is going to be incremented by the GPU.
|
||||
*
|
||||
* This function returns 2 possible values: -ENODEV or 0 on success. In the case
|
||||
* of -ENODEV the fastpath cannot be taken due to the fence not being backed by
|
||||
* a GPU semaphore.
|
||||
*/
|
||||
static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
|
||||
struct sync_fence *fence,
|
||||
struct priv_cmd_entry **wait_cmd,
|
||||
struct gk20a_semaphore **fp_sema)
|
||||
{
|
||||
struct gk20a_semaphore *sema;
|
||||
int err;
|
||||
|
||||
if (!gk20a_is_sema_backed_sync_fence(fence))
|
||||
return -ENODEV;
|
||||
|
||||
sema = sema_from_sync_fence(fence);
|
||||
|
||||
/*
|
||||
* If there's no underlying sema then that means the underlying sema has
|
||||
* already signaled.
|
||||
*/
|
||||
if (!sema) {
|
||||
*fp_sema = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
gk20a_semaphore_get(sema);
|
||||
BUG_ON(!atomic_read(&sema->value));
|
||||
add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
|
||||
|
||||
/*
|
||||
* Make sure that gk20a_channel_semaphore_wait_fd() can create another
|
||||
* fence with the underlying semaphore.
|
||||
*/
|
||||
*fp_sema = sema;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int gk20a_channel_semaphore_wait_fd(
|
||||
struct gk20a_channel_sync *s, int fd,
|
||||
struct priv_cmd_entry **entry,
|
||||
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
|
||||
container_of(s, struct gk20a_channel_semaphore, ops);
|
||||
struct channel_gk20a *c = sema->c;
|
||||
#ifdef CONFIG_SYNC
|
||||
struct gk20a_semaphore *fp_sema;
|
||||
struct sync_fence *sync_fence;
|
||||
struct priv_cmd_entry *wait_cmd = NULL;
|
||||
struct wait_fence_work *w;
|
||||
int written;
|
||||
int err, ret;
|
||||
u64 va;
|
||||
struct wait_fence_work *w = NULL;
|
||||
int err, ret, status;
|
||||
|
||||
sync_fence = gk20a_sync_fence_fdget(fd);
|
||||
if (!sync_fence)
|
||||
return -EINVAL;
|
||||
|
||||
w = kzalloc(sizeof(*w), GFP_KERNEL);
|
||||
if (!w) {
|
||||
err = -ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
|
||||
w->ch = c;
|
||||
w->sema = gk20a_semaphore_alloc(sema->pool);
|
||||
if (!w->sema) {
|
||||
gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
|
||||
err = -ENOMEM;
|
||||
goto fail;
|
||||
ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
|
||||
if (ret == 0) {
|
||||
if (fp_sema)
|
||||
*fence = gk20a_fence_from_semaphore(sema->timeline,
|
||||
fp_sema,
|
||||
&c->semaphore_wq,
|
||||
NULL, false);
|
||||
else
|
||||
/*
|
||||
* Allocate an empty fence. It will instantly return
|
||||
* from gk20a_fence_wait().
|
||||
*/
|
||||
*fence = gk20a_alloc_fence(NULL, NULL, false);
|
||||
|
||||
sync_fence_put(sync_fence);
|
||||
goto skip_slow_path;
|
||||
}
|
||||
|
||||
/* worker takes one reference */
|
||||
gk20a_semaphore_get(w->sema);
|
||||
/* If the fence has signaled there is no reason to wait on it. */
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
|
||||
status = sync_fence->status;
|
||||
#else
|
||||
status = atomic_read(&sync_fence->status);
|
||||
#endif
|
||||
if (status) {
|
||||
sync_fence_put(sync_fence);
|
||||
goto skip_slow_path;
|
||||
}
|
||||
|
||||
err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
|
||||
if (err) {
|
||||
gk20a_err(dev_from_gk20a(c->g),
|
||||
"not enough priv cmd buffer space");
|
||||
goto fail;
|
||||
sync_fence_put(sync_fence);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
va = gk20a_semaphore_gpu_va(w->sema, c->vm);
|
||||
/* GPU unblocked when the semaphore value becomes 1. */
|
||||
written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
|
||||
w = kzalloc(sizeof(*w), GFP_KERNEL);
|
||||
if (!w) {
|
||||
err = -ENOMEM;
|
||||
goto fail_free_cmdbuf;
|
||||
}
|
||||
|
||||
sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
|
||||
w->ch = c;
|
||||
w->sema = gk20a_semaphore_alloc(c);
|
||||
if (!w->sema) {
|
||||
gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
|
||||
err = -ENOMEM;
|
||||
goto fail_free_worker;
|
||||
}
|
||||
|
||||
/* worker takes one reference */
|
||||
gk20a_semaphore_get(w->sema);
|
||||
gk20a_semaphore_incr(w->sema);
|
||||
|
||||
/* GPU unblocked when the semaphore value increments. */
|
||||
add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
|
||||
|
||||
WARN_ON(written != wait_cmd->size);
|
||||
ret = sync_fence_wait_async(sync_fence, &w->waiter);
|
||||
|
||||
/*
|
||||
* If the sync_fence has already signaled then the above async_wait
|
||||
* will never trigger. This causes the semaphore release op to never
|
||||
* happen which, in turn, hangs the GPU. That's bad. So let's just
|
||||
* do the semaphore_release right now.
|
||||
* do the gk20a_semaphore_release() right now.
|
||||
*/
|
||||
if (ret == 1)
|
||||
if (ret == 1) {
|
||||
sync_fence_put(sync_fence);
|
||||
gk20a_semaphore_release(w->sema);
|
||||
gk20a_semaphore_put(w->sema);
|
||||
}
|
||||
|
||||
/* XXX - this fixes an actual bug, we need to hold a ref to this
|
||||
semaphore while the job is in flight. */
|
||||
*fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
|
||||
&c->semaphore_wq,
|
||||
NULL, false);
|
||||
|
||||
skip_slow_path:
|
||||
*entry = wait_cmd;
|
||||
return 0;
|
||||
fail:
|
||||
|
||||
fail_free_worker:
|
||||
if (w && w->sema)
|
||||
gk20a_semaphore_put(w->sema);
|
||||
kfree(w);
|
||||
sync_fence_put(sync_fence);
|
||||
fail_free_cmdbuf:
|
||||
if (wait_cmd)
|
||||
gk20a_free_priv_cmdbuf(c, wait_cmd);
|
||||
return err;
|
||||
#else
|
||||
gk20a_err(dev_from_gk20a(c->g),
|
||||
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
|
||||
struct gk20a_fence **fence,
|
||||
bool need_sync_fence)
|
||||
{
|
||||
u64 va;
|
||||
int incr_cmd_size;
|
||||
int written;
|
||||
struct priv_cmd_entry *incr_cmd = NULL;
|
||||
struct gk20a_channel_semaphore *sp =
|
||||
container_of(s, struct gk20a_channel_semaphore, ops);
|
||||
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
|
||||
struct gk20a_semaphore *semaphore;
|
||||
int err = 0;
|
||||
|
||||
semaphore = gk20a_semaphore_alloc(sp->pool);
|
||||
semaphore = gk20a_semaphore_alloc(c);
|
||||
if (!semaphore) {
|
||||
gk20a_err(dev_from_gk20a(c->g),
|
||||
"ran out of semaphores");
|
||||
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
|
||||
}
|
||||
|
||||
/* Release the completion semaphore. */
|
||||
va = gk20a_semaphore_gpu_va(semaphore, c->vm);
|
||||
written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
|
||||
WARN_ON(written != incr_cmd_size);
|
||||
add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
|
||||
|
||||
*fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
|
||||
&c->semaphore_wq,
|
||||
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
|
||||
{
|
||||
/* Don't put wfi cmd to this one since we're not returning
|
||||
* a fence to user space. */
|
||||
return __gk20a_channel_semaphore_incr(s, false /* no wfi */,
|
||||
NULL, entry, fence, need_sync_fence);
|
||||
return __gk20a_channel_semaphore_incr(s,
|
||||
false /* no wfi */,
|
||||
NULL,
|
||||
entry, fence, need_sync_fence);
|
||||
}
|
||||
|
||||
static int gk20a_channel_semaphore_incr_user(
|
||||
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
|
||||
container_of(s, struct gk20a_channel_semaphore, ops);
|
||||
if (sema->timeline)
|
||||
gk20a_sync_timeline_destroy(sema->timeline);
|
||||
if (sema->pool) {
|
||||
gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm);
|
||||
gk20a_semaphore_pool_put(sema->pool);
|
||||
}
|
||||
|
||||
/* The sema pool is cleaned up by the VM destroy. */
|
||||
sema->pool = NULL;
|
||||
|
||||
kfree(sema);
|
||||
}
|
||||
|
||||
static struct gk20a_channel_sync *
|
||||
gk20a_channel_semaphore_create(struct channel_gk20a *c)
|
||||
{
|
||||
int err;
|
||||
int asid = -1;
|
||||
struct gk20a_channel_semaphore *sema;
|
||||
char pool_name[20];
|
||||
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
|
||||
asid = c->vm->as_share->id;
|
||||
|
||||
sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
|
||||
sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024);
|
||||
if (!sema->pool)
|
||||
goto clean_up;
|
||||
|
||||
/* Map the semaphore pool to the channel vm. Map as read-write to the
|
||||
* owner channel (all other channels should map as read only!). */
|
||||
err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
|
||||
if (err)
|
||||
goto clean_up;
|
||||
sema->pool = c->vm->sema_pool;
|
||||
|
||||
#ifdef CONFIG_SYNC
|
||||
sema->timeline = gk20a_sync_timeline_create(
|
||||
"gk20a_ch%d_as%d", c->hw_chid, asid);
|
||||
if (!sema->timeline)
|
||||
goto clean_up;
|
||||
if (!sema->timeline) {
|
||||
gk20a_channel_semaphore_destroy(&sema->ops);
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
atomic_set(&sema->ops.refcount, 0);
|
||||
sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
|
||||
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
|
||||
sema->ops.destroy = gk20a_channel_semaphore_destroy;
|
||||
|
||||
return &sema->ops;
|
||||
clean_up:
|
||||
gk20a_channel_semaphore_destroy(&sema->ops);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)
|
||||
|
||||
@@ -155,8 +155,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore(
|
||||
|
||||
#ifdef CONFIG_SYNC
|
||||
sync_fence = gk20a_sync_fence_create(timeline, semaphore,
|
||||
dependency, "f-gk20a-0x%04x",
|
||||
semaphore->offset & 0xffff);
|
||||
dependency, "f-gk20a-0x%04x",
|
||||
gk20a_semaphore_gpu_ro_va(semaphore));
|
||||
if (!sync_fence)
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
@@ -738,6 +738,11 @@ struct gk20a {
|
||||
#endif
|
||||
struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
|
||||
|
||||
/*
|
||||
* A group of semaphore pools. One for each channel.
|
||||
*/
|
||||
struct gk20a_semaphore_sea *sema_sea;
|
||||
|
||||
/* held while manipulating # of debug/profiler sessions present */
|
||||
/* also prevents debug sessions from attaching until released */
|
||||
struct mutex dbg_sessions_lock;
|
||||
|
||||
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
|
||||
struct rb_node *node;
|
||||
|
||||
gk20a_dbg_fn("");
|
||||
|
||||
/*
|
||||
* Do this outside of the update_gmmu_lock since unmapping the semaphore
|
||||
 * pool involves unmapping a GMMU mapping which means acquiring the
|
||||
* update_gmmu_lock.
|
||||
*/
|
||||
if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
|
||||
gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
|
||||
gk20a_semaphore_pool_put(vm->sema_pool);
|
||||
}
|
||||
|
||||
mutex_lock(&vm->update_gmmu_lock);
|
||||
|
||||
/* TBD: add a flag here for the unmap code to recognize teardown
|
||||
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
|
||||
{.update_entry = NULL}
|
||||
};
|
||||
|
||||
/*
|
||||
* Initialize a semaphore pool. Just return successfully if we do not need
|
||||
* semaphores (i.e when sync-pts are active).
|
||||
*/
|
||||
int gk20a_init_sema_pool(struct vm_gk20a *vm)
|
||||
{
|
||||
struct gk20a_semaphore_sea *sema_sea;
|
||||
struct mm_gk20a *mm = vm->mm;
|
||||
struct gk20a *g = mm->g;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* Don't waste the memory on semaphores if we don't need them.
|
||||
*/
|
||||
if (gk20a_platform_has_syncpoints(g->dev))
|
||||
return 0;
|
||||
|
||||
if (vm->sema_pool)
|
||||
return 0;
|
||||
|
||||
sema_sea = gk20a_semaphore_sea_create(g);
|
||||
if (!sema_sea)
|
||||
return -ENOMEM;
|
||||
|
||||
vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
|
||||
if (!vm->sema_pool) {
|
||||
gk20a_vm_put(vm);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a chunk of GPU VA space for mapping the semaphores. We will
|
||||
* do a fixed alloc in the kernel VM so that all channels have the same
|
||||
* RO address range for the semaphores.
|
||||
*
|
||||
* !!! TODO: cleanup.
|
||||
*/
|
||||
sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
|
||||
vm->va_limit -
|
||||
mm->channel.kernel_size,
|
||||
512 * PAGE_SIZE);
|
||||
if (!sema_sea->gpu_va) {
|
||||
gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
|
||||
gk20a_vm_put(vm);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
|
||||
if (err) {
|
||||
gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
|
||||
gk20a_bfree(&vm->vma[gmmu_page_size_small],
|
||||
vm->sema_pool->gpu_va);
|
||||
gk20a_vm_put(vm);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int gk20a_init_vm(struct mm_gk20a *mm,
|
||||
struct vm_gk20a *vm,
|
||||
u32 big_page_size,
|
||||
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
|
||||
vm->big_pages = big_pages;
|
||||
|
||||
vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
|
||||
|
||||
vm->userspace_managed = userspace_managed;
|
||||
|
||||
vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
|
||||
vm->big_page_size);
|
||||
|
||||
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
|
||||
kref_init(&vm->ref);
|
||||
INIT_LIST_HEAD(&vm->reserved_va_list);
|
||||
|
||||
/*
|
||||
* This is only necessary for channel address spaces. The best way to
|
||||
* distinguish channel address spaces from other address spaces is by
|
||||
* size - if the address space is 4GB or less, it's not a channel.
|
||||
*/
|
||||
if (vm->va_limit > SZ_4G) {
|
||||
err = gk20a_init_sema_pool(vm);
|
||||
if (err)
|
||||
goto clean_up_big_allocator;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
clean_up_big_allocator:
|
||||
|
||||
@@ -287,6 +287,11 @@ struct vm_gk20a {
|
||||
/* if non-NULL, kref_put will use this batch when
|
||||
unmapping. Must hold vm->update_gmmu_lock. */
|
||||
struct vm_gk20a_mapping_batch *kref_put_batch;
|
||||
|
||||
/*
|
||||
* Each address space needs to have a semaphore pool.
|
||||
*/
|
||||
struct gk20a_semaphore_pool *sema_pool;
|
||||
};
|
||||
|
||||
struct gk20a;
|
||||
|
||||
@@ -15,63 +15,284 @@
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include "semaphore_gk20a.h"
|
||||
#define pr_fmt(fmt) "gpu_sema: " fmt
|
||||
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
#include "gk20a.h"
|
||||
#include "mm_gk20a.h"
|
||||
#include "semaphore_gk20a.h"
|
||||
|
||||
static const int SEMAPHORE_SIZE = 16;
|
||||
#define __lock_sema_sea(s) \
|
||||
do { \
|
||||
mutex_lock(&s->sea_lock); \
|
||||
} while (0)
|
||||
|
||||
struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g,
|
||||
const char *unique_name, size_t capacity)
|
||||
#define __unlock_sema_sea(s) \
|
||||
do { \
|
||||
mutex_unlock(&s->sea_lock); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Return the sema_sea pointer.
|
||||
*/
|
||||
struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g)
|
||||
{
|
||||
struct gk20a_semaphore_pool *p;
|
||||
p = kzalloc(sizeof(*p), GFP_KERNEL);
|
||||
if (!p)
|
||||
return g->sema_sea;
|
||||
}
|
||||
|
||||
static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea)
|
||||
{
|
||||
int ret = 0;
|
||||
struct gk20a *gk20a = sea->gk20a;
|
||||
|
||||
__lock_sema_sea(sea);
|
||||
|
||||
ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING,
|
||||
PAGE_SIZE * SEMAPHORE_POOL_COUNT,
|
||||
&sea->sea_mem);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sea->ro_sg_table = sea->sea_mem.sgt;
|
||||
sea->size = SEMAPHORE_POOL_COUNT;
|
||||
sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE;
|
||||
|
||||
out:
|
||||
__unlock_sema_sea(sea);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create the semaphore sea. Only create it once - subsequent calls to this will
|
||||
* return the originally created sea pointer.
|
||||
*/
|
||||
struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g)
|
||||
{
|
||||
if (g->sema_sea)
|
||||
return g->sema_sea;
|
||||
|
||||
g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL);
|
||||
if (!g->sema_sea)
|
||||
return NULL;
|
||||
|
||||
kref_init(&p->ref);
|
||||
INIT_LIST_HEAD(&p->maps);
|
||||
mutex_init(&p->maps_mutex);
|
||||
p->g = g;
|
||||
g->sema_sea->size = 0;
|
||||
g->sema_sea->page_count = 0;
|
||||
g->sema_sea->gk20a = g;
|
||||
INIT_LIST_HEAD(&g->sema_sea->pool_list);
|
||||
mutex_init(&g->sema_sea->sea_lock);
|
||||
|
||||
/* Alloc one 4k page of semaphore per channel. */
|
||||
if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE),
|
||||
&p->mem))
|
||||
goto clean_up;
|
||||
if (__gk20a_semaphore_sea_grow(g->sema_sea))
|
||||
goto cleanup;
|
||||
|
||||
/* Sacrifice one semaphore in the name of returning error codes. */
|
||||
if (gk20a_allocator_init(&p->alloc, unique_name,
|
||||
SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE,
|
||||
SEMAPHORE_SIZE))
|
||||
goto clean_up;
|
||||
return g->sema_sea;
|
||||
|
||||
gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va,
|
||||
(u64)sg_dma_address(p->mem.sgt->sgl),
|
||||
(u64)sg_phys(p->mem.sgt->sgl));
|
||||
return p;
|
||||
|
||||
clean_up:
|
||||
if (p->mem.size)
|
||||
gk20a_gmmu_free(p->g, &p->mem);
|
||||
kfree(p);
|
||||
cleanup:
|
||||
kfree(g->sema_sea);
|
||||
g->sema_sea = NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len)
|
||||
{
|
||||
unsigned long idx = find_first_zero_bit(bitmap, len);
|
||||
|
||||
if (idx == len)
|
||||
return -ENOSPC;
|
||||
|
||||
set_bit(idx, bitmap);
|
||||
|
||||
return (int)idx;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a pool from the sea.
|
||||
*/
|
||||
struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
|
||||
struct gk20a_semaphore_sea *sea)
|
||||
{
|
||||
struct gk20a_semaphore_pool *p;
|
||||
unsigned long page_idx;
|
||||
int err = 0;
|
||||
|
||||
p = kzalloc(sizeof(*p), GFP_KERNEL);
|
||||
if (!p)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
__lock_sema_sea(sea);
|
||||
|
||||
page_idx = __semaphore_bitmap_alloc(sea->pools_alloced,
|
||||
SEMAPHORE_POOL_COUNT);
|
||||
if (page_idx < 0) {
|
||||
err = page_idx;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
p->page = sea->sea_mem.pages[page_idx];
|
||||
p->ro_sg_table = sea->ro_sg_table;
|
||||
p->page_idx = page_idx;
|
||||
p->sema_sea = sea;
|
||||
INIT_LIST_HEAD(&p->hw_semas);
|
||||
kref_init(&p->ref);
|
||||
mutex_init(&p->pool_lock);
|
||||
|
||||
sea->page_count++;
|
||||
list_add(&p->pool_list_entry, &sea->pool_list);
|
||||
__unlock_sema_sea(sea);
|
||||
|
||||
return p;
|
||||
|
||||
fail:
|
||||
__unlock_sema_sea(sea);
|
||||
kfree(p);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Map a pool into the passed vm's address space. This handles both the fixed
|
||||
* global RO mapping and the non-fixed private RW mapping.
|
||||
*/
|
||||
int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
|
||||
struct vm_gk20a *vm)
|
||||
{
|
||||
int ents, err = 0;
|
||||
u64 addr;
|
||||
|
||||
p->cpu_va = vmap(&p->page, 1, 0,
|
||||
pgprot_writecombine(PAGE_KERNEL));
|
||||
|
||||
/* First do the RW mapping. */
|
||||
p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL);
|
||||
if (!p->rw_sg_table)
|
||||
return -ENOMEM;
|
||||
|
||||
err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0,
|
||||
PAGE_SIZE, GFP_KERNEL);
|
||||
if (err) {
|
||||
err = -ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* Add IOMMU mapping... */
|
||||
ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
|
||||
DMA_BIDIRECTIONAL);
|
||||
if (ents != 1) {
|
||||
err = -ENOMEM;
|
||||
goto fail_free_sgt;
|
||||
}
|
||||
|
||||
/* Map into the GPU... Doesn't need to be fixed. */
|
||||
p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE,
|
||||
0, gk20a_mem_flag_none, false);
|
||||
if (!p->gpu_va) {
|
||||
err = -ENOMEM;
|
||||
goto fail_unmap_sgt;
|
||||
}
|
||||
|
||||
/*
|
||||
* And now the global mapping. Take the sea lock so that we don't race
|
||||
* with a concurrent remap.
|
||||
*/
|
||||
__lock_sema_sea(p->sema_sea);
|
||||
|
||||
BUG_ON(p->mapped);
|
||||
addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table,
|
||||
p->sema_sea->gpu_va, p->sema_sea->map_size,
|
||||
0,
|
||||
gk20a_mem_flag_read_only,
|
||||
false);
|
||||
if (!addr) {
|
||||
err = -ENOMEM;
|
||||
BUG();
|
||||
goto fail_unlock;
|
||||
}
|
||||
p->gpu_va_ro = addr;
|
||||
p->mapped = 1;
|
||||
|
||||
__unlock_sema_sea(p->sema_sea);
|
||||
|
||||
return 0;
|
||||
|
||||
fail_unlock:
|
||||
__unlock_sema_sea(p->sema_sea);
|
||||
fail_unmap_sgt:
|
||||
dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
|
||||
DMA_BIDIRECTIONAL);
|
||||
fail_free_sgt:
|
||||
sg_free_table(p->rw_sg_table);
|
||||
fail:
|
||||
kfree(p->rw_sg_table);
|
||||
p->rw_sg_table = NULL;
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unmap a semaphore_pool.
|
||||
*/
|
||||
void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
|
||||
struct vm_gk20a *vm)
|
||||
{
|
||||
struct gk20a_semaphore_int *hw_sema;
|
||||
|
||||
kunmap(p->cpu_va);
|
||||
|
||||
/* First the global RO mapping... */
|
||||
__lock_sema_sea(p->sema_sea);
|
||||
gk20a_gmmu_unmap(vm, p->gpu_va_ro,
|
||||
p->sema_sea->map_size, gk20a_mem_flag_none);
|
||||
p->ro_sg_table = NULL;
|
||||
__unlock_sema_sea(p->sema_sea);
|
||||
|
||||
/* And now the private RW mapping. */
|
||||
gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none);
|
||||
p->gpu_va = 0;
|
||||
|
||||
dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
|
||||
DMA_BIDIRECTIONAL);
|
||||
|
||||
sg_free_table(p->rw_sg_table);
|
||||
kfree(p->rw_sg_table);
|
||||
p->rw_sg_table = NULL;
|
||||
|
||||
gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx);
|
||||
list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list)
|
||||
/*
|
||||
* Make sure the mem addresses are all NULL so if this gets
|
||||
* reused we will fault.
|
||||
*/
|
||||
hw_sema->value = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
 * Completely free a semaphore_pool. You should make sure this pool is not
|
||||
* mapped otherwise there's going to be a memory leak.
|
||||
*/
|
||||
static void gk20a_semaphore_pool_free(struct kref *ref)
|
||||
{
|
||||
struct gk20a_semaphore_pool *p =
|
||||
container_of(ref, struct gk20a_semaphore_pool, ref);
|
||||
mutex_lock(&p->maps_mutex);
|
||||
WARN_ON(!list_empty(&p->maps));
|
||||
mutex_unlock(&p->maps_mutex);
|
||||
gk20a_gmmu_free(p->g, &p->mem);
|
||||
gk20a_allocator_destroy(&p->alloc);
|
||||
struct gk20a_semaphore_sea *s = p->sema_sea;
|
||||
struct gk20a_semaphore_int *hw_sema, *tmp;
|
||||
|
||||
WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table);
|
||||
|
||||
__lock_sema_sea(s);
|
||||
list_del(&p->pool_list_entry);
|
||||
clear_bit(p->page_idx, s->pools_alloced);
|
||||
s->page_count--;
|
||||
__unlock_sema_sea(s);
|
||||
|
||||
list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list)
|
||||
kfree(hw_sema);
|
||||
|
||||
kfree(p);
|
||||
}
|
||||
|
||||
static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
|
||||
void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
|
||||
{
|
||||
kref_get(&p->ref);
|
||||
}
|
||||
@@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p)
|
||||
kref_put(&p->ref, gk20a_semaphore_pool_free);
|
||||
}
|
||||
|
||||
static struct gk20a_semaphore_pool_map *
|
||||
gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p,
|
||||
struct vm_gk20a *vm)
|
||||
/*
|
||||
* Get the address for a semaphore_pool - if global is true then return the
|
||||
* global RO address instead of the RW address owned by the semaphore's VM.
|
||||
*/
|
||||
u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global)
|
||||
{
|
||||
struct gk20a_semaphore_pool_map *map, *found = NULL;
|
||||
list_for_each_entry(map, &p->maps, list) {
|
||||
if (map->vm == vm) {
|
||||
found = map;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return found;
|
||||
if (!global)
|
||||
return p->gpu_va;
|
||||
|
||||
return p->gpu_va_ro + (PAGE_SIZE * p->page_idx);
|
||||
}
|
||||
|
||||
int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
|
||||
struct vm_gk20a *vm,
|
||||
enum gk20a_mem_rw_flag rw_flag)
|
||||
static int __gk20a_init_hw_sema(struct channel_gk20a *ch)
|
||||
{
|
||||
struct gk20a_semaphore_pool_map *map;
|
||||
int hw_sema_idx;
|
||||
int ret = 0;
|
||||
struct gk20a_semaphore_int *hw_sema;
|
||||
struct gk20a_semaphore_pool *p = ch->vm->sema_pool;
|
||||
|
||||
map = kzalloc(sizeof(*map), GFP_KERNEL);
|
||||
if (!map)
|
||||
return -ENOMEM;
|
||||
map->vm = vm;
|
||||
map->rw_flag = rw_flag;
|
||||
map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size,
|
||||
0/*uncached*/, rw_flag,
|
||||
false);
|
||||
if (!map->gpu_va) {
|
||||
kfree(map);
|
||||
return -ENOMEM;
|
||||
BUG_ON(!p);
|
||||
|
||||
mutex_lock(&p->pool_lock);
|
||||
|
||||
/* Find an available HW semaphore. */
|
||||
hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced,
|
||||
PAGE_SIZE / SEMAPHORE_SIZE);
|
||||
if (hw_sema_idx < 0) {
|
||||
ret = hw_sema_idx;
|
||||
goto fail;
|
||||
}
|
||||
gk20a_vm_get(vm);
|
||||
|
||||
mutex_lock(&p->maps_mutex);
|
||||
WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm));
|
||||
list_add(&map->list, &p->maps);
|
||||
mutex_unlock(&p->maps_mutex);
|
||||
hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL);
|
||||
if (!hw_sema) {
|
||||
ret = -ENOMEM;
|
||||
goto fail_free_idx;
|
||||
}
|
||||
|
||||
ch->hw_sema = hw_sema;
|
||||
hw_sema->ch = ch;
|
||||
hw_sema->p = p;
|
||||
hw_sema->idx = hw_sema_idx;
|
||||
hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx;
|
||||
atomic_set(&hw_sema->next_value, 0);
|
||||
hw_sema->value = p->cpu_va + hw_sema->offset;
|
||||
writel(0, hw_sema->value);
|
||||
|
||||
list_add(&hw_sema->hw_sema_list, &p->hw_semas);
|
||||
|
||||
mutex_unlock(&p->pool_lock);
|
||||
|
||||
return 0;
|
||||
|
||||
fail_free_idx:
|
||||
clear_bit(hw_sema_idx, p->semas_alloced);
|
||||
fail:
|
||||
mutex_unlock(&p->pool_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
|
||||
struct vm_gk20a *vm)
|
||||
{
|
||||
struct gk20a_semaphore_pool_map *map;
|
||||
WARN_ON(!vm);
|
||||
|
||||
mutex_lock(&p->maps_mutex);
|
||||
map = gk20a_semaphore_pool_find_map_locked(p, vm);
|
||||
if (map) {
|
||||
gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag);
|
||||
gk20a_vm_put(vm);
|
||||
list_del(&map->list);
|
||||
kfree(map);
|
||||
}
|
||||
mutex_unlock(&p->maps_mutex);
|
||||
}
|
||||
|
||||
u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p,
|
||||
struct vm_gk20a *vm)
|
||||
{
|
||||
struct gk20a_semaphore_pool_map *map;
|
||||
u64 gpu_va = 0;
|
||||
|
||||
mutex_lock(&p->maps_mutex);
|
||||
map = gk20a_semaphore_pool_find_map_locked(p, vm);
|
||||
if (map)
|
||||
gpu_va = map->gpu_va;
|
||||
mutex_unlock(&p->maps_mutex);
|
||||
|
||||
return gpu_va;
|
||||
}
|
||||
|
||||
struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool)
|
||||
/*
|
||||
* Allocate a semaphore from the passed pool.
|
||||
*
|
||||
* Since semaphores are ref-counted there's no explicit free for external code
|
||||
* to use. When the ref-count hits 0 the internal free will happen.
|
||||
*/
|
||||
struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch)
|
||||
{
|
||||
struct gk20a_semaphore *s;
|
||||
int ret;
|
||||
|
||||
if (!ch->hw_sema) {
|
||||
ret = __gk20a_init_hw_sema(ch);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
s = kzalloc(sizeof(*s), GFP_KERNEL);
|
||||
if (!s)
|
||||
return NULL;
|
||||
|
||||
s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE);
|
||||
if (!s->offset) {
|
||||
gk20a_err(dev_from_gk20a(pool->g),
|
||||
"failed to allocate semaphore");
|
||||
kfree(s);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
gk20a_semaphore_pool_get(pool);
|
||||
s->pool = pool;
|
||||
|
||||
kref_init(&s->ref);
|
||||
/* Initially acquired. */
|
||||
gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0);
|
||||
gk20a_dbg_info("created semaphore offset=%d, value=%d",
|
||||
s->offset,
|
||||
gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset));
|
||||
s->hw_sema = ch->hw_sema;
|
||||
atomic_set(&s->value, 0);
|
||||
|
||||
/*
|
||||
* Take a ref on the pool so that we can keep this pool alive for
|
||||
* as long as this semaphore is alive.
|
||||
*/
|
||||
gk20a_semaphore_pool_get(s->hw_sema->p);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
@@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref)
|
||||
struct gk20a_semaphore *s =
|
||||
container_of(ref, struct gk20a_semaphore, ref);
|
||||
|
||||
gk20a_bfree(&s->pool->alloc, s->offset);
|
||||
gk20a_semaphore_pool_put(s->pool);
|
||||
gk20a_semaphore_pool_put(s->hw_sema->p);
|
||||
|
||||
kfree(s);
|
||||
}
|
||||
|
||||
|
||||
@@ -15,17 +15,128 @@
|
||||
#define SEMAPHORE_GK20A_H
|
||||
|
||||
#include <linux/kref.h>
|
||||
#include "gk20a_allocator.h"
|
||||
#include "mm_gk20a.h"
|
||||
#include <linux/list.h>
|
||||
#include <linux/delay.h>
|
||||
|
||||
#include "gk20a.h"
|
||||
#include "mm_gk20a.h"
|
||||
#include "channel_gk20a.h"
|
||||
|
||||
/*
|
||||
* Max number of channels that can be used is 512. This of course needs to be
|
||||
* fixed to be dynamic but still fast.
|
||||
*/
|
||||
#define SEMAPHORE_POOL_COUNT 512
|
||||
#define SEMAPHORE_SIZE 16
|
||||
#define SEMAPHORE_SEA_GROWTH_RATE 32
|
||||
|
||||
struct gk20a_semaphore_sea;
|
||||
|
||||
/*
|
||||
* Underlying semaphore data structure. This semaphore can be shared amongst
|
||||
* other semaphore instances.
|
||||
*/
|
||||
struct gk20a_semaphore_int {
|
||||
int idx; /* Semaphore index. */
|
||||
u32 offset; /* Offset into the pool. */
|
||||
atomic_t next_value; /* Next available value. */
|
||||
u32 *value; /* Current value (access w/ readl()). */
|
||||
u32 nr_incrs; /* Number of increments programmed. */
|
||||
struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */
|
||||
struct channel_gk20a *ch; /* Channel that owns this sema. */
|
||||
struct list_head hw_sema_list; /* List of HW semaphores. */
|
||||
};
|
||||
|
||||
/*
|
||||
* A semaphore which the rest of the driver actually uses. This consists of a
|
||||
* pointer to a real semaphore and a value to wait for. This allows one physical
|
||||
* semaphore to be shared among an essentially infinite number of submits.
|
||||
*/
|
||||
struct gk20a_semaphore {
|
||||
struct gk20a_semaphore_int *hw_sema;
|
||||
|
||||
atomic_t value;
|
||||
int incremented;
|
||||
|
||||
/* A memory pool for holding semaphores. */
|
||||
struct gk20a_semaphore_pool {
|
||||
struct mem_desc mem;
|
||||
struct gk20a *g;
|
||||
struct list_head maps;
|
||||
struct mutex maps_mutex;
|
||||
struct kref ref;
|
||||
struct gk20a_allocator alloc;
|
||||
};
|
||||
|
||||
/*
|
||||
* A semaphore pool. Each address space will own exactly one of these.
|
||||
*/
|
||||
struct gk20a_semaphore_pool {
|
||||
struct page *page; /* This pool's page of memory */
|
||||
struct list_head pool_list_entry; /* Node for list of pools. */
|
||||
void *cpu_va; /* CPU access to the pool. */
|
||||
u64 gpu_va; /* GPU access to the pool. */
|
||||
u64 gpu_va_ro; /* GPU access to the pool. */
|
||||
int page_idx; /* Index into sea bitmap. */
|
||||
|
||||
struct list_head hw_semas; /* List of HW semas. */
|
||||
DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
|
||||
|
||||
struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */
|
||||
|
||||
struct mutex pool_lock;
|
||||
|
||||
/*
|
||||
* This is the address spaces's personal RW table. Other channels will
|
||||
* ultimately map this page as RO.
|
||||
*/
|
||||
struct sg_table *rw_sg_table;
|
||||
|
||||
/*
|
||||
* This is to keep track of whether the pool has had its sg_table
|
||||
* updated during sea resizing.
|
||||
*/
|
||||
struct sg_table *ro_sg_table;
|
||||
|
||||
int mapped;
|
||||
|
||||
/*
|
||||
* Sometimes a channel can be released before other channels are
|
||||
* done waiting on it. This ref count ensures that the pool doesn't
|
||||
* go away until all semaphores using this pool are cleaned up first.
|
||||
*/
|
||||
struct kref ref;
|
||||
};
|
||||
|
||||
/*
|
||||
* A sea of semaphores pools. Each pool is owned by a single VM. Since multiple
|
||||
* channels can share a VM each channel gets it's own HW semaphore from the
|
||||
* pool. Channels then allocate regular semaphores - basically just a value that
|
||||
* signifies when a particular job is done.
|
||||
*/
|
||||
struct gk20a_semaphore_sea {
|
||||
struct list_head pool_list; /* List of pools in this sea. */
|
||||
struct gk20a *gk20a;
|
||||
|
||||
size_t size; /* Number of pages available. */
|
||||
u64 gpu_va; /* GPU virtual address of sema sea. */
|
||||
u64 map_size; /* Size of the mapping. */
|
||||
|
||||
/*
|
||||
* TODO:
|
||||
* List of pages that we use to back the pools. The number of pages
|
||||
* can grow dynamically since allocating 512 pages for all channels at
|
||||
* once would be a tremendous waste.
|
||||
*/
|
||||
int page_count; /* Pages allocated to pools. */
|
||||
|
||||
struct sg_table *ro_sg_table;
|
||||
/*
|
||||
struct page *pages[SEMAPHORE_POOL_COUNT];
|
||||
*/
|
||||
|
||||
struct mem_desc sea_mem;
|
||||
|
||||
/*
|
||||
* Can't use a regular allocator here since the full range of pools are
|
||||
* not always allocated. Instead just use a bitmap.
|
||||
*/
|
||||
DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
|
||||
|
||||
struct mutex sea_lock; /* Lock alloc/free calls. */
|
||||
};
|
||||
|
||||
enum gk20a_mem_rw_flag {
|
||||
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
|
||||
gk20a_mem_flag_write_only = 2,
|
||||
};
|
||||
|
||||
/* A semaphore pool can be mapped to multiple GPU address spaces. */
|
||||
struct gk20a_semaphore_pool_map {
|
||||
u64 gpu_va;
|
||||
enum gk20a_mem_rw_flag rw_flag;
|
||||
struct vm_gk20a *vm;
|
||||
struct list_head list;
|
||||
};
|
||||
/*
|
||||
* Semaphore sea functions.
|
||||
*/
|
||||
struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
|
||||
int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
|
||||
struct vm_gk20a *vm);
|
||||
void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
|
||||
struct vm_gk20a *vm);
|
||||
struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
|
||||
|
||||
/* A semaphore that lives inside a semaphore pool. */
|
||||
struct gk20a_semaphore {
|
||||
struct gk20a_semaphore_pool *pool;
|
||||
/*
|
||||
* value exists within the pool's memory at the specified offset.
|
||||
* 0=acquired, 1=released.
|
||||
*/
|
||||
u32 offset; /* byte offset within pool */
|
||||
struct kref ref;
|
||||
};
|
||||
/*
|
||||
* Semaphore pool functions.
|
||||
*/
|
||||
struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
|
||||
struct gk20a_semaphore_sea *sea);
|
||||
int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
|
||||
struct vm_gk20a *vm);
|
||||
void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
|
||||
struct vm_gk20a *vm);
|
||||
u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
|
||||
void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
|
||||
void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
|
||||
|
||||
/* Create a semaphore pool that can hold at most 'capacity' semaphores. */
|
||||
struct gk20a_semaphore_pool *
|
||||
gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
|
||||
size_t capacity);
|
||||
void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
|
||||
int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
|
||||
struct vm_gk20a *,
|
||||
enum gk20a_mem_rw_flag);
|
||||
void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
|
||||
struct vm_gk20a *);
|
||||
u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
|
||||
struct vm_gk20a *);
|
||||
/*
|
||||
* Semaphore functions.
|
||||
*/
|
||||
struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
|
||||
void gk20a_semaphore_put(struct gk20a_semaphore *s);
|
||||
void gk20a_semaphore_get(struct gk20a_semaphore *s);
|
||||
|
||||
/* Allocate a semaphore from the semaphore pool. The newly allocated
|
||||
* semaphore will be in acquired state (value=0). */
|
||||
struct gk20a_semaphore *
|
||||
gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
|
||||
void gk20a_semaphore_put(struct gk20a_semaphore *);
|
||||
void gk20a_semaphore_get(struct gk20a_semaphore *);
|
||||
|
||||
static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
|
||||
struct vm_gk20a *vm)
|
||||
/*
|
||||
* Return the address of a specific semaphore.
|
||||
*
|
||||
* Don't call this on a semaphore you don't own - the VA returned will make no
|
||||
* sense in your specific channel's VM.
|
||||
*/
|
||||
static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
|
||||
{
|
||||
return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset;
|
||||
return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
|
||||
s->hw_sema->offset;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the global RO address for the semaphore. Can be called on any semaphore
|
||||
* regardless of whether you own it.
|
||||
*/
|
||||
static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
|
||||
{
|
||||
return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
|
||||
s->hw_sema->offset;
|
||||
}
|
||||
|
||||
static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
|
||||
{
|
||||
return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
|
||||
hw_sema->offset;
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: handle wrap around... Hmm, how to do this?
|
||||
*/
|
||||
static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
|
||||
{
|
||||
u32 sema_val = readl(s->hw_sema->value);
|
||||
|
||||
/*
|
||||
* If the underlying semaphore value is greater than or equal to
|
||||
* the value of the semaphore then the semaphore has been signaled
|
||||
* (a.k.a. released).
|
||||
*/
|
||||
return sema_val >= atomic_read(&s->value);
|
||||
}
|
||||
|
||||
static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
|
||||
{
|
||||
u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset);
|
||||
|
||||
/* When often block on value reaching a certain threshold. We must make
|
||||
* sure that if we get unblocked, we haven't read anything too early. */
|
||||
smp_rmb();
|
||||
return v == 0;
|
||||
return !gk20a_semaphore_is_released(s);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read the underlying value from a semaphore.
|
||||
*/
|
||||
static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
|
||||
{
|
||||
return readl(s->hw_sema->value);
|
||||
}
|
||||
|
||||
static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
|
||||
{
|
||||
return atomic_read(&s->value);
|
||||
}
|
||||
|
||||
static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
|
||||
{
|
||||
return atomic_read(&s->hw_sema->next_value);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note - if you call this then any prior semaphores will also be released.
|
||||
*/
|
||||
static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
|
||||
{
|
||||
smp_wmb();
|
||||
gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1);
|
||||
u32 current_val;
|
||||
u32 val = gk20a_semaphore_get_value(s);
|
||||
int attempts = 0;
|
||||
|
||||
/*
|
||||
* Wait until the sema value is 1 less than the write value. That
|
||||
* way this function is essentially an increment.
|
||||
*
|
||||
* TODO: tune the wait a little better.
|
||||
*/
|
||||
while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
|
||||
msleep(100);
|
||||
attempts += 1;
|
||||
if (attempts > 100) {
|
||||
WARN(1, "Stall on sema release!");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the semaphore has already passed the value we would write then
|
||||
* this is really just a NO-OP.
|
||||
*/
|
||||
if (current_val >= val)
|
||||
return;
|
||||
|
||||
writel(val, s->hw_sema->value);
|
||||
}
|
||||
|
||||
/*
|
||||
* Configure a software based increment on this semaphore. This is useful for
|
||||
* when we want the GPU to wait on a SW event before processing a channel.
|
||||
* Another way to describe this is when the GPU needs to wait on a SW pre-fence.
|
||||
* The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
|
||||
* then allows the GPU to continue.
|
||||
*
|
||||
* Also used to prep a semaphore for an INCR by the GPU.
|
||||
*/
|
||||
static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
|
||||
{
|
||||
BUG_ON(s->incremented);
|
||||
|
||||
atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
|
||||
s->incremented = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user