diff --git a/drivers/gpu/nvgpu/gk20a/Makefile b/drivers/gpu/nvgpu/gk20a/Makefile
index 81ae027e0..e5eb817d4 100644
--- a/drivers/gpu/nvgpu/gk20a/Makefile
+++ b/drivers/gpu/nvgpu/gk20a/Makefile
@@ -19,6 +19,7 @@ nvgpu-y := \
 	mm_gk20a.o \
 	pmu_gk20a.o \
 	priv_ring_gk20a.o \
+	semaphore_gk20a.o \
 	clk_gk20a.o \
 	therm_gk20a.o \
 	gr_ctx_gk20a_sim.o \
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
new file mode 100644
index 000000000..55fa0e32e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -0,0 +1,191 @@
+/*
+ * drivers/video/tegra/host/gk20a/semaphore_gk20a.c
+ *
+ * GK20A Semaphores
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "semaphore_gk20a.h"
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include "gk20a.h"
+#include "mm_gk20a.h"
+
+static const int SEMAPHORE_SIZE = 16;
+
+struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct device *d,
+		const char *unique_name, size_t capacity)
+{
+	struct gk20a_semaphore_pool *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return NULL;
+
+	kref_init(&p->ref);
+	INIT_LIST_HEAD(&p->maps);
+	mutex_init(&p->maps_mutex);
+	p->dev = d;
+
+	/* Allocate room for 'capacity' semaphores, rounded up to whole
+	 * 4k pages. */
+	p->size = roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE);
+	p->cpu_va = dma_alloc_coherent(d, p->size, &p->iova, GFP_KERNEL);
+	if (!p->cpu_va)
+		goto clean_up;
+	if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size))
+		goto clean_up;
+
+	if (gk20a_allocator_init(&p->alloc, unique_name, 0,
+				 p->size, SEMAPHORE_SIZE))
+		goto clean_up;
+
+	gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va,
+		       (u64)sg_dma_address(p->sgt->sgl),
+		       (u64)sg_phys(p->sgt->sgl));
+	return p;
+
+clean_up:
+	if (p->cpu_va)
+		dma_free_coherent(d, p->size, p->cpu_va, p->iova);
+	if (p->sgt)
+		gk20a_free_sgtable(&p->sgt);
+	kfree(p);
+	return NULL;
+}
+
+static void gk20a_semaphore_pool_free(struct kref *ref)
+{
+	struct gk20a_semaphore_pool *p =
+		container_of(ref, struct gk20a_semaphore_pool, ref);
+
+	mutex_lock(&p->maps_mutex);
+	WARN_ON(!list_empty(&p->maps));
+	mutex_unlock(&p->maps_mutex);
+	gk20a_free_sgtable(&p->sgt);
+	dma_free_coherent(p->dev, p->size, p->cpu_va, p->iova);
+	gk20a_allocator_destroy(&p->alloc);
+	kfree(p);
+}
+
+static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
+{
+	kref_get(&p->ref);
+}
+
+void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p)
+{
+	kref_put(&p->ref, gk20a_semaphore_pool_free);
+}
+
+static struct gk20a_semaphore_pool_map *
+gk20a_semaphore_pool_find_map(struct gk20a_semaphore_pool *p,
+			      struct vm_gk20a *vm)
+{
+	struct gk20a_semaphore_pool_map *map, *found = NULL;
+
+	mutex_lock(&p->maps_mutex);
+	list_for_each_entry(map, &p->maps, list) {
+		if (map->vm == vm) {
+			found = map;
+			break;
+		}
+	}
+	mutex_unlock(&p->maps_mutex);
+	return found;
+}
+
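+/*
+ * Map the pool's backing storage into a GPU address space. A pool may
+ * be mapped into several address spaces, with one tracking entry kept
+ * per vm; mapping the same vm twice is a caller bug (hence the WARN_ON
+ * below).
+ */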
+int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
+			     struct vm_gk20a *vm,
+			     enum gk20a_mem_rw_flag rw_flag)
+{
+	struct gk20a_semaphore_pool_map *map;
+
+	WARN_ON(gk20a_semaphore_pool_find_map(p, vm));
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+	map->vm = vm;
+	map->rw_flag = rw_flag;
+	map->gpu_va = gk20a_gmmu_map(vm, &p->sgt, p->size,
+				     0 /*uncached*/, rw_flag);
+	if (!map->gpu_va) {
+		kfree(map);
+		return -ENOMEM;
+	}
+	mutex_lock(&p->maps_mutex);
+	list_add(&map->list, &p->maps);
+	mutex_unlock(&p->maps_mutex);
+	return 0;
+}
+
+void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
+				struct vm_gk20a *vm)
+{
+	struct gk20a_semaphore_pool_map *map =
+		gk20a_semaphore_pool_find_map(p, vm);
+
+	if (!map)
+		return;
+	gk20a_gmmu_unmap(vm, map->gpu_va, p->size, map->rw_flag);
+	/* Take maps_mutex: the maps list must not be mutated while a
+	 * concurrent gk20a_semaphore_pool_find_map() walks it. */
+	mutex_lock(&p->maps_mutex);
+	list_del(&map->list);
+	mutex_unlock(&p->maps_mutex);
+	kfree(map);
+}
+
+u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p,
+				struct vm_gk20a *vm)
+{
+	struct gk20a_semaphore_pool_map *map =
+		gk20a_semaphore_pool_find_map(p, vm);
+
+	if (!map)
+		return 0;
+	return map->gpu_va;
+}
+
+struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool)
+{
+	struct gk20a_semaphore *s;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return NULL;
+
+	if (pool->alloc.alloc(&pool->alloc, &s->offset, SEMAPHORE_SIZE)) {
+		gk20a_err(pool->dev, "failed to allocate semaphore");
+		kfree(s);
+		return NULL;
+	}
+
+	gk20a_semaphore_pool_get(pool);
+	s->pool = pool;
+
+	kref_init(&s->ref);
+	s->value = (volatile u32 *)((uintptr_t)pool->cpu_va + s->offset);
+	*s->value = 0; /* Initially acquired. */
+	gk20a_dbg_info("created semaphore offset=%d, value_cpu=%p, value=%d",
+		       s->offset, s->value, *s->value);
+	return s;
+}
+
+static void gk20a_semaphore_free(struct kref *ref)
+{
+	struct gk20a_semaphore *s =
+		container_of(ref, struct gk20a_semaphore, ref);
+
+	s->pool->alloc.free(&s->pool->alloc, s->offset, SEMAPHORE_SIZE);
+	gk20a_semaphore_pool_put(s->pool);
+	kfree(s);
+}
+
+void gk20a_semaphore_put(struct gk20a_semaphore *s)
+{
+	kref_put(&s->ref, gk20a_semaphore_free);
+}
+
+void gk20a_semaphore_get(struct gk20a_semaphore *s)
+{
+	kref_get(&s->ref);
+}
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
new file mode 100644
index 000000000..214db3987
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -0,0 +1,97 @@
+/*
+ * drivers/video/tegra/host/gk20a/semaphore_gk20a.h
+ *
+ * GK20A Semaphores
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _GK20A_SEMAPHORE_H_
+#define _GK20A_SEMAPHORE_H_
+
+#include <linux/kref.h>
+#include "gk20a_allocator.h"
+#include "mm_gk20a.h"
+
+/* A memory pool for holding semaphores. */
+struct gk20a_semaphore_pool {
+	void *cpu_va;
+	dma_addr_t iova;
+	size_t size;
+	struct device *dev;
+	struct sg_table *sgt;
+	struct list_head maps;
+	struct mutex maps_mutex;
+	struct kref ref;
+	struct gk20a_allocator alloc;
+};
+
+/* A semaphore pool can be mapped to multiple GPU address spaces. */
+struct gk20a_semaphore_pool_map {
+	u64 gpu_va;
+	enum gk20a_mem_rw_flag rw_flag;
+	struct vm_gk20a *vm;
+	struct list_head list;
+};
+
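+/*
+ * Typical pool lifetime, sketched (illustrative only; error handling
+ * omitted, and gk20a_mem_flag_none is the "no restriction" value of
+ * enum gk20a_mem_rw_flag from mm_gk20a.h):
+ *
+ *	pool = gk20a_semaphore_pool_alloc(dev, "sema.ch0", 1024);
+ *	gk20a_semaphore_pool_map(pool, vm, gk20a_mem_flag_none);
+ *	s = gk20a_semaphore_alloc(pool);	(value == 0: acquired)
+ *	... hand gk20a_semaphore_gpu_va(s, vm) to GPU methods ...
+ *	gk20a_semaphore_put(s);
+ *	gk20a_semaphore_pool_unmap(pool, vm);
+ *	gk20a_semaphore_pool_put(pool);
+ */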
+/* A semaphore that lives inside a semaphore pool. */
+struct gk20a_semaphore {
+	struct gk20a_semaphore_pool *pool;
+	u32 offset; /* byte offset within pool */
+	struct kref ref;
+	/* value is a pointer into the pool's coherent cpu_va.
+	 * It is shared between CPU and GPU, hence volatile. */
+	volatile u32 *value; /* 0=acquired, 1=released */
+};
+
+/* Create a semaphore pool that can hold at most 'capacity' semaphores. */
+struct gk20a_semaphore_pool *
+gk20a_semaphore_pool_alloc(struct device *, const char *unique_name,
+			   size_t capacity);
+void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
+int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
+			     struct vm_gk20a *,
+			     enum gk20a_mem_rw_flag);
+void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
+				struct vm_gk20a *);
+u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
+				struct vm_gk20a *);
+
+/* Allocate a semaphore from the semaphore pool. The newly allocated
+ * semaphore will be in acquired state (value=0). */
+struct gk20a_semaphore *
+gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
+void gk20a_semaphore_put(struct gk20a_semaphore *);
+void gk20a_semaphore_get(struct gk20a_semaphore *);
+
+static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
+					 struct vm_gk20a *vm)
+{
+	return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset;
+}
+
+static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
+{
+	u32 v = *s->value;
+
+	/* We often block on the value reaching a certain threshold. We
+	 * must make sure that if we get unblocked, we haven't read
+	 * anything too early. */
+	smp_rmb();
+	return v == 0;
+}
+
+static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
+{
+	smp_wmb();
+	*s->value = 1;
+}
+#endif /* _GK20A_SEMAPHORE_H_ */
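
For reference, a minimal sketch of a CPU-side waiter built on the helpers
above. This is illustrative only and not part of the patch:
wait_for_gpu_release is a hypothetical name, the polling/timeout policy is
arbitrary, and it assumes the usual <linux/jiffies.h>, <linux/delay.h> and
<linux/errno.h> helpers are available:

	/* Illustrative only: poll a semaphore until the producer (GPU, or
	 * CPU via gk20a_semaphore_release()) writes 1 to its value. */
	static int wait_for_gpu_release(struct gk20a_semaphore *s,
					unsigned long timeout_jiffies)
	{
		unsigned long end = jiffies + timeout_jiffies;

		while (gk20a_semaphore_is_acquired(s)) {
			if (time_after(jiffies, end))
				return -ETIMEDOUT;
			usleep_range(10, 20);
		}
		/* The smp_rmb() inside gk20a_semaphore_is_acquired() ensures
		 * reads issued after this point are not satisfied before the
		 * semaphore value itself was observed as released. */
		return 0;
	}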