diff --git a/drivers/gpu/nvgpu/common/semaphore.c b/drivers/gpu/nvgpu/common/semaphore.c
index 25bd3be33..5814a7370 100644
--- a/drivers/gpu/nvgpu/common/semaphore.c
+++ b/drivers/gpu/nvgpu/common/semaphore.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 
 #include "gk20a/gk20a.h"
 #include "gk20a/mm_gk20a.h"
@@ -485,3 +486,141 @@ void nvgpu_semaphore_get(struct nvgpu_semaphore *s)
 {
 	nvgpu_ref_get(&s->ref);
 }
+
+/*
+ * Return the address of a specific semaphore.
+ *
+ * Don't call this on a semaphore you don't own - the VA returned will make no
+ * sense in your specific channel's VM.
+ */
+u64 nvgpu_semaphore_gpu_rw_va(struct nvgpu_semaphore *s)
+{
+	return __nvgpu_semaphore_pool_gpu_va(s->location.pool, false) +
+		s->location.offset;
+}
+
+/*
+ * Get the global RO address for the semaphore. Can be called on any semaphore
+ * regardless of whether you own it.
+ */
+u64 nvgpu_semaphore_gpu_ro_va(struct nvgpu_semaphore *s)
+{
+	return __nvgpu_semaphore_pool_gpu_va(s->location.pool, true) +
+		s->location.offset;
+}
+
+u64 nvgpu_hw_sema_addr(struct nvgpu_semaphore_int *hw_sema)
+{
+	return __nvgpu_semaphore_pool_gpu_va(hw_sema->location.pool, true) +
+		hw_sema->location.offset;
+}
+
+u32 __nvgpu_semaphore_read(struct nvgpu_semaphore_int *hw_sema)
+{
+	return nvgpu_mem_rd(hw_sema->ch->g, &hw_sema->location.pool->rw_mem,
+			hw_sema->location.offset);
+}
+
+/*
+ * Read the underlying value from a semaphore.
+ */
+u32 nvgpu_semaphore_read(struct nvgpu_semaphore *s)
+{
+	return nvgpu_mem_rd(s->g, &s->location.pool->rw_mem,
+			s->location.offset);
+}
+
+/*
+ * Check if "racer" is over "goal" with wraparound handling.
+ */
+static bool __nvgpu_semaphore_value_released(u32 goal, u32 racer)
+{
+	/*
+	 * Handle wraparound with the same heuristic as the hardware does:
+	 * although the integer will eventually wrap around, consider a sema
+	 * released against a threshold if its value has passed that threshold
+	 * but has not wrapped over half of the u32 range over that threshold;
+	 * such wrapping is unlikely to happen during a sema lifetime.
+	 *
+	 * Values for [goal, goal + 0x7fffffff] are considered signaled; that's
+	 * precisely half of the 32-bit space. If racer == goal + 0x80000000,
+	 * then it needs 0x80000000 increments to wrap again and signal.
+	 *
+	 * Unsigned arithmetic is used because it's well-defined. This is
+	 * effectively the same as: signed_racer - signed_goal > 0.
+	 */
+
+	return racer - goal < 0x80000000;
+}
+
+u32 nvgpu_semaphore_get_value(struct nvgpu_semaphore *s)
+{
+	return (u32)nvgpu_atomic_read(&s->value);
+}
+
+bool nvgpu_semaphore_is_released(struct nvgpu_semaphore *s)
+{
+	u32 sema_val = nvgpu_semaphore_read(s);
+	u32 wait_payload = nvgpu_semaphore_get_value(s);
+
+	return __nvgpu_semaphore_value_released(wait_payload, sema_val);
+}
+
+bool nvgpu_semaphore_is_acquired(struct nvgpu_semaphore *s)
+{
+	return !nvgpu_semaphore_is_released(s);
+}
+
+/*
+ * Fast-forward the hw sema to its tracked max value.
+ *
+ * Return true if the sema wasn't at the max value and needed updating, false
+ * otherwise.
+ */
+bool nvgpu_semaphore_reset(struct nvgpu_semaphore_int *hw_sema)
+{
+	u32 threshold = (u32)nvgpu_atomic_read(&hw_sema->next_value);
+	u32 current_val = __nvgpu_semaphore_read(hw_sema);
+
+	/*
+	 * If the semaphore has already reached the value we would write then
+	 * this is really just a NO-OP. However, the sema value shouldn't be
+	 * more than what we expect to be the max.
+	 */
+
+	if (WARN_ON(__nvgpu_semaphore_value_released(threshold + 1,
+			current_val)))
+		return false;
+
+	if (current_val == threshold)
+		return false;
+
+	nvgpu_mem_wr(hw_sema->ch->g, &hw_sema->location.pool->rw_mem,
+			hw_sema->location.offset, threshold);
+
+	gpu_sema_verbose_dbg(hw_sema->ch->g, "(c=%d) RESET %u -> %u",
+			hw_sema->ch->chid, current_val, threshold);
+
+	return true;
+}
+
+/*
+ * Update nvgpu-tracked shadow of the value in "hw_sema" and mark the threshold
+ * value to "s" which represents the increment that the caller must write in a
+ * pushbuf. The same nvgpu_semaphore will also represent an output fence; when
+ * nvgpu_semaphore_is_released(s) == true, the gpu is done with this increment.
+ */
+void nvgpu_semaphore_prepare(struct nvgpu_semaphore *s,
+		struct nvgpu_semaphore_int *hw_sema)
+{
+	int next = nvgpu_atomic_add_return(1, &hw_sema->next_value);
+
+	/* "s" should be an uninitialized sema. */
+	WARN_ON(s->incremented);
+
+	nvgpu_atomic_set(&s->value, next);
+	s->incremented = 1;
+
+	gpu_sema_verbose_dbg(s->g, "INCR sema for c=%d (%u)",
+			hw_sema->ch->chid, next);
+}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/semaphore.h b/drivers/gpu/nvgpu/include/nvgpu/semaphore.h
index 9ab6cc674..a7bd79cb0 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/semaphore.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/semaphore.h
@@ -24,16 +24,13 @@
 #define SEMAPHORE_GK20A_H
 
 #include
-#include
 #include
-#include
 #include
 #include
 #include
 
 #include "gk20a/gk20a.h"
 #include "gk20a/mm_gk20a.h"
-#include "gk20a/channel_gk20a.h"
 
 #define gpu_sema_dbg(g, fmt, args...) \
 	nvgpu_log(g, gpu_dbg_sema, fmt, ##args)
@@ -191,141 +188,18 @@ void nvgpu_semaphore_put(struct nvgpu_semaphore *s);
 void nvgpu_semaphore_get(struct nvgpu_semaphore *s);
 void nvgpu_semaphore_free_hw_sema(struct channel_gk20a *ch);
 
-/*
- * Return the address of a specific semaphore.
- *
- * Don't call this on a semaphore you don't own - the VA returned will make no
- * sense in your specific channel's VM.
- */
-static inline u64 nvgpu_semaphore_gpu_rw_va(struct nvgpu_semaphore *s)
-{
-	return __nvgpu_semaphore_pool_gpu_va(s->location.pool, false) +
-		s->location.offset;
-}
+u64 nvgpu_semaphore_gpu_rw_va(struct nvgpu_semaphore *s);
+u64 nvgpu_semaphore_gpu_ro_va(struct nvgpu_semaphore *s);
+u64 nvgpu_hw_sema_addr(struct nvgpu_semaphore_int *hw_sema);
 
-/*
- * Get the global RO address for the semaphore. Can be called on any semaphore
- * regardless of whether you own it.
- */
-static inline u64 nvgpu_semaphore_gpu_ro_va(struct nvgpu_semaphore *s)
-{
-	return __nvgpu_semaphore_pool_gpu_va(s->location.pool, true) +
-		s->location.offset;
-}
+u32 __nvgpu_semaphore_read(struct nvgpu_semaphore_int *hw_sema);
+u32 nvgpu_semaphore_read(struct nvgpu_semaphore *s);
+u32 nvgpu_semaphore_get_value(struct nvgpu_semaphore *s);
+bool nvgpu_semaphore_is_released(struct nvgpu_semaphore *s);
+bool nvgpu_semaphore_is_acquired(struct nvgpu_semaphore *s);
 
-static inline u64 nvgpu_hw_sema_addr(struct nvgpu_semaphore_int *hw_sema)
-{
-	return __nvgpu_semaphore_pool_gpu_va(hw_sema->location.pool, true) +
-		hw_sema->location.offset;
-}
+bool nvgpu_semaphore_reset(struct nvgpu_semaphore_int *hw_sema);
+void nvgpu_semaphore_prepare(struct nvgpu_semaphore *s,
+		struct nvgpu_semaphore_int *hw_sema);
 
-static inline u32 __nvgpu_semaphore_read(struct nvgpu_semaphore_int *hw_sema)
-{
-	return nvgpu_mem_rd(hw_sema->ch->g, &hw_sema->location.pool->rw_mem,
-			hw_sema->location.offset);
-}
-
-/*
- * Read the underlying value from a semaphore.
- */
-static inline u32 nvgpu_semaphore_read(struct nvgpu_semaphore *s)
-{
-	return nvgpu_mem_rd(s->g, &s->location.pool->rw_mem,
-			s->location.offset);
-}
-
-/*
- * Check if "racer" is over "goal" with wraparound handling.
- */
-static inline bool __nvgpu_semaphore_value_released(u32 goal, u32 racer)
-{
-	/*
-	 * Handle wraparound with the same heuristic as the hardware does:
-	 * although the integer will eventually wrap around, consider a sema
-	 * released against a threshold if its value has passed that threshold
-	 * but has not wrapped over half of the u32 range over that threshold;
-	 * such wrapping is unlikely to happen during a sema lifetime.
-	 *
-	 * Values for [goal, goal + 0x7fffffff] are considered signaled; that's
-	 * precisely half of the 32-bit space. If racer == goal + 0x80000000,
-	 * then it needs 0x80000000 increments to wrap again and signal.
-	 *
-	 * Unsigned arithmetic is used because it's well-defined. This is
-	 * effectively the same as: signed_racer - signed_goal > 0.
-	 */
-
-	return racer - goal < 0x80000000;
-}
-
-static inline u32 nvgpu_semaphore_get_value(struct nvgpu_semaphore *s)
-{
-	return (u32)nvgpu_atomic_read(&s->value);
-}
-
-static inline bool nvgpu_semaphore_is_released(struct nvgpu_semaphore *s)
-{
-	u32 sema_val = nvgpu_semaphore_read(s);
-	u32 wait_payload = nvgpu_semaphore_get_value(s);
-
-	return __nvgpu_semaphore_value_released(wait_payload, sema_val);
-}
-
-static inline bool nvgpu_semaphore_is_acquired(struct nvgpu_semaphore *s)
-{
-	return !nvgpu_semaphore_is_released(s);
-}
-
-/*
- * Fast-forward the hw sema to its tracked max value.
- *
- * Return true if the sema wasn't at the max value and needed updating, false
- * otherwise.
- */
-static inline bool nvgpu_semaphore_reset(struct nvgpu_semaphore_int *hw_sema)
-{
-	u32 threshold = (u32)nvgpu_atomic_read(&hw_sema->next_value);
-	u32 current_val = __nvgpu_semaphore_read(hw_sema);
-
-	/*
-	 * If the semaphore has already reached the value we would write then
-	 * this is really just a NO-OP. However, the sema value shouldn't be
-	 * more than what we expect to be the max.
-	 */
-
-	if (WARN_ON(__nvgpu_semaphore_value_released(threshold + 1,
-			current_val)))
-		return false;
-
-	if (current_val == threshold)
-		return false;
-
-	nvgpu_mem_wr(hw_sema->ch->g, &hw_sema->location.pool->rw_mem,
-			hw_sema->location.offset, threshold);
-
-	gpu_sema_verbose_dbg(hw_sema->ch->g, "(c=%d) RESET %u -> %u",
-			hw_sema->ch->chid, current_val, threshold);
-
-	return true;
-}
-
-/*
- * Update nvgpu-tracked shadow of the value in "hw_sema" and mark the threshold
- * value to "s" which represents the increment that the caller must write in a
- * pushbuf. The same nvgpu_semaphore will also represent an output fence; when
- * nvgpu_semaphore_is_released(s) == true, the gpu is done with this increment.
- */
-static inline void nvgpu_semaphore_prepare(struct nvgpu_semaphore *s,
-		struct nvgpu_semaphore_int *hw_sema)
-{
-	int next = nvgpu_atomic_add_return(1, &hw_sema->next_value);
-
-	/* "s" should be an uninitialized sema. */
-	WARN_ON(s->incremented);
-
-	nvgpu_atomic_set(&s->value, next);
-	s->incremented = 1;
-
-	gpu_sema_verbose_dbg(s->g, "INCR sema for c=%d (%u)",
-			hw_sema->ch->chid, next);
-}
-
 #endif