diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 904da89be..e2d1515df 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -52,6 +52,7 @@ ccflags-y += -DCONFIG_NVGPU_IOCTL_NON_FUSA
 ccflags-y += -DCONFIG_NVGPU_COMMON_NON_FUSA
 ccflags-y += -DCONFIG_NVGPU_INJECT_HWERR
 ccflags-y += -DCONFIG_NVGPU_GR_FALCON_NON_SECURE_BOOT
+ccflags-y += -DCONFIG_NVGPU_SW_SEMAPHORE
 
 ifeq ($(CONFIG_NVGPU_LOGGING),y)
 ccflags-y += -DCONFIG_NVGPU_LOGGING=1
diff --git a/drivers/gpu/nvgpu/Makefile.shared.configs b/drivers/gpu/nvgpu/Makefile.shared.configs
index cc40756e4..3eebc6e6a 100644
--- a/drivers/gpu/nvgpu/Makefile.shared.configs
+++ b/drivers/gpu/nvgpu/Makefile.shared.configs
@@ -182,6 +182,10 @@ NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_IOCTL_NON_FUSA
 CONFIG_NVGPU_GR_FALCON_NON_SECURE_BOOT := 1
 NVGPU_COMMON_CFLAGS                    += -DCONFIG_NVGPU_GR_FALCON_NON_SECURE_BOOT
 
+# Enable SW Semaphore for normal build
+CONFIG_NVGPU_SW_SEMAPHORE       := 1
+NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_SW_SEMAPHORE
+
 endif
 endif
 
diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources
index 6bc2ce1ee..a38cf0813 100644
--- a/drivers/gpu/nvgpu/Makefile.sources
+++ b/drivers/gpu/nvgpu/Makefile.sources
@@ -127,10 +127,6 @@ srcs += common/utils/enabled.c \
 	common/ptimer/ptimer.c \
 	common/sync/channel_sync.c \
 	common/sync/channel_sync_syncpt.c \
-	common/semaphore/semaphore_sea.c \
-	common/semaphore/semaphore_pool.c \
-	common/semaphore/semaphore_hw.c \
-	common/semaphore/semaphore.c \
 	common/power_features/power_features.c \
 	common/power_features/cg/cg.c \
 	common/fifo/preempt.c \
@@ -377,7 +373,14 @@ srcs += common/ce/ce.c
 endif
 
 ifeq ($(CONFIG_NVGPU_KERNEL_MODE_SUBMIT),1)
-srcs += common/fifo/submit.c \
+srcs += common/fifo/submit.c
+endif
+
+ifeq ($(CONFIG_NVGPU_SW_SEMAPHORE),1)
+srcs += common/semaphore/semaphore_sea.c \
+	common/semaphore/semaphore_pool.c \
+	common/semaphore/semaphore_hw.c \
+	common/semaphore/semaphore.c \
 	common/sync/channel_sync_semaphore.c \
 	hal/sync/sema_cmdbuf_gk20a.c \
 	hal/sync/sema_cmdbuf_gv11b.c
diff --git a/drivers/gpu/nvgpu/common/fence/fence.c b/drivers/gpu/nvgpu/common/fence/fence.c
index 8c827ab5a..834e901c0 100644
--- a/drivers/gpu/nvgpu/common/fence/fence.c
+++ b/drivers/gpu/nvgpu/common/fence/fence.c
@@ -45,10 +45,11 @@ static void nvgpu_fence_free(struct nvgpu_ref *ref)
 	if (nvgpu_os_fence_is_initialized(&f->os_fence)) {
 		f->os_fence.ops->drop_ref(&f->os_fence);
 	}
-
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	if (f->semaphore != NULL) {
 		nvgpu_semaphore_put(f->semaphore);
 	}
+#endif
 
 	if (f->allocator != NULL) {
 		if (nvgpu_alloc_initialized(f->allocator)) {
@@ -195,10 +196,13 @@ void nvgpu_fence_init(struct nvgpu_fence_type *f,
 	}
 	f->ops = ops;
 	f->syncpt_id = NVGPU_INVALID_SYNCPT_ID;
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	f->semaphore = NULL;
+#endif
 	f->os_fence = os_fence;
 }
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 /* Fences that are backed by GPU semaphores: */
 
 static int nvgpu_semaphore_fence_wait(struct nvgpu_fence_type *f, u32 timeout)
@@ -248,6 +252,8 @@ int nvgpu_fence_from_semaphore(
 	return 0;
 }
 
+#endif
+
 #ifdef CONFIG_TEGRA_GK20A_NVHOST
 /* Fences that are backed by host1x syncpoints: */
 
diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index f81db8b9d..efa8053ca 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -1718,6 +1718,7 @@ static void gk20a_free_channel(struct nvgpu_channel *ch, bool force)
 	}
 	nvgpu_mutex_release(&ch->sync_lock);
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	/*
 	 * free the channel used semaphore index.
 	 * we need to do this before releasing the address space,
@@ -1726,6 +1727,7 @@ static void gk20a_free_channel(struct nvgpu_channel *ch, bool force)
 	if (ch->hw_sema != NULL) {
 		nvgpu_hw_semaphore_free(ch);
 	}
+#endif
 
 	/*
 	 * When releasing the channel we unbind the VM - so release the ref.
@@ -2739,7 +2741,9 @@ void nvgpu_channel_debug_dump_all(struct gk20a *g,
 	for (chid = 0U; chid < f->num_channels; chid++) {
 		struct nvgpu_channel *ch = &f->channel[chid];
 		struct nvgpu_channel_dump_info *info = infos[chid];
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 		struct nvgpu_hw_semaphore *hw_sema = ch->hw_sema;
+#endif
 
 		/* if this info exists, the above loop took a channel ref */
 		if (info == NULL) {
@@ -2752,12 +2756,14 @@ void nvgpu_channel_debug_dump_all(struct gk20a *g,
 		info->refs = nvgpu_atomic_read(&ch->ref_count);
 		info->deterministic = ch->deterministic;
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 		if (hw_sema != NULL) {
 			info->sema.value = nvgpu_hw_semaphore_read(hw_sema);
 			info->sema.next =
 				(u32)nvgpu_hw_semaphore_read_next(hw_sema);
 			info->sema.addr = nvgpu_hw_semaphore_addr(hw_sema);
 		}
+#endif
 
 		g->ops.channel.read_state(g, ch, &info->hw_state);
 		g->ops.ramfc.capture_ram_dump(g, ch, info);
diff --git a/drivers/gpu/nvgpu/common/mm/mm.c b/drivers/gpu/nvgpu/common/mm/mm.c
index 1901835d3..57a4ec43f 100644
--- a/drivers/gpu/nvgpu/common/mm/mm.c
+++ b/drivers/gpu/nvgpu/common/mm/mm.c
@@ -166,7 +166,9 @@ static void nvgpu_remove_mm_support(struct mm_gk20a *mm)
 		nvgpu_vm_put(mm->cde.vm);
 	}
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	nvgpu_semaphore_sea_destroy(g);
+#endif
 #ifdef CONFIG_NVGPU_DGPU
 	nvgpu_vidmem_destroy(g);
 #endif
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index a3a2cf391..d653eda73 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -314,6 +314,7 @@ bool nvgpu_big_pages_possible(struct vm_gk20a *vm, u64 base, u64 size)
 	return true;
 }
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 /*
  * Initialize a semaphore pool. Just return successfully if we do not need
  * semaphores (i.e when sync-pts are active).
@@ -375,6 +376,7 @@ static int nvgpu_init_sema_pool(struct vm_gk20a *vm)
 
 	return 0;
 }
+#endif
 
 /*
  * Initialize a preallocated vm
@@ -619,6 +621,7 @@ int nvgpu_vm_do_init(struct mm_gk20a *mm,
 	nvgpu_ref_init(&vm->ref);
 	nvgpu_init_list_node(&vm->vm_area_list);
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	/*
 	 * This is only necessary for channel address spaces. The best way to
 	 * distinguish channel address spaces from other address spaces is by
@@ -630,12 +633,15 @@ int nvgpu_vm_do_init(struct mm_gk20a *mm,
 			goto clean_up_gmmu_lock;
 		}
 	}
+#endif
 
 	return 0;
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 clean_up_gmmu_lock:
 	nvgpu_mutex_destroy(&vm->update_gmmu_lock);
 	nvgpu_mutex_destroy(&vm->syncpt_ro_map_lock);
+#endif
 clean_up_allocators:
 	if (nvgpu_alloc_initialized(&vm->kernel)) {
 		nvgpu_alloc_destroy(&vm->kernel);
@@ -732,6 +738,7 @@ static void nvgpu_vm_remove(struct vm_gk20a *vm)
 	struct gk20a *g = vm->mm->g;
 	bool done;
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	/*
 	 * Do this outside of the update_gmmu_lock since unmapping the semaphore
 	 * pool involves unmapping a GMMU mapping which means aquiring the
@@ -743,6 +750,7 @@ static void nvgpu_vm_remove(struct vm_gk20a *vm)
 			nvgpu_semaphore_pool_put(vm->sema_pool);
 		}
 	}
+#endif
 
 	if (nvgpu_mem_is_valid(&g->syncpt_mem) &&
 			vm->syncpt_ro_map_gpu_va != 0ULL) {
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync.c b/drivers/gpu/nvgpu/common/sync/channel_sync.c
index ca867ec6a..f96bc87cc 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync.c
@@ -45,7 +45,7 @@ struct nvgpu_channel_sync *nvgpu_channel_sync_create(struct nvgpu_channel *c,
 	if (nvgpu_has_syncpoints(c->g)) {
 		return nvgpu_channel_sync_syncpt_create(c, user_managed);
 	} else {
-#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 		return nvgpu_channel_sync_semaphore_create(c, user_managed);
 #else
 		return NULL;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
index 99f24f2fa..0b00a16bb 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -923,7 +923,7 @@ static const struct gpu_ops gv11b_ops = {
 			.get_sync_ro_map = gv11b_syncpt_get_sync_ro_map,
 		},
 #endif /* CONFIG_TEGRA_GK20A_NVHOST */
-#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 		.sema = {
 			.get_wait_cmd_size = gv11b_sema_get_wait_cmd_size,
 			.get_incr_cmd_size = gv11b_sema_get_incr_cmd_size,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index ca9044cbb..a9b22545e 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -388,9 +388,10 @@ struct nvgpu_channel {
 	struct nvgpu_spinlock ref_actions_lock;
 #endif
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	/** Semaphore owned by this channel. */
 	struct nvgpu_hw_semaphore *hw_sema;
-
+#endif
 	/**
 	 * Channel instance has been bound to hardware (i.e. instance block
 	 * has been set up, and bound in CCSR).
diff --git a/drivers/gpu/nvgpu/include/nvgpu/fence.h b/drivers/gpu/nvgpu/include/nvgpu/fence.h
index 48e674d3b..cb2ed9dd3 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/fence.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/fence.h
@@ -31,7 +31,9 @@
 struct gk20a;
 struct nvgpu_channel;
 struct platform_device;
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 struct nvgpu_semaphore;
+#endif
 struct nvgpu_os_fence;
 
 struct nvgpu_fence_type {
@@ -44,9 +46,11 @@ struct nvgpu_fence_type {
 
 	struct nvgpu_os_fence os_fence;
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	/* Valid for fences created from semaphores: */
 	struct nvgpu_semaphore *semaphore;
 	struct nvgpu_cond *semaphore_wq;
+#endif
 
 	/* Valid for fences created from syncpoints: */
 	struct nvgpu_nvhost_dev *nvhost_dev;
@@ -63,12 +67,14 @@ struct nvgpu_fence_ops {
 	void *(*free)(struct nvgpu_ref *ref);
 };
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
 /* Fences can be created from semaphores or syncpoint (id, value) pairs */
 int nvgpu_fence_from_semaphore(
 		struct nvgpu_fence_type *fence_out,
 		struct nvgpu_semaphore *semaphore,
 		struct nvgpu_cond *semaphore_wq,
 		struct nvgpu_os_fence os_fence);
+#endif
 
 int nvgpu_fence_from_syncpt(
 		struct nvgpu_fence_type *fence_out,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h
index e17674971..18d14023a 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vm.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h
@@ -199,11 +199,13 @@ struct vm_gk20a {
 	   unmapping. Must hold vm->update_gmmu_lock. */
 	struct vm_gk20a_mapping_batch *kref_put_batch;
 
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
+
 	/*
 	 * Each address space needs to have a semaphore pool.
 	 */
 	struct nvgpu_semaphore_pool *sema_pool;
-
+#endif
 	/*
 	 * Create sync point read only map for sync point range.
 	 * Channels sharing same vm will also share same sync point ro map
diff --git a/drivers/gpu/nvgpu/libnvgpu-drv_safe.export b/drivers/gpu/nvgpu/libnvgpu-drv_safe.export
index 05b9a0752..f593a59f0 100644
--- a/drivers/gpu/nvgpu/libnvgpu-drv_safe.export
+++ b/drivers/gpu/nvgpu/libnvgpu-drv_safe.export
@@ -101,7 +101,6 @@ nvgpu_gmmu_init_page_table
 nvgpu_gmmu_map
 nvgpu_gmmu_map_fixed
 nvgpu_gmmu_unmap
-nvgpu_hw_semaphore_init
 nvgpu_init_enabled_flags
 nvgpu_init_hal
 nvgpu_init_mm_support
diff --git a/userspace/units/fifo/channel/nvgpu-channel.c b/userspace/units/fifo/channel/nvgpu-channel.c
index fdc39ec6e..aa6cb2e3d 100644
--- a/userspace/units/fifo/channel/nvgpu-channel.c
+++ b/userspace/units/fifo/channel/nvgpu-channel.c
@@ -322,8 +322,7 @@ done:
 #define F_CHANNEL_CLOSE_AS_BOUND		BIT(7)
 #define F_CHANNEL_CLOSE_FREE_SUBCTX		BIT(8)
 #define F_CHANNEL_CLOSE_USER_SYNC		BIT(9)
-#define F_CHANNEL_CLOSE_HW_SEMA			BIT(10)
-#define F_CHANNEL_CLOSE_LAST			BIT(11)
+#define F_CHANNEL_CLOSE_LAST			BIT(10)
 
 /* nvgpu_tsg_unbind_channel always return 0 */
 
@@ -338,7 +337,6 @@ static const char *f_channel_close[] = {
 	"as_bound",
 	"free_subctx",
 	"user_sync",
-	"hw_sema",
 };
 
 static void stub_os_channel_close(struct nvgpu_channel *ch, bool force)
@@ -365,12 +363,8 @@ static bool channel_close_pruned(u32 branches, u32 final)
 	if ((branches & F_CHANNEL_CLOSE_AS_BOUND) == 0) {
 		branches &= ~F_CHANNEL_CLOSE_FREE_SUBCTX;
 		branches &= ~F_CHANNEL_CLOSE_USER_SYNC;
-		branches &= ~F_CHANNEL_CLOSE_HW_SEMA;
 	}
 
-	/* TODO: add semaphore pool init to support this */
-	branches &= ~F_CHANNEL_CLOSE_HW_SEMA;
-
 	if (branches < branches_init) {
 		return true;
 	}
@@ -418,7 +412,6 @@ static int test_channel_close(struct unit_module *m,
 		ch = gk20a_open_new_channel(g, runlist_id,
 				privileged, getpid(), getpid());
 		assert(ch != NULL);
-		assert(ch->hw_sema == NULL);
 
 		ch->usermode_submit_enabled = true;
 
@@ -460,11 +453,6 @@ static int test_channel_close(struct unit_module *m,
 			assert(err == 0);
 		}
 
-		if (branches & F_CHANNEL_CLOSE_HW_SEMA) {
-			err = nvgpu_hw_semaphore_init(ch);
-			assert(err == 0);
-		}
-
 		if (branches & F_CHANNEL_CLOSE_ALREADY_FREED) {
 			nvgpu_channel_close(ch);
 		}
@@ -527,7 +515,6 @@ static int test_channel_close(struct unit_module *m,
 		assert(nvgpu_ref_put_return(&vm.ref, NULL));
 
 		assert(ch->user_sync == NULL);
-		assert(ch->hw_sema == NULL);
 
 unbind:
 		/*