From ad320f60b93dc70c732d9222cf572ead3d830584 Mon Sep 17 00:00:00 2001
From: Ramalingam C
Date: Wed, 3 May 2023 19:09:57 +0000
Subject: [PATCH] gpu: nvgpu: sema based gpfifo submission tracking

Implement a hw semaphore which is used to track gpfifo submission. This
implementation is used when userd.gp_get() is not defined and the
feature flag NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET is set.

At the end of each submitted job, submit a semaphore release that
writes the gpfifo get pointer to the hw semaphore address. When the
next job submission is processed, gpfifo.get is read back from that
designated hw semaphore location.

JIRA NVGPU-9588

Change-Id: Ic88ace1a3f60e3f38f159e1861464ebcaea04469
Signed-off-by: Ramalingam C
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2898143
Reviewed-by: svc-mobile-coverity
Reviewed-by: Sagar Kamble
Reviewed-by: Martin Radev
Reviewed-by: Ankur Kishore
Tested-by: Martin Radev
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/common/fifo/channel.c       | 54 ++++++++++-
 drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c   | 34 ++++---
 drivers/gpu/nvgpu/common/fifo/submit.c        | 69 ++++++++++++--
 .../gpu/nvgpu/common/semaphore/semaphore.c    | 25 ++++-
 .../gpu/nvgpu/common/semaphore/semaphore_hw.c |  5 +
 .../common/sync/channel_sync_semaphore.c      | 91 +++++++++++++++++--
 drivers/gpu/nvgpu/hal/fifo/userd_gv11b.c      | 15 ++-
 drivers/gpu/nvgpu/include/nvgpu/channel.h     |  3 +
 .../include/nvgpu/channel_sync_semaphore.h    | 19 +++-
 drivers/gpu/nvgpu/include/nvgpu/job.h         |  4 +-
 drivers/gpu/nvgpu/include/nvgpu/semaphore.h   |  5 +
 11 files changed, 286 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index d01b66daf..70d2d4af7 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -251,6 +251,10 @@ static void channel_kernelmode_deinit(struct nvgpu_channel *ch)
 		nvgpu_channel_sync_destroy(ch->sync);
 		ch->sync = NULL;
 	}
+	if (ch->gpfifo_sync != NULL) {
+		nvgpu_channel_sync_destroy(ch->gpfifo_sync);
+		ch->gpfifo_sync = NULL;
+	}
 	nvgpu_mutex_release(&ch->sync_lock);
 }
@@ -370,6 +374,18 @@ static int channel_setup_kernelmode(struct nvgpu_channel *c,
 		nvgpu_mutex_release(&c->sync_lock);
 		goto clean_up_unmap;
 	}
+
+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET)) {
+		c->gpfifo_sync = nvgpu_channel_sync_semaphore_create(c);
+		if (c->gpfifo_sync == NULL) {
+			err = -ENOMEM;
+			goto clean_up_sync;
+		}
+		nvgpu_mutex_acquire(&c->gpfifo_hw_sema_lock);
+		nvgpu_channel_sync_hw_semaphore_init(c->gpfifo_sync);
+		nvgpu_mutex_release(&c->gpfifo_hw_sema_lock);
+	}
+
 	nvgpu_mutex_release(&c->sync_lock);

 	if (g->ops.channel.set_syncpt != NULL) {
@@ -431,6 +447,10 @@ clean_up_priv_cmd:
clean_up_prealloc:
 	nvgpu_channel_joblist_deinit(c);
clean_up_sync:
+	if (c->gpfifo_sync != NULL) {
+		nvgpu_channel_sync_destroy(c->gpfifo_sync);
+		c->gpfifo_sync = NULL;
+	}
 	if (c->sync != NULL) {
 		nvgpu_channel_sync_destroy(c->sync);
 		c->sync = NULL;
@@ -448,9 +468,9 @@ clean_up:
 }

 /* Update with this periodically to determine how the gpfifo is draining.
*/ -static inline u32 channel_update_gpfifo_get(struct gk20a *g, - struct nvgpu_channel *c) +static inline u32 channel_update_gpfifo_get(struct nvgpu_channel *c) { + struct gk20a *g = c->g; u32 new_get = 0U; if (g->ops.userd.gp_get != NULL) { @@ -469,7 +489,7 @@ u32 nvgpu_channel_get_gpfifo_free_count(struct nvgpu_channel *ch) u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch) { - (void)channel_update_gpfifo_get(ch->g, ch); + (void)channel_update_gpfifo_get(ch); return nvgpu_channel_get_gpfifo_free_count(ch); } @@ -514,6 +534,9 @@ static void nvgpu_channel_finalize_job(struct nvgpu_channel *c, * semaphore or even a syncfd. */ nvgpu_fence_put(&job->post_fence); + if (job->gpfifo_sema != NULL) { + nvgpu_semaphore_put(job->gpfifo_sema); + } /* * Free the private command buffers (in order of allocation) @@ -522,6 +545,9 @@ static void nvgpu_channel_finalize_job(struct nvgpu_channel *c, nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->wait_cmd); } nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->incr_cmd); + if (job->gpfifo_incr_cmd != NULL) { + nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->gpfifo_incr_cmd); + } nvgpu_channel_free_job(c, job); @@ -590,9 +616,22 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c) WARN_ON(c->sync == NULL); + if (c->gpfifo_sync != NULL) { + if (g->aggressive_sync_destroy_thresh != 0U) { + nvgpu_mutex_acquire(&c->sync_lock); + if (nvgpu_channel_sync_put_ref_and_check(c->gpfifo_sync) + && g->aggressive_sync_destroy) { + nvgpu_channel_sync_destroy(c->gpfifo_sync); + c->gpfifo_sync = NULL; + } + nvgpu_mutex_release(&c->sync_lock); + } + } + if (c->sync != NULL) { if (c->has_os_fence_framework_support && - g->os_channel.os_fence_framework_inst_exists(c)) { + g->os_channel.os_fence_framework_inst_exists(c) && + !nvgpu_has_syncpoints(g)) { g->os_channel.signal_os_fence_framework(c, &job->post_fence); } @@ -689,7 +728,7 @@ bool nvgpu_channel_update_and_check_ctxsw_timeout(struct nvgpu_channel *ch, goto done; } - gpfifo_get = channel_update_gpfifo_get(ch->g, ch); + gpfifo_get = channel_update_gpfifo_get(ch); if (gpfifo_get == ch->ctxsw_timeout_gpfifo_get) { /* didn't advance since previous ctxsw timeout check */ @@ -1042,6 +1081,9 @@ unbind: #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT WARN_ON(ch->sync != NULL); + if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET)) { + WARN_ON(ch->gpfifo_sync != NULL); + } #endif channel_free_unlink_debug_session(ch); @@ -1751,6 +1793,7 @@ static void nvgpu_channel_destroy(struct nvgpu_channel *c) nvgpu_mutex_destroy(&c->ioctl_lock); #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT nvgpu_mutex_destroy(&c->joblist.pre_alloc.read_lock); + nvgpu_mutex_destroy(&c->gpfifo_hw_sema_lock); #endif nvgpu_mutex_destroy(&c->sync_lock); #if defined(CONFIG_NVGPU_CYCLESTATS) @@ -1815,6 +1858,7 @@ int nvgpu_channel_init_support(struct gk20a *g, u32 chid) nvgpu_init_list_node(&c->worker_item); nvgpu_mutex_init(&c->joblist.pre_alloc.read_lock); + nvgpu_mutex_init(&c->gpfifo_hw_sema_lock); #endif /* CONFIG_NVGPU_KERNEL_MODE_SUBMIT */ nvgpu_mutex_init(&c->ioctl_lock); diff --git a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c index 335bfade4..b9922c726 100644 --- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c +++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c @@ -67,6 +67,7 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm, int err = 0; u32 wait_size, incr_size; u32 mem_per_job; + u32 gpfifo_incr_size = 0; /* * sema size is at least as much as syncpt size, but semas may not be @@ -77,6 +78,9 
@@ int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm,
 #ifdef CONFIG_NVGPU_SW_SEMAPHORE
 	wait_size = g->ops.sync.sema.get_wait_cmd_size();
 	incr_size = g->ops.sync.sema.get_incr_cmd_size();
+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET)) {
+		gpfifo_incr_size = g->ops.sync.sema.get_incr_cmd_size();
+	}
 #else
 	wait_size = g->ops.sync.syncpt.get_wait_cmd_size();
 	incr_size = g->ops.sync.syncpt.get_incr_cmd_size(true);
@@ -84,22 +88,24 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm,

 	/*
 	 * Compute the amount of priv_cmdbuf space we need. In general the
-	 * worst case is the kernel inserts both a semaphore pre-fence and
-	 * post-fence. Any sync-pt fences will take less memory so we can
-	 * ignore them unless they're the only supported type. Jobs can also
-	 * have more than one pre-fence but that's abnormal and we'll -EAGAIN
-	 * if such jobs would fill the queue.
+	 * worst case is the kernel inserts a semaphore pre-fence, a
+	 * post-fence, and a semaphore for gp.get tracking. Any sync-pt fences
+	 * will take less memory so we can ignore them unless they're the only
+	 * supported type. Jobs can also have more than one pre-fence but
+	 * that's abnormal and we'll -EAGAIN if such jobs would fill the queue.
 	 *
 	 * A semaphore ACQ (fence-wait) is 8 words: semaphore_a, semaphore_b,
 	 * semaphore_c, and semaphore_d. A semaphore INCR (fence-get) will be
 	 * 10 words: all the same as an ACQ plus a non-stalling intr which is
-	 * another 2 words. In reality these numbers vary by chip but we'll use
-	 * 8 and 10 as examples.
+	 * another 2 words. The semaphore for updating gp.get needs the same
+	 * amount as a semaphore INCR (fence-get), i.e. 10 words. In reality
+	 * these numbers vary by chip but we'll use 8, 10 and 10 as examples.
 	 *
 	 * Given the job count, cmdbuf space is allocated such that each job
-	 * can get one wait command and one increment command:
+	 * can get one wait command, one increment command, and a semaphore
+	 * for gp.get tracking:
 	 *
-	 *   job_count * (8 + 10) * 4 bytes
+	 *   job_count * (8 + 10 + 10) * 4 bytes
 	 *
 	 * These cmdbufs are inserted as gpfifo entries right before and after
 	 * the user submitted gpfifo entries per submit.
@@ -109,13 +115,15 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm,
 	 * is full when the number of consumed entries is one less than the
 	 * allocation size:
 	 *
-	 *   alloc bytes = job_count * (wait + incr + 1) * slot in bytes
+	 *   alloc bytes = job_count * (wait + incr + gpfifo_incr + 1) * slot
+	 *   in bytes
 	 */
-	mem_per_job = nvgpu_safe_mult_u32(
+	mem_per_job = nvgpu_safe_add_u32(
 			nvgpu_safe_add_u32(
 				nvgpu_safe_add_u32(wait_size, incr_size),
-				1U),
-			(u32)sizeof(u32));
+				gpfifo_incr_size), 1U);
+	mem_per_job = nvgpu_safe_mult_u32(mem_per_job, (u32)sizeof(u32));
+
 	/* both 32 bit and mem_per_job is small */
 	size = nvgpu_safe_mult_u64((u64)job_count, (u64)mem_per_job);

diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
index ee16f4777..3279d7876 100644
--- a/drivers/gpu/nvgpu/common/fifo/submit.c
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -106,7 +107,7 @@ static int nvgpu_submit_create_incr_cmd(struct nvgpu_channel *c, static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c, struct nvgpu_channel_fence *fence, struct nvgpu_channel_job *job, - u32 flags) + u32 flags, u32 gpfifo_entries) { struct gk20a *g = c->g; bool need_sync_fence; @@ -116,6 +117,8 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c, bool flag_fence_get = (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) != 0U; bool flag_sync_fence = (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) != 0U; bool flag_fence_wait = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) != 0U; + bool sema_tracking = nvgpu_is_enabled(g, + NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET); if (g->aggressive_sync_destroy_thresh != 0U) { nvgpu_mutex_acquire(&c->sync_lock); @@ -128,6 +131,20 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c, new_sync_created = true; } nvgpu_channel_sync_get_ref(c->sync); + + if (c->gpfifo_sync == NULL && sema_tracking) { + c->gpfifo_sync = nvgpu_channel_sync_semaphore_create(c); + if (c->gpfifo_sync == NULL) { + err = -ENOMEM; + goto clean_up_put_sync; + } + nvgpu_mutex_acquire(&c->gpfifo_hw_sema_lock); + nvgpu_channel_sync_hw_semaphore_init(c->gpfifo_sync); + nvgpu_mutex_release(&c->gpfifo_hw_sema_lock); + } + if (c->gpfifo_sync != NULL) { + nvgpu_channel_sync_get_ref(c->gpfifo_sync); + } } if ((g->ops.channel.set_syncpt != NULL) && new_sync_created) { @@ -151,6 +168,7 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c, need_sync_fence = flag_fence_get && flag_sync_fence; + /* * Always generate an increment at the end of a GPFIFO submission. When * we do job tracking, post fences are needed for various reasons even @@ -162,19 +180,41 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c, goto clean_up_wait_cmd; } + if (sema_tracking) { + err = nvgpu_submit_create_gpfifo_tracking_semaphore( + c->gpfifo_sync, &job->gpfifo_sema, + &job->gpfifo_incr_cmd, + nvgpu_safe_add_u32(gpfifo_entries, + (flag_fence_wait ? 
3U : 2U))); + if (err != 0) { + goto clean_up_incr_cmd; + } + } + if (g->aggressive_sync_destroy_thresh != 0U) { nvgpu_mutex_release(&c->sync_lock); } return 0; +clean_up_incr_cmd: + if (job->incr_cmd != NULL) { + nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->incr_cmd); + job->incr_cmd = NULL; + } clean_up_wait_cmd: if (job->wait_cmd != NULL) { nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->wait_cmd); + job->wait_cmd = NULL; } - job->wait_cmd = NULL; clean_up_put_sync: if (g->aggressive_sync_destroy_thresh != 0U) { - if (nvgpu_channel_sync_put_ref_and_check(c->sync) + if (c->gpfifo_sync != NULL && + nvgpu_channel_sync_put_ref_and_check(c->gpfifo_sync) + && g->aggressive_sync_destroy) { + nvgpu_channel_sync_destroy(c->gpfifo_sync); + } + if (c->sync != NULL && + nvgpu_channel_sync_put_ref_and_check(c->sync) && g->aggressive_sync_destroy) { nvgpu_channel_sync_destroy(c->sync); } @@ -349,7 +389,7 @@ static int nvgpu_submit_prepare_gpfifo_track(struct nvgpu_channel *c, return err; } - err = nvgpu_submit_prepare_syncs(c, fence, job, flags); + err = nvgpu_submit_prepare_syncs(c, fence, job, flags, num_entries); if (err != 0) { goto clean_up_job; } @@ -369,9 +409,10 @@ static int nvgpu_submit_prepare_gpfifo_track(struct nvgpu_channel *c, if (err != 0) { goto clean_up_gpfifo_wait; } - nvgpu_submit_append_priv_cmdbuf(c, job->incr_cmd); - + if (c->gpfifo_sync != NULL) { + nvgpu_submit_append_priv_cmdbuf(c, job->gpfifo_incr_cmd); + } err = nvgpu_channel_add_job(c, job, skip_buffer_refcounting); if (err != 0) { goto clean_up_gpfifo_incr; @@ -403,6 +444,17 @@ clean_up_gpfifo_incr: nvgpu_safe_sub_u32(c->gpfifo.entry_num, nvgpu_safe_add_u32(1U, num_entries)))) & nvgpu_safe_sub_u32(c->gpfifo.entry_num, 1U); + + /* + * undo the gpfifo incr priv cmdbuf which is similar to undo of + * wait_cmd priv cmdbuf. + */ + if (job->gpfifo_incr_cmd != NULL) { + c->gpfifo.put = + nvgpu_safe_add_u32(c->gpfifo.put, + nvgpu_safe_sub_u32(c->gpfifo.entry_num, 1U)) & + nvgpu_safe_sub_u32(c->gpfifo.entry_num, 1U); + } clean_up_gpfifo_wait: if (job->wait_cmd != NULL) { /* @@ -419,6 +471,9 @@ clean_up_gpfifo_wait: } nvgpu_fence_put(&job->post_fence); nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->incr_cmd); + if (job->gpfifo_incr_cmd != NULL) { + nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->gpfifo_incr_cmd); + } if (job->wait_cmd != NULL) { nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->wait_cmd); } diff --git a/drivers/gpu/nvgpu/common/semaphore/semaphore.c b/drivers/gpu/nvgpu/common/semaphore/semaphore.c index e0b349c5a..c8f143eed 100644 --- a/drivers/gpu/nvgpu/common/semaphore/semaphore.c +++ b/drivers/gpu/nvgpu/common/semaphore/semaphore.c @@ -1,7 +1,7 @@ /* * Nvgpu Semaphores * - * Copyright (c) 2014-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -168,8 +168,29 @@ void nvgpu_semaphore_prepare(struct nvgpu_semaphore *s, hw_sema->chid, next); } +void nvgpu_semaphore_prepare_for_gpfifo_get(struct nvgpu_channel *c, + struct nvgpu_semaphore *s, struct nvgpu_hw_semaphore *hw_sema, + u32 new_entries) +{ + u32 next_get; + + nvgpu_mutex_acquire(&c->gpfifo_hw_sema_lock); + next_get = nvgpu_safe_add_u32((u32)nvgpu_hw_semaphore_read_next(hw_sema), + new_entries) & nvgpu_safe_sub_u32(c->gpfifo.entry_num, + 1U); + nvgpu_atomic_set(&hw_sema->next_value, (s32)next_get); + nvgpu_mutex_release(&c->gpfifo_hw_sema_lock); + + WARN_ON(s->ready_to_wait); + + nvgpu_atomic_set(&s->value, (s32)next_get); + s->ready_to_wait = true; + + gpu_sema_verbose_dbg(s->g, "PREP sema for c=%d (%u)", + hw_sema->chid, next_get); +} + u64 nvgpu_semaphore_get_hw_pool_page_idx(struct nvgpu_semaphore *s) { return nvgpu_semaphore_pool_get_page_idx(s->location.pool); } - diff --git a/drivers/gpu/nvgpu/common/semaphore/semaphore_hw.c b/drivers/gpu/nvgpu/common/semaphore/semaphore_hw.c index 43e88266e..accc21340 100644 --- a/drivers/gpu/nvgpu/common/semaphore/semaphore_hw.c +++ b/drivers/gpu/nvgpu/common/semaphore/semaphore_hw.c @@ -159,6 +159,11 @@ void nvgpu_hw_semaphore_set(struct nvgpu_hw_semaphore *hw_sema, u32 val) nvgpu_mem_wr(g, &pool->rw_mem, hw_sema->location.offset, val); } +void nvgpu_hw_semaphore_init_next(struct nvgpu_hw_semaphore *hw_sema) +{ + nvgpu_atomic_set(&hw_sema->next_value, 0); +} + int nvgpu_hw_semaphore_read_next(struct nvgpu_hw_semaphore *hw_sema) { return nvgpu_atomic_read(&hw_sema->next_value); diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c index fcc5fa02f..16e2e4037 100644 --- a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c +++ b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c @@ -1,7 +1,7 @@ /* * GK20A Channel Synchronization Abstraction * - * Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -41,12 +41,6 @@ #include "channel_sync_priv.h" -struct nvgpu_channel_sync_semaphore { - struct nvgpu_channel_sync base; - struct nvgpu_channel *c; - struct nvgpu_hw_semaphore *hw_sema; -}; - static struct nvgpu_channel_sync_semaphore * nvgpu_channel_sync_semaphore_from_base(struct nvgpu_channel_sync *base) { @@ -113,6 +107,34 @@ static void add_sema_incr_cmd(struct gk20a *g, struct nvgpu_channel *c, va, cmd); } +static void add_sema_incr_cmd_to_write_next_get(struct nvgpu_channel *c, + struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd, + struct nvgpu_hw_semaphore *hw_sema, + u32 entries) +{ + struct gk20a *g = c->g; + u32 ch = c->chid; + u64 va; + + /* release will need to write back to the semaphore memory. */ + va = nvgpu_semaphore_gpu_rw_va(s); + + /* find the right sema next_value to write (like syncpt's max). */ + nvgpu_semaphore_prepare_for_gpfifo_get(c, s, hw_sema, entries); + + /* + * gp.get should be updated only when all the cmds are completed. + * Hence forcing the wfi to be true always. 
+	 */
+	g->ops.sync.sema.add_incr_cmd(g, cmd, s, va, true);
+
+	gpu_sema_verbose_dbg(g, "(R) c=%u INCR %u (%u) pool=%-3llu "
+			"va=0x%llx entry=%p",
+			ch, nvgpu_semaphore_get_value(s),
+			nvgpu_semaphore_read(s),
+			nvgpu_semaphore_get_hw_pool_page_idx(s),
+			va, cmd);
+}
+
 static int channel_sync_semaphore_wait_fd(
 		struct nvgpu_channel_sync *s, int fd,
 		struct priv_cmd_entry **entry, u32 max_wait_cmds)
@@ -228,6 +250,42 @@ clean_up_sema:
 	return err;
 }

+s32 nvgpu_submit_create_gpfifo_tracking_semaphore(
+		struct nvgpu_channel_sync *s,
+		struct nvgpu_semaphore **semaphore,
+		struct priv_cmd_entry **incr_cmd,
+		u32 gpfifo_entries)
+{
+	u32 incr_cmd_size;
+	struct nvgpu_channel_sync_semaphore *sp =
+		nvgpu_channel_sync_semaphore_from_base(s);
+	struct nvgpu_channel *c = sp->c;
+	s32 err = 0;
+
+	*semaphore = nvgpu_semaphore_alloc(sp->hw_sema);
+	if (*semaphore == NULL) {
+		nvgpu_err(c->g,
+			"ran out of semaphores");
+		return -ENOMEM;
+	}
+
+	incr_cmd_size = c->g->ops.sync.sema.get_incr_cmd_size();
+	err = nvgpu_priv_cmdbuf_alloc(c->priv_cmd_q, incr_cmd_size, incr_cmd);
+	if (err != 0) {
+		goto clean_up_sema;
+	}
+
+	/* Release the completion semaphore. */
+	add_sema_incr_cmd_to_write_next_get(c, *semaphore, *incr_cmd,
+			sp->hw_sema, gpfifo_entries);
+
+	return 0;
+
+clean_up_sema:
+	nvgpu_semaphore_put(*semaphore);
+	return err;
+}
+
 static int channel_sync_semaphore_incr(
 		struct nvgpu_channel_sync *s,
 		struct priv_cmd_entry **entry,
@@ -396,3 +454,22 @@ err_free_sema:
 	nvgpu_kfree(g, sema);
 	return NULL;
 }
+
+void nvgpu_channel_sync_hw_semaphore_init(struct nvgpu_channel_sync *sync)
+{
+	struct nvgpu_channel_sync_semaphore *sp =
+		nvgpu_channel_sync_semaphore_from_base(sync);
+
+	nvgpu_hw_semaphore_set(sp->hw_sema, 0);
+	nvgpu_hw_semaphore_init_next(sp->hw_sema);
+}
+
+void nvgpu_channel_update_gpfifo_get(struct nvgpu_channel *c)
+{
+	struct nvgpu_channel_sync_semaphore *sp;
+
+	if (c->gpfifo_sync != NULL) {
+		sp = nvgpu_channel_sync_semaphore_from_base(c->gpfifo_sync);
+		c->gpfifo.get = nvgpu_hw_semaphore_read(sp->hw_sema);
+	}
+}
diff --git a/drivers/gpu/nvgpu/hal/fifo/userd_gv11b.c b/drivers/gpu/nvgpu/hal/fifo/userd_gv11b.c
index fed7c91fb..5753466f4 100644
--- a/drivers/gpu/nvgpu/hal/fifo/userd_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/fifo/userd_gv11b.c
@@ -28,6 +28,7 @@
 #include
 #include
+#include

 #include "userd_gv11b.h"

@@ -35,8 +36,20 @@ u32 gv11b_userd_gp_get(struct gk20a *g, struct nvgpu_channel *ch)
 {
 	struct nvgpu_mem *mem = ch->userd_mem;
 	u32 offset = ch->userd_offset / U32(sizeof(u32));
+	u32 ret;

-	return nvgpu_mem_rd32(g, mem, offset + ram_userd_gp_get_w());
+	/*
+	 * NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET is enabled when the userd get
+	 * pointer is no longer updated by the GPU.
+ */ + if (nvgpu_is_enabled(g, (u32)NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET)) { + nvgpu_channel_update_gpfifo_get(ch); + ret = ch->gpfifo.get; + } else { + ret = nvgpu_mem_rd32(g, mem, offset + ram_userd_gp_get_w()); + } + + return ret; } u64 gv11b_userd_pb_get(struct gk20a *g, struct nvgpu_channel *ch) diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index 158f33734..0f0d65790 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -387,6 +387,9 @@ struct nvgpu_channel { struct gpfifo_desc gpfifo; struct priv_cmd_queue *priv_cmd_q; struct nvgpu_channel_sync *sync; + struct nvgpu_channel_sync *gpfifo_sync; + /* lock for gpfifo hw_sema access */ + struct nvgpu_mutex gpfifo_hw_sema_lock; /* for job cleanup handling in the background worker */ struct nvgpu_list_node worker_item; #endif /* CONFIG_NVGPU_KERNEL_MODE_SUBMIT */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel_sync_semaphore.h b/drivers/gpu/nvgpu/include/nvgpu/channel_sync_semaphore.h index 72da85ecc..31c16db2f 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel_sync_semaphore.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel_sync_semaphore.h @@ -2,7 +2,7 @@ * * Nvgpu Channel Synchronization Abstraction (Semaphore) * - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -28,11 +28,18 @@ #include #include +#include "../../common/sync/channel_sync_priv.h" #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT struct nvgpu_channel; -struct nvgpu_channel_sync_semaphore; +struct nvgpu_channel_sync; + +struct nvgpu_channel_sync_semaphore { + struct nvgpu_channel_sync base; + struct nvgpu_channel *c; + struct nvgpu_hw_semaphore *hw_sema; +}; /* * Converts a valid struct nvgpu_channel_sync ptr to @@ -54,6 +61,14 @@ nvgpu_channel_sync_semaphore_hw_sema( */ struct nvgpu_channel_sync * nvgpu_channel_sync_semaphore_create(struct nvgpu_channel *c); +void nvgpu_channel_sync_hw_semaphore_init(struct nvgpu_channel_sync *sync); +void nvgpu_channel_update_gpfifo_get(struct nvgpu_channel *c); +s32 nvgpu_submit_create_gpfifo_tracking_semaphore( + struct nvgpu_channel_sync *s, + struct nvgpu_semaphore **semaphore, + struct priv_cmd_entry **incr_cmd, + u32 gpfifo_entries); + #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/job.h b/drivers/gpu/nvgpu/include/nvgpu/job.h index 68d77ca22..9c2c9f6f4 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/job.h +++ b/drivers/gpu/nvgpu/include/nvgpu/job.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -37,8 +37,10 @@ struct nvgpu_channel_job { struct nvgpu_mapped_buf **mapped_buffers; u32 num_mapped_buffers; struct nvgpu_fence_type post_fence; + struct nvgpu_semaphore *gpfifo_sema; struct priv_cmd_entry *wait_cmd; struct priv_cmd_entry *incr_cmd; + struct priv_cmd_entry *gpfifo_incr_cmd; struct nvgpu_list_node list; }; diff --git a/drivers/gpu/nvgpu/include/nvgpu/semaphore.h b/drivers/gpu/nvgpu/include/nvgpu/semaphore.h index 05c5353b5..86cb4f594 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/semaphore.h +++ b/drivers/gpu/nvgpu/include/nvgpu/semaphore.h @@ -35,6 +35,7 @@ struct nvgpu_hw_semaphore; struct nvgpu_semaphore; struct vm_gk20a; struct nvgpu_allocator; +struct nvgpu_channel; #define gpu_sema_dbg(g, fmt, args...) \ nvgpu_log(g, gpu_dbg_sema, fmt, ##args) @@ -77,6 +78,7 @@ u64 nvgpu_hw_semaphore_addr(struct nvgpu_hw_semaphore *hw_sema); u32 nvgpu_hw_semaphore_read(struct nvgpu_hw_semaphore *hw_sema); bool nvgpu_hw_semaphore_reset(struct nvgpu_hw_semaphore *hw_sema); void nvgpu_hw_semaphore_set(struct nvgpu_hw_semaphore *hw_sema, u32 val); +void nvgpu_hw_semaphore_init_next(struct nvgpu_hw_semaphore *hw_sema); int nvgpu_hw_semaphore_read_next(struct nvgpu_hw_semaphore *hw_sema); int nvgpu_hw_semaphore_update_next(struct nvgpu_hw_semaphore *hw_sema); @@ -99,6 +101,9 @@ bool nvgpu_semaphore_can_wait(struct nvgpu_semaphore *s); void nvgpu_semaphore_prepare(struct nvgpu_semaphore *s, struct nvgpu_hw_semaphore *hw_sema); +void nvgpu_semaphore_prepare_for_gpfifo_get(struct nvgpu_channel *c, + struct nvgpu_semaphore *s, + struct nvgpu_hw_semaphore *hw_sema, u32 new_entries); u64 nvgpu_semaphore_get_hw_pool_page_idx(struct nvgpu_semaphore *s); #endif /* NVGPU_SEMAPHORE_H */
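
Note (illustrative, not part of the patch): the tracking flow this patch
implements can be summarized outside the driver. Below is a minimal,
self-contained C sketch; all names here (hw_sema_sim, gpfifo_sim,
prepare_gpfifo_release, ...) are hypothetical stand-ins for the real nvgpu
structures, and it assumes gpfifo.entry_num is a power of two, which the
"& (entry_num - 1)" wrap in the patch implies. In the real driver the
semaphore release is emitted into the pushbuffer and the GPU performs the
write; here the GPU side is simulated by completing the job in order.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GPFIFO_ENTRY_NUM 64u /* must be a power of two for the & mask */

/* Hypothetical stand-in for the hw semaphore backing memory. */
struct hw_sema_sim {
	uint32_t value;      /* what the "GPU" has written (completed gp_get) */
	uint32_t next_value; /* CPU-side bookkeeping of the next release value */
};

struct gpfifo_sim {
	uint32_t get; /* consumed by GPU, tracked via the semaphore */
	uint32_t put; /* advanced by CPU on submit */
};

/*
 * On submit: compute the gpfifo get value that will be valid once this
 * job's entries are consumed, and remember it as the semaphore payload.
 * Mirrors nvgpu_semaphore_prepare_for_gpfifo_get(): next release value =
 * (previous next + new entries) & (entry_num - 1).
 */
static uint32_t prepare_gpfifo_release(struct hw_sema_sim *sema,
				       uint32_t new_entries)
{
	sema->next_value = (sema->next_value + new_entries) &
			   (GPFIFO_ENTRY_NUM - 1u);
	return sema->next_value; /* payload of the emitted semaphore INCR */
}

/* "GPU side": completing the job writes the payload to the semaphore. */
static void gpu_complete_job(struct hw_sema_sim *sema, uint32_t payload)
{
	sema->value = payload;
}

/*
 * On the next submit (or ctxsw-timeout check): refresh gpfifo.get from the
 * semaphore instead of USERD, mirroring nvgpu_channel_update_gpfifo_get().
 */
static void update_gpfifo_get(struct gpfifo_sim *gpfifo,
			      const struct hw_sema_sim *sema)
{
	gpfifo->get = sema->value;
}

int main(void)
{
	struct hw_sema_sim sema = { 0, 0 };
	struct gpfifo_sim gpfifo = { 0, 0 };

	/* One job: 5 user entries plus wait/incr/gpfifo-incr priv entries. */
	uint32_t entries = 5u + 3u; /* wait present: +3, matching the patch */
	uint32_t payload = prepare_gpfifo_release(&sema, entries);

	gpfifo.put = (gpfifo.put + entries) & (GPFIFO_ENTRY_NUM - 1u);
	gpu_complete_job(&sema, payload); /* GPU consumes the entries */

	update_gpfifo_get(&gpfifo, &sema);
	assert(gpfifo.get == gpfifo.put); /* fifo fully drained */
	printf("gpfifo get=%u put=%u\n", (unsigned)gpfifo.get,
	       (unsigned)gpfifo.put);
	return 0;
}

The invariant this relies on: once the GPU has executed the job's final
semaphore INCR, the semaphore payload equals the gpfifo get pointer that
USERD would have reported, so gv11b_userd_gp_get() can serve reads from
the semaphore location instead of USERD.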
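
Similarly, the priv cmdbuf sizing math from nvgpu_priv_cmdbuf_queue_alloc()
can be sanity-checked in isolation. A small sketch using the example word
counts from the comment (8, 10, 10), a hypothetical job_count of 128, and
plain arithmetic in place of nvgpu's safe-math helpers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example word counts from the comment; real values are per-chip. */
	uint32_t wait_size = 8u;         /* semaphore ACQ (fence-wait) */
	uint32_t incr_size = 10u;        /* semaphore INCR (post-fence) */
	uint32_t gpfifo_incr_size = 10u; /* semaphore INCR for gp.get */
	uint32_t job_count = 128u;       /* hypothetical queue depth */

	/* mem_per_job = (wait + incr + gpfifo_incr + 1) * sizeof(u32) */
	uint32_t mem_per_job = (wait_size + incr_size + gpfifo_incr_size + 1u)
			       * (uint32_t)sizeof(uint32_t);
	uint64_t size = (uint64_t)job_count * mem_per_job;

	/* With the 8 + 10 + 10 example words: (29 * 4) = 116 bytes per job. */
	printf("mem_per_job=%u bytes, total=%llu bytes\n",
	       (unsigned)mem_per_job, (unsigned long long)size);
	return 0;
}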