diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
index 9122ee3a2..477212ee6 100644
--- a/drivers/gpu/nvgpu/common/fifo/submit.c
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -44,25 +44,81 @@
  */
 #define EXTRA_GPFIFO_ENTRIES 2U
 
+static int nvgpu_submit_create_wait_cmd(struct nvgpu_channel *c,
+		struct nvgpu_channel_fence *fence,
+		struct priv_cmd_entry **wait_cmd, bool flag_sync_fence)
+{
+	/*
+	 * A single input sync fd may contain multiple fences. The preallocated
+	 * priv cmdbuf space covers exactly one wait per submit in the worst
+	 * case, so deterministic submits allow at most one; nondeterministic
+	 * submits allow more but fail with -EAGAIN when space runs out.
+	 */
+	u32 max_wait_cmds = nvgpu_channel_is_deterministic(c) ?
+			1U : 0U;
+	int err;
+
+	if (flag_sync_fence) {
+		nvgpu_assert(fence->id <= (u32)INT_MAX);
+		err = nvgpu_channel_sync_wait_fence_fd(c->sync,
+				(int)fence->id, wait_cmd, max_wait_cmds);
+	} else {
+		struct nvgpu_channel_sync_syncpt *sync_syncpt;
+
+		sync_syncpt = nvgpu_channel_sync_to_syncpt(c->sync);
+		if (sync_syncpt != NULL) {
+			err = nvgpu_channel_sync_wait_syncpt(sync_syncpt,
+					fence->id, fence->value, wait_cmd);
+		} else {
+			err = -EINVAL;
+		}
+	}
+
+	return err;
+}
+
+static int nvgpu_submit_create_incr_cmd(struct nvgpu_channel *c,
+		struct priv_cmd_entry **incr_cmd,
+		struct nvgpu_fence_type **post_fence, bool flag_fence_get,
+		bool need_wfi, bool need_sync_fence, bool register_irq)
+{
+	int err;
+
+	*post_fence = nvgpu_fence_alloc(c);
+	if (*post_fence == NULL) {
+		return -ENOMEM;
+	}
+
+	if (flag_fence_get) {
+		err = nvgpu_channel_sync_incr_user(c->sync, incr_cmd,
+				*post_fence, need_wfi, need_sync_fence,
+				register_irq);
+	} else {
+		err = nvgpu_channel_sync_incr(c->sync, incr_cmd,
+				*post_fence, need_sync_fence, register_irq);
+	}
+
+	if (err != 0) {
+		nvgpu_fence_put(*post_fence);
+		*post_fence = NULL;
+	}
+
+	return err;
+}
+
 /*
  * Handle the submit synchronization - pre-fences and post-fences.
  */
 static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
 				      struct nvgpu_channel_fence *fence,
 				      struct nvgpu_channel_job *job,
-				      struct priv_cmd_entry **wait_cmd,
-				      struct priv_cmd_entry **incr_cmd,
-				      struct nvgpu_fence_type **post_fence,
-				      bool register_irq,
-				      u32 flags)
+				      bool register_irq, u32 flags)
 {
 	struct gk20a *g = c->g;
-	bool need_sync_fence = false;
+	bool need_sync_fence;
 	bool new_sync_created = false;
-	int wait_fence_fd = -1;
 	int err = 0;
 	bool need_wfi = (flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI) == 0U;
-	struct nvgpu_channel_sync_syncpt *sync_syncpt = NULL;
 	bool flag_fence_get = (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) != 0U;
 	bool flag_sync_fence = (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) != 0U;
 	bool flag_fence_wait = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) != 0U;
@@ -83,70 +139,34 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
 	if ((g->ops.channel.set_syncpt != NULL) && new_sync_created) {
 		err = g->ops.channel.set_syncpt(c);
 		if (err != 0) {
-			goto clean_up_unlock;
+			goto clean_up_put_sync;
 		}
 	}
 
 	/*
 	 * Optionally insert syncpt/semaphore wait in the beginning of gpfifo
-	 * submission when user requested and the wait hasn't expired.
+	 * submission when requested by the user.
 	 */
 	if (flag_fence_wait) {
-		u32 max_wait_cmds = nvgpu_channel_is_deterministic(c) ?
-			1U : 0U;
-
-		if (flag_sync_fence) {
-			nvgpu_assert(fence->id <= (u32)INT_MAX);
-			wait_fence_fd = (int)fence->id;
-			err = nvgpu_channel_sync_wait_fence_fd(c->sync,
-				wait_fence_fd, &job->wait_cmd, max_wait_cmds);
-		} else {
-			sync_syncpt = nvgpu_channel_sync_to_syncpt(c->sync);
-			if (sync_syncpt != NULL) {
-				err = nvgpu_channel_sync_wait_syncpt(
-					sync_syncpt, fence->id,
-					fence->value, &job->wait_cmd);
-			} else {
-				err = -EINVAL;
-			}
-		}
-
+		err = nvgpu_submit_create_wait_cmd(c, fence, &job->wait_cmd,
+				flag_sync_fence);
 		if (err != 0) {
-			goto clean_up_unlock;
+			goto clean_up_put_sync;
 		}
-
-		*wait_cmd = job->wait_cmd;
 	}
 
-	if (flag_fence_get && flag_sync_fence) {
-		need_sync_fence = true;
-	}
+	need_sync_fence = flag_fence_get && flag_sync_fence;
 
 	/*
-	 * Always generate an increment at the end of a GPFIFO submission. This
-	 * is used to keep track of method completion for idle railgating. The
-	 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
+	 * Always generate an increment at the end of a GPFIFO submission. When
+	 * jobs are tracked, a post fence is needed to track completion even
+	 * when the user does not request one.
 	 */
-	job->post_fence = nvgpu_fence_alloc(c);
-	if (job->post_fence == NULL) {
-		err = -ENOMEM;
-		goto clean_up_wait_cmd;
-	}
-
-	if (flag_fence_get) {
-		err = nvgpu_channel_sync_incr_user(c->sync,
-			&job->incr_cmd, job->post_fence, need_wfi,
-			need_sync_fence, register_irq);
-	} else {
-		err = nvgpu_channel_sync_incr(c->sync,
-			&job->incr_cmd, job->post_fence, need_sync_fence,
+	err = nvgpu_submit_create_incr_cmd(c, &job->incr_cmd, &job->post_fence,
+			flag_fence_get, need_wfi, need_sync_fence,
 			register_irq);
-	}
-	if (err == 0) {
-		*incr_cmd = job->incr_cmd;
-		*post_fence = job->post_fence;
-	} else {
-		goto clean_up_post_fence;
+	if (err != 0) {
+		goto clean_up_wait_cmd;
 	}
 
 	if (g->aggressive_sync_destroy_thresh != 0U) {
@@ -154,19 +174,22 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
 	}
 
 	return 0;
-clean_up_post_fence:
-	nvgpu_fence_put(job->post_fence);
-	job->post_fence = NULL;
 clean_up_wait_cmd:
 	if (job->wait_cmd != NULL) {
 		nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->wait_cmd);
 	}
 	job->wait_cmd = NULL;
+clean_up_put_sync:
+	if (g->aggressive_sync_destroy_thresh != 0U) {
+		if (nvgpu_channel_sync_put_ref_and_check(c->sync)
+				&& g->aggressive_sync_destroy) {
+			nvgpu_channel_sync_destroy(c->sync);
+		}
+	}
 clean_up_unlock:
 	if (g->aggressive_sync_destroy_thresh != 0U) {
 		nvgpu_mutex_release(&c->sync_lock);
 	}
-	*wait_cmd = NULL;
 	return err;
 }
 
@@ -323,9 +346,6 @@ static int nvgpu_submit_prepare_gpfifo_track(struct nvgpu_channel *c,
 {
 	bool skip_buffer_refcounting = (flags &
 			NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING) != 0U;
-	struct nvgpu_fence_type *post_fence = NULL;
-	struct priv_cmd_entry *wait_cmd = NULL;
-	struct priv_cmd_entry *incr_cmd = NULL;
 	struct nvgpu_channel_job *job = NULL;
 	int err;
 
@@ -334,8 +354,8 @@
 		return err;
 	}
 
-	err = nvgpu_submit_prepare_syncs(c, fence, job, &wait_cmd, &incr_cmd,
-			&post_fence, need_deferred_cleanup, flags);
+	err = nvgpu_submit_prepare_syncs(c, fence, job, need_deferred_cleanup,
+			flags);
 	if (err != 0) {
 		goto clean_up_job;
 	}
@@ -347,30 +367,34 @@
 	 * android sync framework for example can provide entirely
 	 * empty fences that act like trivially expired waits.
 	 */
-	if (wait_cmd != NULL) {
-		nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);
+	if (job->wait_cmd != NULL) {
+		nvgpu_submit_append_priv_cmdbuf(c, job->wait_cmd);
 	}
 
 	err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata, num_entries);
 	if (err != 0) {
-		goto clean_up_fence;
+		goto clean_up_syncs;
 	}
 
-	nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);
+	nvgpu_submit_append_priv_cmdbuf(c, job->incr_cmd);
 
 	err = nvgpu_channel_add_job(c, job, skip_buffer_refcounting);
 	if (err != 0) {
-		goto clean_up_fence;
+		goto clean_up_syncs;
 	}
 
 	if (fence_out != NULL) {
-		*fence_out = nvgpu_fence_get(post_fence);
+		*fence_out = nvgpu_fence_get(job->post_fence);
 	}
 
 	return 0;
 
-clean_up_fence:
-	nvgpu_fence_put(post_fence);
+clean_up_syncs:
+	nvgpu_fence_put(job->post_fence);
+	nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->incr_cmd);
+	if (job->wait_cmd != NULL) {
+		nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->wait_cmd);
+	}
 clean_up_job:
 	nvgpu_channel_free_job(c, job);
 	return err;
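
The rollback order in clean_up_syncs (job->incr_cmd first, then job->wait_cmd) is the reverse of the order in which the two entries were created. That is assumed here to reflect the put-pointer discipline of the priv cmdbuf queue, where only the most recent allocation can be undone. A minimal, self-contained model of that LIFO property follows; cmd_queue, cmd_entry, cmd_alloc and cmd_rollback are invented for this sketch and are not the nvgpu API:

#include <assert.h>
#include <stdio.h>

/*
 * Hypothetical model of a put-pointer command queue: an allocation bumps
 * "put", and a rollback is only valid for the most recent allocation, so
 * several entries must be rolled back in reverse (LIFO) order.
 */
struct cmd_queue { unsigned int put; };
struct cmd_entry { unsigned int start; unsigned int size; };

static void cmd_alloc(struct cmd_queue *q, struct cmd_entry *e,
		unsigned int size)
{
	e->start = q->put;
	e->size = size;
	q->put += size;
}

static void cmd_rollback(struct cmd_queue *q, const struct cmd_entry *e)
{
	/* Only the entry currently on top of the queue can be undone. */
	assert(q->put == e->start + e->size);
	q->put = e->start;
}

int main(void)
{
	struct cmd_queue q = { 0U };
	struct cmd_entry wait_cmd, incr_cmd;

	cmd_alloc(&q, &wait_cmd, 4U);	/* allocated first, like job->wait_cmd */
	cmd_alloc(&q, &incr_cmd, 8U);	/* allocated second, like job->incr_cmd */

	/* Error path: undo in reverse order, as clean_up_syncs does. */
	cmd_rollback(&q, &incr_cmd);
	cmd_rollback(&q, &wait_cmd);

	printf("put rewound to %u\n", q.put);	/* prints 0 */
	return 0;
}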
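
The error labels added to nvgpu_submit_prepare_syncs follow the usual kernel unwind pattern: the later a step fails, the earlier the label it jumps to, and control then falls through every remaining cleanup in reverse step order. Below is a compilable sketch of that label ordering only; step, undo and fail_at are stand-ins invented for the sketch, representing the sync ref, wait cmd and incr cmd steps of the patch:

#include <errno.h>
#include <stdio.h>

static int fail_at;	/* which step fails; 0 means none */

static int step(int n)
{
	return (fail_at == n) ? -EAGAIN : 0;
}

static void undo(const char *what)
{
	printf("undo %s\n", what);
}

/*
 * Mirrors the label ordering of nvgpu_submit_prepare_syncs() after this
 * patch: a failure at step N jumps to the label that undoes steps 1..N-1,
 * falling through the remaining cleanups in reverse step order.
 */
static int prepare(void)
{
	int err;

	err = step(1);		/* take the sync ref (under sync_lock) */
	if (err != 0) {
		goto clean_up_unlock;
	}
	err = step(2);		/* create the wait cmd */
	if (err != 0) {
		goto clean_up_put_sync;
	}
	err = step(3);		/* create the incr cmd and post fence */
	if (err != 0) {
		goto clean_up_wait_cmd;
	}

	return 0;

clean_up_wait_cmd:
	undo("wait cmd");
clean_up_put_sync:
	undo("sync ref");
clean_up_unlock:
	undo("sync_lock");
	return err;
}

int main(void)
{
	for (fail_at = 1; fail_at <= 3; fail_at++) {
		printf("fail at step %d -> err %d\n", fail_at, prepare());
	}
	return 0;
}

Running the sketch shows each failure point triggering exactly the cleanups for the steps that had already succeeded, which is the invariant the relocated labels in this patch preserve.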