diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index da8dade64..f915f863e 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -217,7 +217,7 @@ static void channel_kernelmode_deinit(struct nvgpu_channel *ch)
 #endif
         (void) memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
 
-        nvgpu_free_priv_cmdbuf_queue(ch);
+        nvgpu_priv_cmdbuf_queue_free(ch);
 
         /* free pre-allocated resources, if applicable */
         if (nvgpu_channel_is_prealloc_enabled(ch)) {
@@ -375,7 +375,7 @@ static int channel_setup_kernelmode(struct nvgpu_channel *c,
                 }
         }
 
-        err = nvgpu_alloc_priv_cmdbuf_queue(c, args->num_inflight_jobs);
+        err = nvgpu_priv_cmdbuf_queue_alloc(c, args->num_inflight_jobs);
         if (err != 0) {
                 goto clean_up_prealloc;
         }
@@ -388,7 +388,7 @@ static int channel_setup_kernelmode(struct nvgpu_channel *c,
         return 0;
 
 clean_up_priv_cmd:
-        nvgpu_free_priv_cmdbuf_queue(c);
+        nvgpu_priv_cmdbuf_queue_free(c);
 clean_up_prealloc:
         if (nvgpu_channel_is_deterministic(c) && args->num_inflight_jobs != 0U) {
@@ -998,10 +998,10 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
                  * Free the private command buffers (wait_cmd first and
                  * then incr_cmd i.e. order of allocation)
                  */
-                nvgpu_channel_update_priv_cmd_q_and_free_entry(c,
-                        job->wait_cmd);
-                nvgpu_channel_update_priv_cmd_q_and_free_entry(c,
-                        job->incr_cmd);
+                if (job->wait_cmd != NULL) {
+                        nvgpu_priv_cmdbuf_free(c, job->wait_cmd);
+                }
+                nvgpu_priv_cmdbuf_free(c, job->incr_cmd);
 
                 /*
                  * ensure all pending writes complete before freeing up the job.
diff --git a/drivers/gpu/nvgpu/common/fifo/job.c b/drivers/gpu/nvgpu/common/fifo/job.c
index 8a7d1aae6..0f5fe56d9 100644
--- a/drivers/gpu/nvgpu/common/fifo/job.c
+++ b/drivers/gpu/nvgpu/common/fifo/job.c
@@ -76,17 +76,8 @@ int nvgpu_channel_alloc_job(struct nvgpu_channel *c,
 void nvgpu_channel_free_job(struct nvgpu_channel *c,
                 struct nvgpu_channel_job *job)
 {
-        /*
-         * In case of pre_allocated jobs, we need to clean out
-         * the job but maintain the pointers to the priv_cmd_entry,
-         * since they're inherently tied to the job node.
-         */
         if (nvgpu_channel_is_prealloc_enabled(c)) {
-                struct priv_cmd_entry *wait_cmd = job->wait_cmd;
-                struct priv_cmd_entry *incr_cmd = job->incr_cmd;
                 (void) memset(job, 0, sizeof(*job));
-                job->wait_cmd = wait_cmd;
-                job->incr_cmd = incr_cmd;
         } else {
                 nvgpu_kfree(c->g, job);
         }
@@ -168,10 +159,8 @@ bool nvgpu_channel_joblist_is_empty(struct nvgpu_channel *c)
 int channel_prealloc_resources(struct nvgpu_channel *ch, u32 num_jobs)
 {
 #ifdef CONFIG_NVGPU_DETERMINISTIC_CHANNELS
-        unsigned int i;
         int err;
         size_t size;
-        struct priv_cmd_entry *entries = NULL;
         if ((nvgpu_channel_is_prealloc_enabled(ch)) || (num_jobs == 0U)) {
                 return -EINVAL;
         }
@@ -192,32 +181,10 @@ int channel_prealloc_resources(struct nvgpu_channel *ch, u32 num_jobs)
                 goto clean_up;
         }
 
-        /*
-         * pre-allocate 2x priv_cmd_entry for each job up front.
-         * since vmalloc take in an unsigned long, we need
-         * to make sure we don't hit an overflow condition
-         */
-        size = sizeof(struct priv_cmd_entry);
-        if (num_jobs <= U32_MAX / (size << 1U)) {
-                entries = nvgpu_vzalloc(ch->g,
-                                ((unsigned long)num_jobs << 1UL) *
-                                (unsigned long)size);
-        }
-        if (entries == NULL) {
-                err = -ENOMEM;
-                goto clean_up_joblist;
-        }
-
-        for (i = 0; i < num_jobs; i++) {
-                ch->joblist.pre_alloc.jobs[i].wait_cmd = &entries[i];
-                ch->joblist.pre_alloc.jobs[i].incr_cmd =
-                                &entries[i + num_jobs];
-        }
-
         /* pre-allocate a fence pool */
         err = nvgpu_fence_pool_alloc(ch, num_jobs);
         if (err != 0) {
-                goto clean_up_priv_cmd;
+                goto clean_up;
         }
 
         ch->joblist.pre_alloc.length = num_jobs;
@@ -234,11 +201,8 @@ int channel_prealloc_resources(struct nvgpu_channel *ch, u32 num_jobs)
 
         return 0;
 
-clean_up_priv_cmd:
-        nvgpu_vfree(ch->g, entries);
-clean_up_joblist:
-        nvgpu_vfree(ch->g, ch->joblist.pre_alloc.jobs);
 clean_up:
+        nvgpu_vfree(ch->g, ch->joblist.pre_alloc.jobs);
         (void) memset(&ch->joblist.pre_alloc, 0, sizeof(ch->joblist.pre_alloc));
         return err;
 #else
diff --git a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
index 5555c6b9e..b7eea852e 100644
--- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
+++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
@@ -31,17 +31,24 @@
 #include
 #include
 #include
+#include
 
 struct priv_cmd_queue {
-        struct nvgpu_mem mem;
-        u32 size;       /* num of entries in words */
-        u32 put;        /* put for priv cmd queue */
-        u32 get;        /* get for priv cmd queue */
+        struct nvgpu_mem mem; /* pushbuf */
+        u32 size; /* allocated length in words */
+        u32 put; /* next entry will begin here */
+        u32 get; /* next entry to free begins here */
+
+        /* an entry is a fragment of the pushbuf memory */
+        struct priv_cmd_entry *entries;
+        u32 entries_len; /* allocated length */
+        u32 entry_put;
+        u32 entry_get;
 };
 
 /* allocate private cmd buffer queue.
    used for inserting commands before/after user submitted buffers. */
-int nvgpu_alloc_priv_cmdbuf_queue(struct nvgpu_channel *ch,
+int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
                 u32 num_in_flight)
 {
         struct gk20a *g = ch->g;
@@ -69,7 +76,9 @@ int nvgpu_alloc_priv_cmdbuf_queue(struct nvgpu_channel *ch,
          * Compute the amount of priv_cmdbuf space we need. In general the
          * worst case is the kernel inserts both a semaphore pre-fence and
          * post-fence. Any sync-pt fences will take less memory so we can
-         * ignore them unless they're the only supported type.
+         * ignore them unless they're the only supported type. Jobs can also
+         * have more than one pre-fence but that's abnormal and we'll -EAGAIN
+         * if such jobs would fill the queue.
          *
          * A semaphore ACQ (fence-wait) is 8 words: semaphore_a, semaphore_b,
         * semaphore_c, and semaphore_d. A semaphore INCR (fence-get) will be
@@ -95,21 +104,40 @@
          */
         if (num_in_flight == 0U) {
                 /* round down to ensure space for all priv cmds */
-                num_in_flight = ch->gpfifo.entry_num / 3;
+                num_in_flight = ch->gpfifo.entry_num / 3U;
         }
 
         size = num_in_flight * (wait_size + incr_size) * sizeof(u32);
         tmp_size = PAGE_ALIGN(roundup_pow_of_two(size));
-        nvgpu_assert(tmp_size <= U32_MAX);
+        if (tmp_size > U32_MAX) {
+                return -ERANGE;
+        }
         size = (u32)tmp_size;
 
         q = nvgpu_kzalloc(g, sizeof(*q));
+        if (q == NULL) {
+                return -ENOMEM;
+        }
+
+        if (num_in_flight > U32_MAX / 2U) {
+                err = -ERANGE;
+                goto err_free_queue;
+        }
+
+        q->entries_len = 2U * num_in_flight;
+        q->entries = nvgpu_vzalloc(g,
+                        nvgpu_safe_mult_u64((u64)q->entries_len,
+                                sizeof(*q->entries)));
+        if (q->entries == NULL) {
+                err = -ENOMEM;
+                goto err_free_queue;
+        }
 
         err = nvgpu_dma_alloc_map_sys(ch_vm, size, &q->mem);
         if (err != 0) {
                 nvgpu_err(g, "%s: memory allocation failed", __func__);
-                goto err_free_buf;
+                goto err_free_entries;
         }
 
         tmp_size = q->mem.size / sizeof(u32);
@@ -119,53 +147,62 @@ int nvgpu_alloc_priv_cmdbuf_queue(struct nvgpu_channel *ch,
         ch->priv_cmd_q = q;
 
         return 0;
 
-err_free_buf:
+err_free_entries:
+        nvgpu_vfree(g, q->entries);
+err_free_queue:
         nvgpu_kfree(g, q);
         return err;
 }
 
-void nvgpu_free_priv_cmdbuf_queue(struct nvgpu_channel *ch)
+void nvgpu_priv_cmdbuf_queue_free(struct nvgpu_channel *ch)
 {
         struct vm_gk20a *ch_vm = ch->vm;
         struct priv_cmd_queue *q = ch->priv_cmd_q;
+        struct gk20a *g = ch->g;
 
         if (q == NULL) {
                 return;
         }
 
         nvgpu_dma_unmap_free(ch_vm, &q->mem);
-        nvgpu_kfree(ch->g, q);
+        nvgpu_vfree(g, q->entries);
+        nvgpu_kfree(g, q);
         ch->priv_cmd_q = NULL;
 }
 
 /* allocate a cmd buffer with given size. size is number of u32 entries */
-int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
+static int nvgpu_priv_cmdbuf_alloc_buf(struct nvgpu_channel *c, u32 orig_size,
         struct priv_cmd_entry *e)
 {
         struct priv_cmd_queue *q = c->priv_cmd_q;
-        u32 free_count;
         u32 size = orig_size;
+        u32 free_count;
 
         nvgpu_log_fn(c->g, "size %d", orig_size);
 
-        if (e == NULL) {
-                nvgpu_err(c->g,
-                        "ch %d: priv cmd entry is null",
-                        c->chid);
-                return -EINVAL;
-        }
-
-        /* if free space in the end is less than requested, increase the size
-         * to make the real allocated space start from beginning. */
-        if (q->put + size > q->size) {
+        /*
+         * If free space in the end is less than requested, increase the size
+         * to make the real allocated space start from beginning. The hardware
+         * expects each cmdbuf to be contiguous in the dma space.
+         *
+         * This too small extra space in the end may happen because the
+         * requested wait and incr command buffers do not necessarily align
+         * with the whole buffer capacity. They don't always align because the
+         * buffer size is rounded to the next power of two and because not all
+         * jobs necessarily use exactly one wait command.
+         */
+        if (nvgpu_safe_add_u32(q->put, size) > q->size) {
                 size = orig_size + (q->size - q->put);
         }
 
         nvgpu_log_info(c->g, "ch %d: priv cmd queue get:put %d:%d",
                         c->chid, q->get, q->put);
 
-        free_count = (q->size - (q->put - q->get) - 1U) % q->size;
+        nvgpu_assert(q->put < q->size);
+        nvgpu_assert(q->get < q->size);
+        nvgpu_assert(q->size > 0U);
+        free_count = (q->size - q->put + q->get - 1U) & (q->size - 1U);
 
         if (size > free_count) {
                 return -EAGAIN;
@@ -173,17 +210,22 @@ int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
         e->fill_off = 0;
         e->size = orig_size;
+        e->alloc_size = size;
         e->mem = &q->mem;
 
-        /* if we have increased size to skip free space in the end, set put
-           to beginning of cmd buffer (0) + size */
+        /*
+         * if we have increased size to skip free space in the end, set put
+         * to beginning of cmd buffer + size, as if the prev put was at
+         * position 0.
+         */
         if (size != orig_size) {
                 e->off = 0;
                 e->gva = q->mem.gpu_va;
                 q->put = orig_size;
         } else {
                 e->off = q->put;
-                e->gva = q->mem.gpu_va + q->put * sizeof(u32);
+                e->gva = nvgpu_safe_add_u64(q->mem.gpu_va,
+                                nvgpu_safe_mult_u64((u64)q->put, sizeof(u32)));
                 q->put = (q->put + orig_size) & (q->size - 1U);
         }
 
@@ -193,7 +235,7 @@ int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
         /*
          * commit the previous writes before making the entry valid.
          * see the corresponding nvgpu_smp_rmb() in
-         * nvgpu_channel_update_priv_cmd_q_and_free_entry().
+         * nvgpu_priv_cmdbuf_free().
          */
         nvgpu_smp_wmb();
 
@@ -203,30 +245,54 @@ int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
         return 0;
 }
 
-/*
- * Don't call this to free an explicit cmd entry.
- * It doesn't update priv_cmd_queue get/put.
- */
-void nvgpu_channel_free_priv_cmd_entry(struct nvgpu_channel *c,
-                struct priv_cmd_entry *e)
+int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
+                struct priv_cmd_entry **e)
 {
-        if (nvgpu_channel_is_prealloc_enabled(c)) {
-                (void) memset(e, 0, sizeof(struct priv_cmd_entry));
-        } else {
-                nvgpu_kfree(c->g, e);
+        struct priv_cmd_queue *q = c->priv_cmd_q;
+        u32 next_put = nvgpu_safe_add_u32(q->entry_put, 1U) % q->entries_len;
+        struct priv_cmd_entry *entry;
+        int err;
+
+        if (next_put == q->entry_get) {
+                return -EAGAIN;
         }
+        entry = &q->entries[q->entry_put];
+
+        err = nvgpu_priv_cmdbuf_alloc_buf(c, size, entry);
+        if (err != 0) {
+                return err;
+        }
+
+        q->entry_put = next_put;
+        *e = entry;
+
+        return 0;
 }
 
-void nvgpu_channel_update_priv_cmd_q_and_free_entry(
-                struct nvgpu_channel *ch, struct priv_cmd_entry *e)
+void nvgpu_priv_cmdbuf_rollback(struct nvgpu_channel *ch,
+                struct priv_cmd_entry *e)
+{
+        struct priv_cmd_queue *q = ch->priv_cmd_q;
+
+        nvgpu_assert(q->put < q->size);
+        nvgpu_assert(q->size > 0U);
+        nvgpu_assert(e->alloc_size <= q->size);
+        q->put = (q->put + q->size - e->alloc_size) & (q->size - 1U);
+
+        (void)memset(e, 0, sizeof(*e));
+
+        nvgpu_assert(q->entry_put < q->entries_len);
+        nvgpu_assert(q->entries_len > 0U);
+        q->entry_put = (q->entry_put + q->entries_len - 1U)
+                        % q->entries_len;
+}
+
+void nvgpu_priv_cmdbuf_free(struct nvgpu_channel *ch,
+                struct priv_cmd_entry *e)
 {
         struct priv_cmd_queue *q = ch->priv_cmd_q;
         struct gk20a *g = ch->g;
 
-        if (e == NULL) {
-                return;
-        }
-
         if (e->valid) {
                 /* read the entry's valid flag before reading its contents */
                 nvgpu_smp_rmb();
@@ -234,10 +300,13 @@ void nvgpu_channel_update_priv_cmd_q_and_free_entry(
                         nvgpu_err(g, "requests out-of-order, ch=%d",
                                 ch->chid);
                 }
-                q->get = e->off + e->size;
+                nvgpu_assert(q->size > 0U);
+                q->get = nvgpu_safe_add_u32(e->off, e->size) & (q->size - 1U);
+                q->entry_get = nvgpu_safe_add_u32(q->entry_get, 1U)
+                                % q->entries_len;
         }
 
-        nvgpu_channel_free_priv_cmd_entry(ch, e);
+        (void)memset(e, 0, sizeof(*e));
 }
 
 void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
index 9d86adee6..70dfb7366 100644
--- a/drivers/gpu/nvgpu/common/fifo/submit.c
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -62,7 +62,6 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
         int wait_fence_fd = -1;
         int err = 0;
         bool need_wfi = (flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI) == 0U;
-        bool pre_alloc_enabled = nvgpu_channel_is_prealloc_enabled(c);
         struct nvgpu_channel_sync_syncpt *sync_syncpt = NULL;
         bool flag_fence_get = (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) != 0U;
         bool flag_sync_fence = (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) != 0U;
@@ -74,7 +73,7 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
                         c->sync = nvgpu_channel_sync_create(c);
                         if (c->sync == NULL) {
                                 err = -ENOMEM;
-                                goto fail;
+                                goto clean_up_unlock;
                         }
                         new_sync_created = true;
                 }
@@ -84,7 +83,7 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
         if ((g->ops.channel.set_syncpt != NULL) && new_sync_created) {
                 err = g->ops.channel.set_syncpt(c);
                 if (err != 0) {
-                        goto fail;
+                        goto clean_up_unlock;
                 }
         }
 
@@ -96,40 +95,27 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
                 u32 max_wait_cmds = nvgpu_channel_is_deterministic(c) ?
                         1U : 0U;
 
-                if (!pre_alloc_enabled) {
-                        job->wait_cmd = nvgpu_kzalloc(g,
-                                sizeof(struct priv_cmd_entry));
-                }
-
-                if (job->wait_cmd == NULL) {
-                        err = -ENOMEM;
-                        goto fail;
-                }
-
                 if (flag_sync_fence) {
                         nvgpu_assert(fence->id <= (u32)INT_MAX);
                         wait_fence_fd = (int)fence->id;
                         err = nvgpu_channel_sync_wait_fence_fd(c->sync,
-                                wait_fence_fd, job->wait_cmd, max_wait_cmds);
+                                wait_fence_fd, &job->wait_cmd, max_wait_cmds);
                 } else {
                         sync_syncpt = nvgpu_channel_sync_to_syncpt(c->sync);
                         if (sync_syncpt != NULL) {
                                 err = nvgpu_channel_sync_wait_syncpt(
                                         sync_syncpt, fence->id,
-                                        fence->value, job->wait_cmd);
+                                        fence->value, &job->wait_cmd);
                         } else {
                                 err = -EINVAL;
                         }
                 }
 
                 if (err != 0) {
-                        goto clean_up_wait_cmd;
+                        goto clean_up_unlock;
                 }
 
-                if (job->wait_cmd->valid) {
-                        /* not expired yet */
-                        *wait_cmd = job->wait_cmd;
-                }
+                *wait_cmd = job->wait_cmd;
         }
 
         if (flag_fence_get && flag_sync_fence) {
@@ -146,29 +132,21 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
                 err = -ENOMEM;
                 goto clean_up_wait_cmd;
         }
-        if (!pre_alloc_enabled) {
-                job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
-        }
-
-        if (job->incr_cmd == NULL) {
-                err = -ENOMEM;
-                goto clean_up_post_fence;
-        }
 
         if (flag_fence_get) {
                 err = nvgpu_channel_sync_incr_user(c->sync,
-                        job->incr_cmd, job->post_fence, need_wfi,
+                        &job->incr_cmd, job->post_fence, need_wfi,
                         need_sync_fence, register_irq);
         } else {
                 err = nvgpu_channel_sync_incr(c->sync,
-                        job->incr_cmd, job->post_fence, need_sync_fence,
+                        &job->incr_cmd, job->post_fence, need_sync_fence,
                         register_irq);
         }
         if (err == 0) {
                 *incr_cmd = job->incr_cmd;
                 *post_fence = job->post_fence;
         } else {
-                goto clean_up_incr_cmd;
+                goto clean_up_post_fence;
         }
 
         if (g->aggressive_sync_destroy_thresh != 0U) {
@@ -176,22 +154,15 @@ static int nvgpu_submit_prepare_syncs(struct nvgpu_channel *c,
         }
         return 0;
 
-clean_up_incr_cmd:
-        nvgpu_channel_free_priv_cmd_entry(c, job->incr_cmd);
-        if (!pre_alloc_enabled) {
-                job->incr_cmd = NULL;
-        }
 clean_up_post_fence:
         nvgpu_fence_put(job->post_fence);
         job->post_fence = NULL;
 clean_up_wait_cmd:
         if (job->wait_cmd != NULL) {
-                nvgpu_channel_free_priv_cmd_entry(c, job->wait_cmd);
+                nvgpu_priv_cmdbuf_rollback(c, job->wait_cmd);
         }
-        if (!pre_alloc_enabled) {
-                job->wait_cmd = NULL;
-        }
-fail:
+        job->wait_cmd = NULL;
+clean_up_unlock:
         if (g->aggressive_sync_destroy_thresh != 0U) {
                 nvgpu_mutex_release(&c->sync_lock);
         }
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync.c b/drivers/gpu/nvgpu/common/sync/channel_sync.c
index c773f7469..0d49052d1 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync.c
@@ -58,21 +58,21 @@ bool nvgpu_channel_sync_needs_os_fence_framework(struct gk20a *g)
 }
 
 int nvgpu_channel_sync_wait_fence_fd(struct nvgpu_channel_sync *s, int fd,
-        struct priv_cmd_entry *entry, u32 max_wait_cmds)
+        struct priv_cmd_entry **entry, u32 max_wait_cmds)
 {
         return s->ops->wait_fence_fd(s, fd, entry, max_wait_cmds);
 }
 
 int nvgpu_channel_sync_incr(struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry, struct nvgpu_fence_type *fence,
+        struct priv_cmd_entry **entry, struct nvgpu_fence_type *fence,
         bool need_sync_fence, bool register_irq)
 {
         return s->ops->incr(s, entry, fence, need_sync_fence, register_irq);
 }
 
 int nvgpu_channel_sync_incr_user(struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry, struct nvgpu_fence_type *fence, bool wfi,
-        bool need_sync_fence, bool register_irq)
+        struct priv_cmd_entry **entry, struct nvgpu_fence_type *fence,
+        bool wfi, bool need_sync_fence, bool register_irq)
 {
         return s->ops->incr_user(s, entry, fence, wfi, need_sync_fence,
                         register_irq);
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_priv.h b/drivers/gpu/nvgpu/common/sync/channel_sync_priv.h
index 69f8bbcfc..77966bcdb 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_priv.h
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_priv.h
@@ -54,19 +54,19 @@ struct nvgpu_channel_sync {
  */
 struct nvgpu_channel_sync_ops {
         int (*wait_fence_raw)(struct nvgpu_channel_sync *s, u32 id, u32 thresh,
-                                struct priv_cmd_entry *entry);
+                                struct priv_cmd_entry **entry);
         int (*wait_fence_fd)(struct nvgpu_channel_sync *s, int fd,
-                                struct priv_cmd_entry *entry, u32 max_wait_cmds);
+                                struct priv_cmd_entry **entry, u32 max_wait_cmds);
         int (*incr)(struct nvgpu_channel_sync *s,
-                                struct priv_cmd_entry *entry,
+                                struct priv_cmd_entry **entry,
                                 struct nvgpu_fence_type *fence,
                                 bool need_sync_fence,
                                 bool register_irq);
         int (*incr_user)(struct nvgpu_channel_sync *s,
-                                struct priv_cmd_entry *entry,
+                                struct priv_cmd_entry **entry,
                                 struct nvgpu_fence_type *fence,
                                 bool wfi,
                                 bool need_sync_fence,
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
index ad8e6df0a..0a8eeee62 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
@@ -114,7 +114,7 @@ static void channel_sync_semaphore_gen_wait_cmd(struct nvgpu_channel *c,
 
 static int channel_sync_semaphore_wait_fd(
                 struct nvgpu_channel_sync *s, int fd,
-                struct priv_cmd_entry *entry, u32 max_wait_cmds)
+                struct priv_cmd_entry **entry, u32 max_wait_cmds)
 {
         struct nvgpu_channel_sync_semaphore *sema =
                 nvgpu_channel_sync_semaphore_from_base(s);
@@ -148,17 +148,16 @@ static int channel_sync_semaphore_wait_fd(
         }
 
         wait_cmd_size = c->g->ops.sync.sema.get_wait_cmd_size();
-        err = nvgpu_channel_alloc_priv_cmdbuf(c,
+        err = nvgpu_priv_cmdbuf_alloc(c,
                 wait_cmd_size * num_fences, entry);
         if (err != 0) {
-                nvgpu_err(c->g, "not enough priv cmd buffer space");
                 goto cleanup;
         }
 
         for (i = 0; i < num_fences; i++) {
                 nvgpu_os_fence_sema_extract_nth_semaphore(
                         &os_fence_sema, i, &semaphore);
-                channel_sync_semaphore_gen_wait_cmd(c, semaphore, entry,
+                channel_sync_semaphore_gen_wait_cmd(c, semaphore, *entry,
                         wait_cmd_size);
         }
 
@@ -169,7 +168,7 @@ cleanup:
 
 static int channel_sync_semaphore_incr_common(
         struct nvgpu_channel_sync *s, bool wfi_cmd,
-        struct priv_cmd_entry *incr_cmd,
+        struct priv_cmd_entry **incr_cmd,
         struct nvgpu_fence_type *fence,
         bool need_sync_fence)
 {
@@ -189,39 +188,37 @@ static int channel_sync_semaphore_incr_common(
         }
 
         incr_cmd_size = c->g->ops.sync.sema.get_incr_cmd_size();
-        err = nvgpu_channel_alloc_priv_cmdbuf(c, incr_cmd_size, incr_cmd);
+        err = nvgpu_priv_cmdbuf_alloc(c, incr_cmd_size, incr_cmd);
         if (err != 0) {
-                nvgpu_err(c->g,
-                        "not enough priv cmd buffer space");
                 goto clean_up_sema;
         }
 
         /* Release the completion semaphore. */
-        add_sema_incr_cmd(c->g, c, semaphore, incr_cmd, wfi_cmd);
+        add_sema_incr_cmd(c->g, c, semaphore, *incr_cmd, wfi_cmd);
 
         if (need_sync_fence) {
-                err = nvgpu_os_fence_sema_create(&os_fence, c,
-                        semaphore);
+                err = nvgpu_os_fence_sema_create(&os_fence, c, semaphore);
                 if (err != 0) {
-                        goto clean_up_sema;
+                        goto clean_up_cmdbuf;
                 }
         }
 
-        err = nvgpu_fence_from_semaphore(fence,
-                semaphore,
-                &c->semaphore_wq,
-                os_fence);
+        err = nvgpu_fence_from_semaphore(fence, semaphore, &c->semaphore_wq,
+                os_fence);
         if (err != 0) {
-                if (nvgpu_os_fence_is_initialized(&os_fence)) {
-                        os_fence.ops->drop_ref(&os_fence);
-                }
-                goto clean_up_sema;
+                goto clean_up_os_fence;
         }
 
         return 0;
 
+clean_up_os_fence:
+        if (nvgpu_os_fence_is_initialized(&os_fence)) {
+                os_fence.ops->drop_ref(&os_fence);
+        }
+clean_up_cmdbuf:
+        nvgpu_priv_cmdbuf_rollback(c, *incr_cmd);
 clean_up_sema:
         nvgpu_semaphore_put(semaphore);
         return err;
@@ -229,7 +226,7 @@ clean_up_sema:
 
 static int channel_sync_semaphore_incr(
         struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry,
+        struct priv_cmd_entry **entry,
         struct nvgpu_fence_type *fence,
         bool need_sync_fence,
         bool register_irq)
@@ -243,7 +240,7 @@ static int channel_sync_semaphore_incr(
 
 static int channel_sync_semaphore_incr_user(
         struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry,
+        struct priv_cmd_entry **entry,
         struct nvgpu_fence_type *fence,
         bool wfi,
         bool need_sync_fence,
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
index 87e8bb702..4a9f42c43 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
@@ -67,7 +67,7 @@ static void channel_sync_syncpt_gen_wait_cmd(struct nvgpu_channel *c,
 }
 
 static int channel_sync_syncpt_wait_raw(struct nvgpu_channel_sync_syncpt *s,
-        u32 id, u32 thresh, struct priv_cmd_entry *wait_cmd)
+        u32 id, u32 thresh, struct priv_cmd_entry **wait_cmd)
 {
         struct nvgpu_channel *c = s->c;
         int err = 0;
@@ -77,22 +77,21 @@ static int channel_sync_syncpt_wait_raw(struct nvgpu_channel_sync_syncpt *s,
                 return -EINVAL;
         }
 
-        err = nvgpu_channel_alloc_priv_cmdbuf(c,
+        err = nvgpu_priv_cmdbuf_alloc(c,
                 c->g->ops.sync.syncpt.get_wait_cmd_size(), wait_cmd);
         if (err != 0) {
-                nvgpu_err(c->g, "not enough priv cmd buffer space");
                 return err;
         }
 
         channel_sync_syncpt_gen_wait_cmd(c, id, thresh,
-                wait_cmd, wait_cmd_size);
+                *wait_cmd, wait_cmd_size);
 
         return 0;
 }
 
 static int channel_sync_syncpt_wait_fd(struct nvgpu_channel_sync *s, int fd,
-        struct priv_cmd_entry *wait_cmd, u32 max_wait_cmds)
+        struct priv_cmd_entry **wait_cmd, u32 max_wait_cmds)
 {
         struct nvgpu_os_fence os_fence = {0};
         struct nvgpu_os_fence_syncpt os_fence_syncpt = {0};
@@ -136,11 +135,9 @@ static int channel_sync_syncpt_wait_fd(struct nvgpu_channel_sync *s, int fd,
         }
 
         wait_cmd_size = c->g->ops.sync.syncpt.get_wait_cmd_size();
-        err = nvgpu_channel_alloc_priv_cmdbuf(c,
+        err = nvgpu_priv_cmdbuf_alloc(c,
                 wait_cmd_size * num_fences, wait_cmd);
         if (err != 0) {
-                nvgpu_err(c->g, "not enough priv cmd buffer space");
-                err = -EINVAL;
                 goto cleanup;
         }
 
@@ -148,7 +145,7 @@ static int channel_sync_syncpt_wait_fd(struct nvgpu_channel_sync *s, int fd,
                 nvgpu_os_fence_syncpt_extract_nth_syncpt(
                         &os_fence_syncpt, i, &syncpt_id, &syncpt_thresh);
                 channel_sync_syncpt_gen_wait_cmd(c, syncpt_id,
-                        syncpt_thresh, wait_cmd, wait_cmd_size);
+                        syncpt_thresh, *wait_cmd, wait_cmd_size);
         }
 
 cleanup:
@@ -169,7 +166,7 @@ static void channel_sync_syncpt_update(void *priv, int nr_completed)
 
 static int channel_sync_syncpt_incr_common(struct nvgpu_channel_sync *s,
         bool wfi_cmd, bool register_irq,
-        struct priv_cmd_entry *incr_cmd,
+        struct priv_cmd_entry **incr_cmd,
         struct nvgpu_fence_type *fence,
         bool need_sync_fence)
 {
@@ -180,7 +177,7 @@ static int channel_sync_syncpt_incr_common(struct nvgpu_channel_sync *s,
         struct nvgpu_channel *c = sp->c;
         struct nvgpu_os_fence os_fence = {0};
 
-        err = nvgpu_channel_alloc_priv_cmdbuf(c,
+        err = nvgpu_priv_cmdbuf_alloc(c,
                 c->g->ops.sync.syncpt.get_incr_cmd_size(wfi_cmd), incr_cmd);
         if (err != 0) {
@@ -189,7 +186,7 @@ static int channel_sync_syncpt_incr_common(struct nvgpu_channel_sync *s,
         nvgpu_log(c->g, gpu_dbg_info, "sp->id %d gpu va %llx",
                         sp->id, sp->syncpt_buf.gpu_va);
 
-        c->g->ops.sync.syncpt.add_incr_cmd(c->g, incr_cmd,
+        c->g->ops.sync.syncpt.add_incr_cmd(c->g, *incr_cmd,
                 sp->id, sp->syncpt_buf.gpu_va, wfi_cmd);
 
         thresh = nvgpu_nvhost_syncpt_incr_max_ext(sp->nvhost, sp->id,
@@ -244,12 +241,12 @@ static int channel_sync_syncpt_incr_common(struct nvgpu_channel_sync *s,
         return 0;
 
 clean_up_priv_cmd:
-        nvgpu_channel_update_priv_cmd_q_and_free_entry(c, incr_cmd);
+        nvgpu_priv_cmdbuf_rollback(c, *incr_cmd);
         return err;
 }
 
 static int channel_sync_syncpt_incr(struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry,
+        struct priv_cmd_entry **entry,
         struct nvgpu_fence_type *fence,
         bool need_sync_fence,
         bool register_irq)
@@ -263,7 +260,7 @@ static int channel_sync_syncpt_incr(struct nvgpu_channel_sync *s,
 }
 
 static int channel_sync_syncpt_incr_user(struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry,
+        struct priv_cmd_entry **entry,
         struct nvgpu_fence_type *fence,
         bool wfi,
         bool need_sync_fence,
@@ -278,7 +275,7 @@ static int channel_sync_syncpt_incr_user(struct nvgpu_channel_sync *s,
 }
 
 int nvgpu_channel_sync_wait_syncpt(struct nvgpu_channel_sync_syncpt *s,
-        u32 id, u32 thresh, struct priv_cmd_entry *entry)
+        u32 id, u32 thresh, struct priv_cmd_entry **entry)
 {
         return channel_sync_syncpt_wait_raw(s, id, thresh, entry);
 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel_sync.h b/drivers/gpu/nvgpu/include/nvgpu/channel_sync.h
index 2be1e780d..e19d36e39 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel_sync.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel_sync.h
@@ -43,7 +43,7 @@ struct gk20a;
  * Returns a gpu cmdbuf that performs the wait when executed
  */
 int nvgpu_channel_sync_wait_fence_fd(struct nvgpu_channel_sync *s, int fd,
-        struct priv_cmd_entry *entry, u32 max_wait_cmds);
+        struct priv_cmd_entry **entry, u32 max_wait_cmds);
 
 /*
  * Increment syncpoint/semaphore.
@@ -52,7 +52,7 @@ int nvgpu_channel_sync_wait_fence_fd(struct nvgpu_channel_sync *s, int fd,
  *  - a fence that can be passed to wait_cpu() and is_expired().
  */
 int nvgpu_channel_sync_incr(struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry, struct nvgpu_fence_type *fence,
+        struct priv_cmd_entry **entry, struct nvgpu_fence_type *fence,
         bool need_sync_fence, bool register_irq);
 
 /*
@@ -64,8 +64,8 @@ int nvgpu_channel_sync_incr(struct nvgpu_channel_sync *s,
  *  - a nvgpu_fence_type that signals when the incr has happened.
  */
 int nvgpu_channel_sync_incr_user(struct nvgpu_channel_sync *s,
-        struct priv_cmd_entry *entry, struct nvgpu_fence_type *fence, bool wfi,
-        bool need_sync_fence, bool register_irq);
+        struct priv_cmd_entry **entry, struct nvgpu_fence_type *fence,
+        bool wfi, bool need_sync_fence, bool register_irq);
 
 /*
  * Reset the channel syncpoint/semaphore. Syncpoint increments generally
  * wrap around the range of integer values. Current max value encompasses
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel_sync_syncpt.h b/drivers/gpu/nvgpu/include/nvgpu/channel_sync_syncpt.h
index 9d3bd904c..b24809072 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel_sync_syncpt.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel_sync_syncpt.h
@@ -53,7 +53,7 @@ u32 nvgpu_channel_sync_get_syncpt_id(struct nvgpu_channel_sync_syncpt *s);
  * Returns a gpu cmdbuf that performs the wait when executed.
  */
 int nvgpu_channel_sync_wait_syncpt(struct nvgpu_channel_sync_syncpt *s,
-        u32 id, u32 thresh, struct priv_cmd_entry *entry);
+        u32 id, u32 thresh, struct priv_cmd_entry **entry);
 #endif
 
 /**
@@ -97,7 +97,7 @@ static inline u64 nvgpu_channel_sync_get_syncpt_address(
 
 static inline int nvgpu_channel_sync_wait_syncpt(
         struct nvgpu_channel_sync_syncpt *s,
-        u32 id, u32 thresh, struct priv_cmd_entry *entry)
+        u32 id, u32 thresh, struct priv_cmd_entry **entry)
 {
         return -EINVAL;
 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
index f06954dac..9e26020ee 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
@@ -35,18 +35,18 @@ struct priv_cmd_entry {
         u32 off;        /* offset in mem, in u32 entries */
         u32 fill_off;   /* write offset from off, in u32 entries */
         u64 gva;
-        u32 get;        /* start of entry in queue */
         u32 size;       /* in words */
+        u32 alloc_size;
 };
 
-int nvgpu_alloc_priv_cmdbuf_queue(struct nvgpu_channel *ch, u32 num_in_flight);
-void nvgpu_free_priv_cmdbuf_queue(struct nvgpu_channel *ch);
+int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch, u32 num_in_flight);
+void nvgpu_priv_cmdbuf_queue_free(struct nvgpu_channel *ch);
 
-int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
+int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
+        struct priv_cmd_entry **e);
+void nvgpu_priv_cmdbuf_rollback(struct nvgpu_channel *ch,
         struct priv_cmd_entry *e);
-void nvgpu_channel_free_priv_cmd_entry(struct nvgpu_channel *c,
-        struct priv_cmd_entry *e);
-void nvgpu_channel_update_priv_cmd_q_and_free_entry(struct nvgpu_channel *ch,
+void nvgpu_priv_cmdbuf_free(struct nvgpu_channel *ch,
         struct priv_cmd_entry *e);
 
 void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
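A few standalone sketches of the queue arithmetic this patch introduces follow; they are illustrative models only, not part of the diff and not driver code. The reworked free_count relies on q->size being a power of two (guaranteed by the roundup_pow_of_two() sizing above) and on put/get always staying below q->size. A minimal model of that computation, with made-up values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Model of the pushbuf free-space computation in nvgpu_priv_cmdbuf_alloc_buf():
 * size is a power of two, put/get are word offsets kept below size, and one
 * word is always left unused so that put == get unambiguously means "empty".
 */
static uint32_t free_words(uint32_t size, uint32_t put, uint32_t get)
{
	assert(size != 0U && (size & (size - 1U)) == 0U);
	assert(put < size && get < size);

	/* same expression as the new free_count in priv_cmdbuf.c */
	return (size - put + get - 1U) & (size - 1U);
}

int main(void)
{
	printf("%u\n", free_words(1024U, 0U, 0U));   /* empty queue: 1023 free */
	printf("%u\n", free_words(1024U, 8U, 512U)); /* put wrapped past the end: 503 free */
	return 0;
}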
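struct priv_cmd_queue also gains a second ring (entries, entry_put, entry_get) that hands out priv_cmd_entry descriptors instead of kzalloc-ing one per submit. The sketch below is a simplified model of that ring with hypothetical names, showing the index moves used by nvgpu_priv_cmdbuf_alloc(), nvgpu_priv_cmdbuf_rollback() and nvgpu_priv_cmdbuf_free():

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* illustrative stand-in for the entry bookkeeping in struct priv_cmd_queue */
struct entry_ring {
	uint32_t len;	/* q->entries_len, 2 * num_in_flight in the patch */
	uint32_t put;	/* q->entry_put */
	uint32_t get;	/* q->entry_get */
};

/* hand out the slot at put; fails (like -EAGAIN) when only one slot is left */
bool ring_alloc(struct entry_ring *r, uint32_t *slot)
{
	uint32_t next_put = (r->put + 1U) % r->len;

	if (next_put == r->get) {
		return false;
	}
	*slot = r->put;
	r->put = next_put;
	return true;
}

/* undo the most recent ring_alloc(), cf. nvgpu_priv_cmdbuf_rollback() */
void ring_rollback(struct entry_ring *r)
{
	r->put = (r->put + r->len - 1U) % r->len;
}

/* retire the oldest slot, cf. the entry_get advance in nvgpu_priv_cmdbuf_free() */
void ring_free_oldest(struct entry_ring *r)
{
	assert(r->get != r->put);
	r->get = (r->get + 1U) % r->len;
}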
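For reference, a sketch of how the pushbuf ends up sized in nvgpu_priv_cmdbuf_queue_alloc(). The 8- and 10-word wait/incr figures are taken from the comment quoted in the patch; the real values come from g->ops.sync.*.get_*_cmd_size(), and the gpfifo/3 fallback presumably reflects each job needing at least a wait entry, a user entry and an incr entry in the gpfifo. Illustrative only:

#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096ULL

/* stand-in for roundup_pow_of_two(): smallest power of two >= x, for x > 0 */
static uint64_t pow2_roundup(uint64_t x)
{
	uint64_t p = 1ULL;

	while (p < x) {
		p <<= 1;
	}
	return p;
}

/* sketch of the sizing done before nvgpu_dma_alloc_map_sys() */
static uint64_t priv_cmdbuf_bytes(uint32_t num_in_flight, uint32_t gpfifo_entries)
{
	const uint32_t wait_size = 8U;   /* semaphore ACQ words, per the comment */
	const uint32_t incr_size = 10U;  /* semaphore INCR words, per the comment */
	uint64_t size;

	if (num_in_flight == 0U) {
		num_in_flight = gpfifo_entries / 3U;
	}
	size = (uint64_t)num_in_flight * (wait_size + incr_size) * sizeof(uint32_t);
	/* a power-of-two total keeps the put/get wrap a cheap mask */
	size = pow2_roundup(size);
	return (size + MODEL_PAGE_SIZE - 1ULL) & ~(MODEL_PAGE_SIZE - 1ULL);
}

int main(void)
{
	printf("%llu\n", (unsigned long long)priv_cmdbuf_bytes(0U, 1024U));  /* 32768 */
	printf("%llu\n", (unsigned long long)priv_cmdbuf_bytes(128U, 0U));   /* 16384 */
	return 0;
}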
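Finally, the new alloc_size field records how many words an allocation actually consumed, including any unusable tail that was skipped so the command buffer stays contiguous for the hardware; that full amount is what nvgpu_priv_cmdbuf_rollback() gives back. A simplified model of that put-side bookkeeping, with made-up names and no driver types:

#include <stdint.h>

struct alloc_result {
	uint32_t off;        /* like e->off: first word of the entry */
	uint32_t alloc_size; /* like e->alloc_size: charged words incl. skipped tail */
};

/*
 * Mirror of the put-side logic in nvgpu_priv_cmdbuf_alloc_buf(): qsize is a
 * power of two, want is the requested word count, put/get as in the queue.
 * Returns 0 on success, -1 when the caller should retry later (-EAGAIN).
 */
int alloc_words(uint32_t qsize, uint32_t *put, uint32_t get,
		uint32_t want, struct alloc_result *res)
{
	uint32_t size = want;
	uint32_t free_count;

	if (*put + size > qsize) {
		/* too little room at the end: charge the tail and wrap to 0 */
		size = want + (qsize - *put);
	}

	free_count = (qsize - *put + get - 1U) & (qsize - 1U);
	if (size > free_count) {
		return -1;
	}

	res->alloc_size = size;
	if (size != want) {
		res->off = 0U;
		*put = want;	/* as if the previous put had been at 0 */
	} else {
		res->off = *put;
		*put = (*put + want) & (qsize - 1U);
	}
	return 0;
}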