From 39844fb27cd4c5d3ad1e2972de7bf30a49620280 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Konsta=20H=C3=B6ltt=C3=A4?= <kholtta@nvidia.com>
Date: Fri, 3 Apr 2020 11:04:02 +0300
Subject: [PATCH] gpu: nvgpu: hide priv cmdbuf mem writes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an API to append data to a priv cmdbuf entry. Hold the write offset
internally in the entry (the new fill_off field) instead of having the
caller keep track of where each word is written.

This helps in eventually hiding struct priv_cmd_entry from users and
provides a more consistent interface in general. The wait and incr
commands are now slightly easier to read as well when they're just
arrays of data.

A syncfd-backed pre-fence may be composed of several individual fences.
Some of those (or even the only one, if the fence is backed by just one)
may already have expired. The current syncfd export design releases and
nulls semaphores when they expire (see gk20a_sync_pt_has_signaled()), so
for those the corresponding part of the wait cmdbuf is filled with
zeros; nvgpu_priv_cmdbuf_append_zeros() is added for this purpose.

Jira NVGPU-4548

Change-Id: I1057f98c1b5b407460aa6e1dcba917da9c9aa9c9
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2325099
(cherry picked from commit 6a00a65a86d8249cfeb06a05682abb4771949f19)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2331336
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c   |  19 ++++
 .../common/sync/channel_sync_semaphore.c      |  22 ++--
 .../nvgpu/common/sync/channel_sync_syncpt.c   |  13 +--
 .../gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c    |  90 ++++++++-------
 .../gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h    |   2 +-
 .../gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c    |  74 ++++++------
 .../gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h    |   2 +-
 .../gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c  |  72 ++++++------
 .../gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h  |   4 +-
 .../gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c  | 105 ++++++++----------
 .../gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h  |   4 +-
 drivers/gpu/nvgpu/include/nvgpu/gops_sync.h   |   4 +-
 drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h |   6 +
 13 files changed, 217 insertions(+), 200 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
index 017d2601d..2868ba370 100644
--- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
+++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
@@ -170,6 +170,7 @@ int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
 		return -EAGAIN;
 	}
 
+	e->fill_off = 0;
 	e->size = orig_size;
 	e->mem = &q->mem;
 
@@ -237,3 +238,21 @@ void nvgpu_channel_update_priv_cmd_q_and_free_entry(
 
 	nvgpu_channel_free_priv_cmd_entry(ch, e);
 }
+
+void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 *data, u32 entries)
+{
+	nvgpu_assert(e->fill_off + entries <= e->size);
+	nvgpu_mem_wr_n(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
+			data, entries * sizeof(u32));
+	e->fill_off += entries;
+}
+
+void nvgpu_priv_cmdbuf_append_zeros(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 entries)
+{
+	nvgpu_assert(e->fill_off + entries <= e->size);
+	nvgpu_memset(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
+			0, entries * sizeof(u32));
+	e->fill_off += entries;
+}
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
index 02ea03230..4d3b670ec 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
@@ -57,8 +57,7 @@ nvgpu_channel_sync_semaphore_from_base(struct nvgpu_channel_sync *base)
 }
 
 static void add_sema_wait_cmd(struct gk20a *g, struct nvgpu_channel *c,
-			 struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
-			 u32 offset)
+			 struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd)
 {
 	int ch = c->chid;
 	u64 va;
@@ -66,12 +65,12 @@ static void add_sema_wait_cmd(struct gk20a *g, struct nvgpu_channel *c,
 	/* acquire just needs to read the mem. */
 	va = nvgpu_semaphore_gpu_ro_va(s);
 
-	g->ops.sync.sema.add_wait_cmd(g, cmd, offset, s, va);
+	g->ops.sync.sema.add_wait_cmd(g, cmd, s, va);
 	gpu_sema_verbose_dbg(g, "(A) c=%d ACQ_GE %-4u pool=%-3llu"
-			     "va=0x%llx cmd_mem=0x%llx b=0x%llx off=%u",
+			     "va=0x%llx cmd_mem=0x%llx b=0x%llx",
 			     ch, nvgpu_semaphore_get_value(s),
 			     nvgpu_semaphore_get_hw_pool_page_idx(s),
-			     va, cmd->gva, cmd->mem->gpu_va, offset);
+			     va, cmd->gva, cmd->mem->gpu_va);
 }
 
 static void add_sema_incr_cmd(struct gk20a *g, struct nvgpu_channel *c,
@@ -98,20 +97,17 @@ static void add_sema_incr_cmd(struct gk20a *g, struct nvgpu_channel *c,
 
 static void channel_sync_semaphore_gen_wait_cmd(struct nvgpu_channel *c,
 	struct nvgpu_semaphore *sema, struct priv_cmd_entry *wait_cmd,
-	u32 wait_cmd_size, u32 pos)
+	u32 wait_cmd_size)
 {
 	bool has_incremented;
 
 	if (sema == NULL) {
-		/* expired */
-		nvgpu_memset(c->g, wait_cmd->mem,
-			(wait_cmd->off + pos * wait_cmd_size) * (u32)sizeof(u32),
-			0, wait_cmd_size * (u32)sizeof(u32));
+		/* came from an expired sync fence */
+		nvgpu_priv_cmdbuf_append_zeros(c->g, wait_cmd, wait_cmd_size);
 	} else {
 		has_incremented = nvgpu_semaphore_can_wait(sema);
 		nvgpu_assert(has_incremented);
-		add_sema_wait_cmd(c->g, c, sema, wait_cmd,
-			pos * wait_cmd_size);
+		add_sema_wait_cmd(c->g, c, sema, wait_cmd);
 		nvgpu_semaphore_put(sema);
 	}
 }
@@ -163,7 +159,7 @@ static int channel_sync_semaphore_wait_fd(
 		nvgpu_os_fence_sema_extract_nth_semaphore(
 			&os_fence_sema, i, &semaphore);
 		channel_sync_semaphore_gen_wait_cmd(c, semaphore, entry,
-				wait_cmd_size, i);
+				wait_cmd_size);
 	}
 
 cleanup:
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
index f228a10ed..e91e94187 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
@@ -58,7 +58,7 @@ nvgpu_channel_sync_syncpt_from_base(struct nvgpu_channel_sync *base)
 
 static int channel_sync_syncpt_gen_wait_cmd(struct nvgpu_channel *c,
 	u32 id, u32 thresh, struct priv_cmd_entry *wait_cmd,
-	u32 wait_cmd_size, u32 pos, bool preallocated)
+	u32 wait_cmd_size, bool preallocated)
 {
 	int err = 0;
 
@@ -73,9 +73,8 @@ static int channel_sync_syncpt_gen_wait_cmd(struct nvgpu_channel *c,
 	}
 	nvgpu_log(c->g, gpu_dbg_info, "sp->id %d gpu va %llx",
 			id, c->vm->syncpt_ro_map_gpu_va);
-	c->g->ops.sync.syncpt.add_wait_cmd(c->g, wait_cmd,
-		pos * wait_cmd_size, id, thresh,
-		c->vm->syncpt_ro_map_gpu_va);
+	c->g->ops.sync.syncpt.add_wait_cmd(c->g, wait_cmd, id, thresh,
+			c->vm->syncpt_ro_map_gpu_va);
 
 	return 0;
 }
@@ -92,7 +91,7 @@ static int channel_sync_syncpt_wait_raw(struct nvgpu_channel_sync_syncpt *s,
 	}
 
 	err = channel_sync_syncpt_gen_wait_cmd(c, id, thresh,
-			wait_cmd, wait_cmd_size, 0, false);
+			wait_cmd, wait_cmd_size, false);
 
 	return err;
 }
@@ -154,7 +153,7 @@ static int channel_sync_syncpt_wait_fd(struct nvgpu_channel_sync *s, int fd,
 		nvgpu_os_fence_syncpt_extract_nth_syncpt(
 			&os_fence_syncpt, i, &syncpt_id, &syncpt_thresh);
 		err = channel_sync_syncpt_gen_wait_cmd(c, syncpt_id,
-			syncpt_thresh, wait_cmd, wait_cmd_size, i, true);
+			syncpt_thresh, wait_cmd, wait_cmd_size, true);
 	}
 
 cleanup:
@@ -384,5 +383,3 @@ nvgpu_channel_sync_syncpt_create(struct nvgpu_channel *c)
 
 	return &sp->base;
 }
-
-
diff --git a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c
index f5b3db27f..4997ae32c 100644
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c
@@ -21,11 +21,9 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
+
 #include <nvgpu/log.h>
-#include <nvgpu/nvgpu_mem.h>
 #include <nvgpu/semaphore.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
 #include <nvgpu/priv_cmdbuf.h>
 
 #include "sema_cmdbuf_gk20a.h"
@@ -40,66 +38,66 @@ u32 gk20a_sema_get_incr_cmd_size(void)
 	return 10U;
 }
 
-static u32 gk20a_sema_add_header(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
-		u64 sema_va)
+static void gk20a_sema_add_header(struct gk20a *g,
+		struct priv_cmd_entry *cmd, u64 sema_va)
 {
-	/* semaphore_a */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010004U);
-	/* offset_upper */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (u32)(sema_va >> 32) & 0xffU);
-	/* semaphore_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010005U);
-	/* offset */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (u32)sema_va & 0xffffffff);
+	u32 data[] = {
+		/* semaphore_a */
+		0x20010004U,
+		/* offset_upper */
+		(u32)(sema_va >> 32) & 0xffU,
+		/* semaphore_b */
+		0x20010005U,
+		/* offset */
+		(u32)sema_va & 0xffffffff,
+	};
 
-	return off;
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 void gk20a_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va)
 {
+	u32 data[] = {
+		/* semaphore_c */
+		0x20010006U,
+		/* payload */
+		nvgpu_semaphore_get_value(s),
+		/* semaphore_d */
+		0x20010007U,
+		/* operation: acq_geq, switch_en */
+		0x4U | BIT32(12),
+	};
+
 	nvgpu_log_fn(g, " ");
 
-	off = cmd->off + off;
-	off = gk20a_sema_add_header(g, cmd, off, sema_va);
-
-	/* semaphore_c */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006U);
-	/* payload */
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-		       nvgpu_semaphore_get_value(s));
-	/* semaphore_d */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007U);
-	/* operation: acq_geq, switch_en */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x4U | BIT32(12));
+	gk20a_sema_add_header(g, cmd, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 void gk20a_sema_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va,
 		bool wfi)
-
 {
-	u32 off = cmd->off;
+	u32 data[] = {
+		/* semaphore_c */
+		0x20010006U,
+		/* payload */
+		nvgpu_semaphore_get_value(s),
+		/* semaphore_d */
+		0x20010007U,
+		/* operation: release, wfi */
+		0x2UL | ((wfi ? 0x0UL : 0x1UL) << 20),
+		/* non_stall_int */
+		0x20010008U,
+		/* ignored */
+		0U,
+	};
 
 	nvgpu_log_fn(g, " ");
 
-	off = gk20a_sema_add_header(g, cmd, off, sema_va);
-
-	/* semaphore_c */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006U);
-	/* payload */
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-		       nvgpu_semaphore_get_value(s));
-	/* semaphore_d */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007U);
-	/* operation: release, wfi */
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			0x2UL | ((wfi ? 0x0UL : 0x1UL) << 20));
-	/* non_stall_int */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010008U);
-	/* ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0U);
+	gk20a_sema_add_header(g, cmd, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
diff --git a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h
index 1e80d7821..82cfc19ab 100644
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h
@@ -31,7 +31,7 @@ struct nvgpu_semaphore;
 u32 gk20a_sema_get_wait_cmd_size(void);
 u32 gk20a_sema_get_incr_cmd_size(void);
 void gk20a_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va);
 void gk20a_sema_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
diff --git a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c
index 01c4dcbb2..d66dca521 100644
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c
@@ -22,10 +22,8 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#include <nvgpu/nvgpu_mem.h>
+#include <nvgpu/log.h>
 #include <nvgpu/semaphore.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
 #include <nvgpu/priv_cmdbuf.h>
 
 #include "sema_cmdbuf_gv11b.h"
@@ -40,41 +38,45 @@ u32 gv11b_sema_get_incr_cmd_size(void)
 	return 12U;
 }
 
-static u32 gv11b_sema_add_header(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+static void gv11b_sema_add_header(struct gk20a *g,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va)
 {
-	/* sema_addr_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010017);
-	nvgpu_mem_wr32(g, cmd->mem, off++, sema_va & 0xffffffffULL);
+	u32 data[] = {
+		/* sema_addr_lo */
+		0x20010017,
+		sema_va & 0xffffffffULL,
 
-	/* sema_addr_hi */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010018);
-	nvgpu_mem_wr32(g, cmd->mem, off++, (sema_va >> 32ULL) & 0xffULL);
+		/* sema_addr_hi */
+		0x20010018,
+		(sema_va >> 32ULL) & 0xffULL,
 
-	/* payload_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010019);
-	nvgpu_mem_wr32(g, cmd->mem, off++, nvgpu_semaphore_get_value(s));
+		/* payload_lo */
+		0x20010019,
+		nvgpu_semaphore_get_value(s),
 
-	/* payload_hi : ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001a);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
+		/* payload_hi : ignored */
+		0x2001001a,
+		0,
+	};
 
-	return off;
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 void gv11b_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va)
 {
+	u32 data[] = {
+		/* sema_execute : acq_strict_geq | switch_en | 32bit */
+		0x2001001b,
+		U32(0x2) | BIT32(12),
+	};
+
 	nvgpu_log_fn(g, " ");
 
-	off = cmd->off + off;
-	off = gv11b_sema_add_header(g, cmd, off, s, sema_va);
-
-	/* sema_execute : acq_strict_geq | switch_en | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off++, U32(0x2) | BIT32(12));
+	gv11b_sema_add_header(g, cmd, s, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 void gv11b_sema_add_incr_cmd(struct gk20a *g,
@@ -82,18 +84,18 @@ void gv11b_sema_add_incr_cmd(struct gk20a *g,
 		struct nvgpu_semaphore *s, u64 sema_va,
 		bool wfi)
 {
-	u32 off = cmd->off;
+	u32 data[] = {
+		/* sema_execute : release | wfi | 32bit */
+		0x2001001b,
+		U32(0x1) | ((wfi ? U32(0x1) : U32(0x0)) << 20U),
+
+		/* non_stall_int : payload is ignored */
+		0x20010008,
+		0,
+	};
 
 	nvgpu_log_fn(g, " ");
 
-	off = gv11b_sema_add_header(g, cmd, off, s, sema_va);
-
-	/* sema_execute : release | wfi | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-		U32(0x1) | ((wfi ? U32(0x1) : U32(0x0)) << 20U));
-
-	/* non_stall_int : payload is ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010008);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
+	gv11b_sema_add_header(g, cmd, s, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
diff --git a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h
index d7a1ee56a..7bd8e685b 100644
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h
@@ -31,7 +31,7 @@ struct nvgpu_semaphore;
 u32 gv11b_sema_get_wait_cmd_size(void);
 u32 gv11b_sema_get_incr_cmd_size(void);
 void gv11b_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va);
 void gv11b_sema_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
diff --git a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c
index 07783510a..2ca44cad3 100644
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c
@@ -22,29 +22,30 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#include <nvgpu/nvgpu_mem.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
+#include <nvgpu/log.h>
 #include <nvgpu/priv_cmdbuf.h>
 
 #include "syncpt_cmdbuf_gk20a.h"
 
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gk20a_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
+	u32 data[] = {
+		/* syncpoint_a */
+		0x2001001CU,
+		/* payload */
+		thresh,
+		/* syncpoint_b */
+		0x2001001DU,
+		/* syncpt_id, switch_en, wait */
+		(id << 8U) | 0x10U,
+	};
+
 	nvgpu_log_fn(g, " ");
 
-	off = cmd->off + off;
-	/* syncpoint_a */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001CU);
-	/* payload */
-	nvgpu_mem_wr32(g, cmd->mem, off++, thresh);
-	/* syncpoint_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001DU);
-	/* syncpt_id, switch_en, wait */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8U) | 0x10U);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 u32 gk20a_syncpt_get_wait_cmd_size(void)
@@ -61,28 +62,35 @@ void gk20a_syncpt_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
 		u32 id, u64 gpu_va, bool wfi)
 {
-	u32 off = cmd->off;
+	u32 wfi_data[] = {
+		/* wfi */
+		0x2001001EU,
+		/* handle, ignored */
+		0x00000000U,
+	};
+
+	u32 incr_data[] = {
+		/* syncpoint_a */
+		0x2001001CU,
+		/* payload, ignored */
+		0U,
+		/* syncpoint_b */
+		0x2001001DU,
+		/* syncpt_id, incr */
+		(id << 8U) | 0x1U,
+		/* syncpoint_b */
+		0x2001001DU,
+		/* syncpt_id, incr */
+		(id << 8U) | 0x1U,
+	};
 
 	nvgpu_log_fn(g, " ");
-	if (wfi) {
-		/* wfi */
-		nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001EU);
-		/* handle, ignored */
-		nvgpu_mem_wr32(g, cmd->mem, off++, 0x00000000U);
-	}
-	/* syncpoint_a */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001CU);
-	/* payload, ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0U);
-	/* syncpoint_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001DU);
-	/* syncpt_id, incr */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8U) | 0x1U);
-	/* syncpoint_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001DU);
-	/* syncpt_id, incr */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8U) | 0x1U);
 
+	if (wfi) {
+		nvgpu_priv_cmdbuf_append(g, cmd, wfi_data,
+				ARRAY_SIZE(wfi_data));
+	}
+	nvgpu_priv_cmdbuf_append(g, cmd, incr_data, ARRAY_SIZE(incr_data));
 }
 
 u32 gk20a_syncpt_get_incr_cmd_size(bool wfi_cmd)
diff --git a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
index b4c46ca45..f2bdb756f 100644
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
@@ -32,7 +32,7 @@ struct nvgpu_mem;
 
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gk20a_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base);
 u32 gk20a_syncpt_get_wait_cmd_size(void);
 u32 gk20a_syncpt_get_incr_per_release(void);
@@ -52,7 +52,7 @@ int gk20a_syncpt_alloc_buf(struct nvgpu_channel *c,
 
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 static inline void gk20a_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
 }
diff --git a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c
index 3f9104533..a01e5981f 100644
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c
@@ -21,53 +21,45 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-#include <nvgpu/mm.h>
-#include <nvgpu/vm.h>
-#include <nvgpu/gmmu.h>
-#include <nvgpu/nvgpu_mem.h>
-#include <nvgpu/dma.h>
-#include <nvgpu/lock.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
-#include <nvgpu/priv_cmdbuf.h>
+
+#include <nvgpu/log.h>
 #include <nvgpu/nvhost.h>
-#include <nvgpu/static_analysis.h>
+#include <nvgpu/priv_cmdbuf.h>
 
 #include "syncpt_cmdbuf_gv11b.h"
 
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gv11b_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
 	u64 gpu_va = gpu_va_base +
 		nvgpu_nvhost_syncpt_unit_interface_get_byte_offset(id);
+	u32 data[] = {
+		/* sema_addr_lo */
+		0x20010017,
+		nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU),
+
+		/* sema_addr_hi */
+		0x20010018,
+		nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU),
+
+		/* payload_lo */
+		0x20010019,
+		thresh,
+
+		/* payload_hi : ignored */
+		0x2001001a,
+		0U,
+
+		/* sema_execute : acq_strict_geq | switch_en | 32bit */
+		0x2001001b,
+		0x2U | ((u32)1U << 12U),
+	};
 
 	nvgpu_log_fn(g, " ");
 
-	off = cmd->off + off;
-
-	/* sema_addr_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010017);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU));
-
-	/* sema_addr_hi */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010018);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU));
-
-	/* payload_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010019);
-	nvgpu_mem_wr32(g, cmd->mem, off++, thresh);
-
-	/* payload_hi : ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001a);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0U);
-
-	/* sema_execute : acq_strict_geq | switch_en | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off, 0x2U | ((u32)1U << 12U));
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 u32 gv11b_syncpt_get_wait_cmd_size(void)
@@ -84,32 +76,31 @@ void gv11b_syncpt_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
 		u32 id, u64 gpu_va, bool wfi)
 {
-	u32 off = cmd->off;
+	u32 data[] = {
+		/* sema_addr_lo */
+		0x20010017,
+		nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU),
+
+		/* sema_addr_hi */
+		0x20010018,
+		nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU),
+
+		/* payload_lo */
+		0x20010019,
+		0,
+
+		/* payload_hi : ignored */
+		0x2001001a,
+		0,
+
+		/* sema_execute : release | wfi | 32bit */
+		0x2001001b,
+		(0x1U | ((u32)(wfi ? 0x1U : 0x0U) << 20U)),
+	};
 
 	nvgpu_log_fn(g, " ");
 
-	/* sema_addr_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010017);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU));
-
-	/* sema_addr_hi */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010018);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU));
-
-	/* payload_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010019);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
-
-	/* payload_hi : ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001a);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
-
-	/* sema_execute : release | wfi | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off, (0x1U |
-					((u32)(wfi ? 0x1U : 0x0U) << 20U)));
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
 
 u32 gv11b_syncpt_get_incr_cmd_size(bool wfi_cmd)
diff --git a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h
index dc71f9cfd..ad1ce1497 100644
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h
@@ -35,7 +35,7 @@ struct vm_gk20a;
 
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gv11b_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base);
 u32 gv11b_syncpt_get_wait_cmd_size(void);
 u32 gv11b_syncpt_get_incr_per_release(void);
@@ -58,7 +58,7 @@ int gv11b_syncpt_get_sync_ro_map(struct vm_gk20a *vm,
 
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 static inline void gv11b_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops_sync.h b/drivers/gpu/nvgpu/include/nvgpu/gops_sync.h
index 8331bec38..350ad9430 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gops_sync.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops_sync.h
@@ -76,7 +76,7 @@ struct gops_sync {
 				struct nvgpu_mem *syncpt_buf);
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 		void (*add_wait_cmd)(struct gk20a *g,
-				struct priv_cmd_entry *cmd, u32 off,
+				struct priv_cmd_entry *cmd,
 				u32 id, u32 thresh, u64 gpu_va_base);
 		u32 (*get_wait_cmd_size)(void);
 		void (*add_incr_cmd)(struct gk20a *g,
@@ -97,7 +97,7 @@ struct gops_sync {
 		u32 (*get_wait_cmd_size)(void);
 		u32 (*get_incr_cmd_size)(void);
 		void (*add_wait_cmd)(struct gk20a *g,
-			struct priv_cmd_entry *cmd, u32 off,
+			struct priv_cmd_entry *cmd,
 			struct nvgpu_semaphore *s, u64 sema_va);
 		void (*add_incr_cmd)(struct gk20a *g,
 			struct priv_cmd_entry *cmd,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
index 161dea16c..4d9b06d4e 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
@@ -33,6 +33,7 @@ struct priv_cmd_entry {
 	bool valid;
 	struct nvgpu_mem *mem;
 	u32 off;	/* offset in mem, in u32 entries */
+	u32 fill_off;	/* write offset from off, in u32 entries */
 	u64 gva;
 	u32 get;	/* start of entry in queue */
 	u32 size;	/* in words */
@@ -48,4 +49,9 @@ void nvgpu_channel_free_priv_cmd_entry(struct nvgpu_channel *c,
 void nvgpu_channel_update_priv_cmd_q_and_free_entry(struct nvgpu_channel *ch,
 		struct priv_cmd_entry *e);
 
+void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 *data, u32 entries);
+void nvgpu_priv_cmdbuf_append_zeros(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 entries);
+
 #endif