gpu: nvgpu: hide priv cmdbuf mem writes

Add an API to append data to a priv cmdbuf entry. Hold the write pointer offset internally in the entry instead of having the user keep track of where those words are written to. This helps in eventually hiding struct priv_cmd_entry from users and provides a more consistent interface in general.

The wait and incr commands are now slightly easier to read as well when they're just arrays of data.

A syncfd-backed prefence may be composed of several individual fences. Some of those (or even the only one, when the prefence is backed by a single fence) may have already expired, and currently the syncfd export design releases and nulls semaphores when expired (see gk20a_sync_pt_has_signaled()), so for those the wait cmdbuf is appended with zeros; nvgpu_priv_cmdbuf_append_zeros() exists for this purpose.

Jira NVGPU-4548

Change-Id: I1057f98c1b5b407460aa6e1dcba917da9c9aa9c9
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2325099
(cherry picked from commit 6a00a65a86d8249cfeb06a05682abb4771949f19)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2331336
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 10:34:43 +03:00 · 2020-04-03 11:04:02 +03:00
parent 0c9f589f3f
commit 39844fb27c
13 changed files with 217 additions and 200 deletions
--- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
+++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
@@ -170,6 +170,7 @@ int nvgpu_channel_alloc_priv_cmdbuf(struct nvgpu_channel *c, u32 orig_size,
 		return -EAGAIN;
 	}

+	e->fill_off = 0;
 	e->size = orig_size;
 	e->mem = &q->mem;

@@ -237,3 +238,21 @@ void nvgpu_channel_update_priv_cmd_q_and_free_entry(

 	nvgpu_channel_free_priv_cmd_entry(ch, e);
 }
+
+void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 *data, u32 entries)
+{ /* Write 'entries' u32 words from 'data' into e->mem at the entry's fill offset. */
+	nvgpu_assert(e->fill_off + entries <= e->size); /* must fit in the entry; NOTE(review): u32 sum could wrap */
+	nvgpu_mem_wr_n(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
+			data, entries * sizeof(u32)); /* word offset and count are scaled by sizeof(u32) */
+	e->fill_off += entries; /* the next append continues right after these words */
+}
+
+void nvgpu_priv_cmdbuf_append_zeros(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 entries)
+{ /* Fill the next 'entries' u32 words of e->mem with zeros, starting at the entry's fill offset. */
+	nvgpu_assert(e->fill_off + entries <= e->size); /* must fit in the entry; NOTE(review): u32 sum could wrap */
+	nvgpu_memset(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
+			0, entries * sizeof(u32)); /* word offset and count are scaled by sizeof(u32) */
+	e->fill_off += entries; /* the next append continues right after the zeroed words */
+}
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
@@ -57,8 +57,7 @@ nvgpu_channel_sync_semaphore_from_base(struct nvgpu_channel_sync *base)
 }

 static void add_sema_wait_cmd(struct gk20a *g, struct nvgpu_channel *c,
-			 struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
-			 u32 offset)
+			 struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd)
 {
 	int ch = c->chid;
 	u64 va;
@@ -66,12 +65,12 @@ static void add_sema_wait_cmd(struct gk20a *g, struct nvgpu_channel *c,
 	/* acquire just needs to read the mem. */
 	va = nvgpu_semaphore_gpu_ro_va(s);

-	g->ops.sync.sema.add_wait_cmd(g, cmd, offset, s, va);
+	g->ops.sync.sema.add_wait_cmd(g, cmd, s, va);
 	gpu_sema_verbose_dbg(g, "(A) c=%d ACQ_GE %-4u pool=%-3llu"
-			     "va=0x%llx cmd_mem=0x%llx b=0x%llx off=%u",
+			     "va=0x%llx cmd_mem=0x%llx b=0x%llx",
 			     ch, nvgpu_semaphore_get_value(s),
 			     nvgpu_semaphore_get_hw_pool_page_idx(s),
-			     va, cmd->gva, cmd->mem->gpu_va, offset);
+			     va, cmd->gva, cmd->mem->gpu_va);
 }

 static void add_sema_incr_cmd(struct gk20a *g, struct nvgpu_channel *c,
@@ -98,20 +97,17 @@ static void add_sema_incr_cmd(struct gk20a *g, struct nvgpu_channel *c,

 static void channel_sync_semaphore_gen_wait_cmd(struct nvgpu_channel *c,
 	struct nvgpu_semaphore *sema, struct priv_cmd_entry *wait_cmd,
-	u32 wait_cmd_size, u32 pos)
+	u32 wait_cmd_size)
 {
 	bool has_incremented;

 	if (sema == NULL) {
-		/* expired */
-		nvgpu_memset(c->g, wait_cmd->mem,
-			(wait_cmd->off + pos * wait_cmd_size) * (u32)sizeof(u32),
-			0, wait_cmd_size * (u32)sizeof(u32));
+		/* came from an expired sync fence */
+		nvgpu_priv_cmdbuf_append_zeros(c->g, wait_cmd, wait_cmd_size);
 	} else {
 		has_incremented = nvgpu_semaphore_can_wait(sema);
 		nvgpu_assert(has_incremented);
-		add_sema_wait_cmd(c->g, c, sema, wait_cmd,
-			pos * wait_cmd_size);
+		add_sema_wait_cmd(c->g, c, sema, wait_cmd);
 		nvgpu_semaphore_put(sema);
 	}
 }
@@ -163,7 +159,7 @@ static int channel_sync_semaphore_wait_fd(
 		nvgpu_os_fence_sema_extract_nth_semaphore(
 			&os_fence_sema, i, &semaphore);
 		channel_sync_semaphore_gen_wait_cmd(c, semaphore, entry,
-				wait_cmd_size, i);
+				wait_cmd_size);
 	}

 cleanup:
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
@@ -58,7 +58,7 @@ nvgpu_channel_sync_syncpt_from_base(struct nvgpu_channel_sync *base)

 static int channel_sync_syncpt_gen_wait_cmd(struct nvgpu_channel *c,
 	u32 id, u32 thresh, struct priv_cmd_entry *wait_cmd,
-	u32 wait_cmd_size, u32 pos, bool preallocated)
+	u32 wait_cmd_size, bool preallocated)
 {
 	int err = 0;

@@ -73,9 +73,8 @@ static int channel_sync_syncpt_gen_wait_cmd(struct nvgpu_channel *c,
 	}
 	nvgpu_log(c->g, gpu_dbg_info, "sp->id %d gpu va %llx",
 			id, c->vm->syncpt_ro_map_gpu_va);
-	c->g->ops.sync.syncpt.add_wait_cmd(c->g, wait_cmd,
-		pos * wait_cmd_size, id, thresh,
-		c->vm->syncpt_ro_map_gpu_va);
+	c->g->ops.sync.syncpt.add_wait_cmd(c->g, wait_cmd, id, thresh,
+			c->vm->syncpt_ro_map_gpu_va);

 	return 0;
 }
@@ -92,7 +91,7 @@ static int channel_sync_syncpt_wait_raw(struct nvgpu_channel_sync_syncpt *s,
 	}

 	err = channel_sync_syncpt_gen_wait_cmd(c, id, thresh,
-			wait_cmd, wait_cmd_size, 0, false);
+			wait_cmd, wait_cmd_size, false);

 	return err;
 }
@@ -154,7 +153,7 @@ static int channel_sync_syncpt_wait_fd(struct nvgpu_channel_sync *s, int fd,
 		nvgpu_os_fence_syncpt_extract_nth_syncpt(
 			&os_fence_syncpt, i, &syncpt_id, &syncpt_thresh);
 		err = channel_sync_syncpt_gen_wait_cmd(c, syncpt_id,
-			syncpt_thresh, wait_cmd, wait_cmd_size, i, true);
+			syncpt_thresh, wait_cmd, wait_cmd_size, true);
 	}

 cleanup:
@@ -384,5 +383,3 @@ nvgpu_channel_sync_syncpt_create(struct nvgpu_channel *c)

 	return &sp->base;
 }
-
-
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.c
@@ -21,11 +21,9 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
+
 #include <nvgpu/log.h>
-#include <nvgpu/nvgpu_mem.h>
 #include <nvgpu/semaphore.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
 #include <nvgpu/priv_cmdbuf.h>

 #include "sema_cmdbuf_gk20a.h"
@@ -40,66 +38,66 @@ u32 gk20a_sema_get_incr_cmd_size(void)
 	return 10U;
 }

-static u32 gk20a_sema_add_header(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
-		u64 sema_va)
+static void gk20a_sema_add_header(struct gk20a *g,
+		struct priv_cmd_entry *cmd, u64 sema_va)
 {
-	/* semaphore_a */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010004U);
-	/* offset_upper */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (u32)(sema_va >> 32) & 0xffU);
-	/* semaphore_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010005U);
-	/* offset */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (u32)sema_va & 0xffffffff);
+	u32 data[] = {
+		/* semaphore_a */
+		0x20010004U,
+		/* offset_upper */
+		(u32)(sema_va >> 32) & 0xffU,
+		/* semaphore_b */
+		0x20010005U,
+		/* offset */
+		(u32)sema_va & 0xffffffff,
+	};

-	return off;
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 void gk20a_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va)
 {
+	u32 data[] = {
+		/* semaphore_c */
+		0x20010006U,
+		/* payload */
+		nvgpu_semaphore_get_value(s),
+		/* semaphore_d */
+		0x20010007U,
+		/* operation: acq_geq, switch_en */
+		0x4U | BIT32(12),
+	};
+
 	nvgpu_log_fn(g, " ");

-	off = cmd->off + off;
-	off = gk20a_sema_add_header(g, cmd, off, sema_va);
-
-	/* semaphore_c */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006U);
-	/* payload */
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-		       nvgpu_semaphore_get_value(s));
-	/* semaphore_d */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007U);
-	/* operation: acq_geq, switch_en */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x4U | BIT32(12));
+	gk20a_sema_add_header(g, cmd, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 void gk20a_sema_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va,
 		bool wfi)
-
 {
-	u32 off = cmd->off;
+	u32 data[] = {
+		/* semaphore_c */
+		0x20010006U,
+		/* payload */
+		nvgpu_semaphore_get_value(s),
+		/* semaphore_d */
+		0x20010007U,
+		/* operation: release, wfi */
+		0x2UL | ((wfi ? 0x0UL : 0x1UL) << 20),
+		/* non_stall_int */
+		0x20010008U,
+		/* ignored */
+		0U,
+	};

 	nvgpu_log_fn(g, " ");

-	off = gk20a_sema_add_header(g, cmd, off, sema_va);
-
-	/* semaphore_c */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006U);
-	/* payload */
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-		       nvgpu_semaphore_get_value(s));
-	/* semaphore_d */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007U);
-	/* operation: release, wfi */
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			0x2UL | ((wfi ? 0x0UL : 0x1UL) << 20));
-	/* non_stall_int */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010008U);
-	/* ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0U);
+	gk20a_sema_add_header(g, cmd, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gk20a.h
@@ -31,7 +31,7 @@ struct nvgpu_semaphore;
 u32 gk20a_sema_get_wait_cmd_size(void);
 u32 gk20a_sema_get_incr_cmd_size(void);
 void gk20a_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va);
 void gk20a_sema_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.c
@@ -22,10 +22,8 @@
 * DEALINGS IN THE SOFTWARE.
 */

-#include <nvgpu/nvgpu_mem.h>
+#include <nvgpu/log.h>
 #include <nvgpu/semaphore.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
 #include <nvgpu/priv_cmdbuf.h>

 #include "sema_cmdbuf_gv11b.h"
@@ -40,41 +38,45 @@ u32 gv11b_sema_get_incr_cmd_size(void)
 	return 12U;
 }

-static u32 gv11b_sema_add_header(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+static void gv11b_sema_add_header(struct gk20a *g,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va)
 {
-	/* sema_addr_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010017);
-	nvgpu_mem_wr32(g, cmd->mem, off++, sema_va & 0xffffffffULL);
+	u32 data[] = {
+		/* sema_addr_lo */
+		0x20010017,
+		sema_va & 0xffffffffULL,

-	/* sema_addr_hi */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010018);
-	nvgpu_mem_wr32(g, cmd->mem, off++, (sema_va >> 32ULL) & 0xffULL);
+		/* sema_addr_hi */
+		0x20010018,
+		(sema_va >> 32ULL) & 0xffULL,

-	/* payload_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010019);
-	nvgpu_mem_wr32(g, cmd->mem, off++, nvgpu_semaphore_get_value(s));
+		/* payload_lo */
+		0x20010019,
+		nvgpu_semaphore_get_value(s),

-	/* payload_hi : ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001a);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
+		/* payload_hi : ignored */
+		0x2001001a,
+		0,
+	};

-	return off;
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 void gv11b_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va)
 {
+	u32 data[] = {
+		/* sema_execute : acq_strict_geq | switch_en | 32bit */
+		0x2001001b,
+		U32(0x2) | BIT32(12),
+	};
+
 	nvgpu_log_fn(g, " ");

-	off = cmd->off + off;
-	off = gv11b_sema_add_header(g, cmd, off, s, sema_va);
-
-	/* sema_execute : acq_strict_geq | switch_en | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off++, U32(0x2) | BIT32(12));
+	gv11b_sema_add_header(g, cmd, s, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 void gv11b_sema_add_incr_cmd(struct gk20a *g,
@@ -82,18 +84,18 @@ void gv11b_sema_add_incr_cmd(struct gk20a *g,
 		struct nvgpu_semaphore *s, u64 sema_va,
 		bool wfi)
 {
-	u32 off = cmd->off;
+	u32 data[] = {
+		/* sema_execute : release | wfi | 32bit */
+		0x2001001b,
+		U32(0x1) | ((wfi ? U32(0x1) : U32(0x0)) << 20U),
+
+		/* non_stall_int : payload is ignored */
+		0x20010008,
+		0,
+	};

 	nvgpu_log_fn(g, " ");

-	off = gv11b_sema_add_header(g, cmd, off, s, sema_va);
-
-	/* sema_execute : release | wfi | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-		U32(0x1) | ((wfi ? U32(0x1) : U32(0x0)) << 20U));
-
-	/* non_stall_int : payload is ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010008);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
+	gv11b_sema_add_header(g, cmd, s, sema_va);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }
--- a/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/sync/sema_cmdbuf_gv11b.h
@@ -31,7 +31,7 @@ struct nvgpu_semaphore;
 u32 gv11b_sema_get_wait_cmd_size(void);
 u32 gv11b_sema_get_incr_cmd_size(void);
 void gv11b_sema_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		struct nvgpu_semaphore *s, u64 sema_va);
 void gv11b_sema_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.c
@@ -22,29 +22,30 @@
 * DEALINGS IN THE SOFTWARE.
 */

-#include <nvgpu/nvgpu_mem.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
+#include <nvgpu/log.h>
 #include <nvgpu/priv_cmdbuf.h>

 #include "syncpt_cmdbuf_gk20a.h"

 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gk20a_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
+	u32 data[] = {
+		/* syncpoint_a */
+		0x2001001CU,
+		/* payload */
+		thresh,
+		/* syncpoint_b */
+		0x2001001DU,
+		/* syncpt_id, switch_en, wait */
+		(id << 8U) | 0x10U,
+	};
+
 	nvgpu_log_fn(g, " ");

-	off = cmd->off + off;
-	/* syncpoint_a */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001CU);
-	/* payload */
-	nvgpu_mem_wr32(g, cmd->mem, off++, thresh);
-	/* syncpoint_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001DU);
-	/* syncpt_id, switch_en, wait */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8U) | 0x10U);
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 u32 gk20a_syncpt_get_wait_cmd_size(void)
@@ -61,28 +62,35 @@ void gk20a_syncpt_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
 		u32 id, u64 gpu_va, bool wfi)
 {
-	u32 off = cmd->off;
+	u32 wfi_data[] = {
+		/* wfi */
+		0x2001001EU,
+		/* handle, ignored */
+		0x00000000U,
+	};
+
+	u32 incr_data[] = {
+		/* syncpoint_a */
+		0x2001001CU,
+		/* payload, ignored */
+		0U,
+		/* syncpoint_b */
+		0x2001001DU,
+		/* syncpt_id, incr */
+		(id << 8U) | 0x1U,
+		/* syncpoint_b */
+		0x2001001DU,
+		/* syncpt_id, incr */
+		(id << 8U) | 0x1U,
+	};

 	nvgpu_log_fn(g, " ");
-	if (wfi) {
-		/* wfi */
-		nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001EU);
-		/* handle, ignored */
-		nvgpu_mem_wr32(g, cmd->mem, off++, 0x00000000U);
-	}
-	/* syncpoint_a */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001CU);
-	/* payload, ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0U);
-	/* syncpoint_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001DU);
-	/* syncpt_id, incr */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8U) | 0x1U);
-	/* syncpoint_b */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001DU);
-	/* syncpt_id, incr */
-	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8U) | 0x1U);

+	if (wfi) {
+		nvgpu_priv_cmdbuf_append(g, cmd, wfi_data,
+				ARRAY_SIZE(wfi_data));
+	}
+	nvgpu_priv_cmdbuf_append(g, cmd, incr_data, ARRAY_SIZE(incr_data));
 }

 u32 gk20a_syncpt_get_incr_cmd_size(bool wfi_cmd)
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
@@ -32,7 +32,7 @@ struct nvgpu_mem;

 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gk20a_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base);
 u32 gk20a_syncpt_get_wait_cmd_size(void);
 u32 gk20a_syncpt_get_incr_per_release(void);
@@ -52,7 +52,7 @@ int gk20a_syncpt_alloc_buf(struct nvgpu_channel *c,

 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 static inline void gk20a_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
 }
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.c
@@ -21,53 +21,45 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
-#include <nvgpu/mm.h>
-#include <nvgpu/vm.h>
-#include <nvgpu/gmmu.h>
-#include <nvgpu/nvgpu_mem.h>
-#include <nvgpu/dma.h>
-#include <nvgpu/lock.h>
-#include <nvgpu/gk20a.h>
-#include <nvgpu/channel.h>
-#include <nvgpu/priv_cmdbuf.h>
+
+#include <nvgpu/log.h>
 #include <nvgpu/nvhost.h>
-#include <nvgpu/static_analysis.h>
+#include <nvgpu/priv_cmdbuf.h>

 #include "syncpt_cmdbuf_gv11b.h"

 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gv11b_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
 	u64 gpu_va = gpu_va_base +
 		nvgpu_nvhost_syncpt_unit_interface_get_byte_offset(id);
+	u32 data[] = {
+		/* sema_addr_lo */
+		0x20010017,
+		nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU),
+
+		/* sema_addr_hi */
+		0x20010018,
+		nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU),
+
+		/* payload_lo */
+		0x20010019,
+		thresh,
+
+		/* payload_hi : ignored */
+		0x2001001a,
+		0U,
+
+		/* sema_execute : acq_strict_geq | switch_en | 32bit */
+		0x2001001b,
+		0x2U | ((u32)1U << 12U),
+	};

 	nvgpu_log_fn(g, " ");

-	off = cmd->off + off;
-
-	/* sema_addr_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010017);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU));
-
-	/* sema_addr_hi */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010018);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU));
-
-	/* payload_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010019);
-	nvgpu_mem_wr32(g, cmd->mem, off++, thresh);
-
-	/* payload_hi : ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001a);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0U);
-
-	/* sema_execute : acq_strict_geq | switch_en | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off, 0x2U | ((u32)1U << 12U));
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 u32 gv11b_syncpt_get_wait_cmd_size(void)
@@ -84,32 +76,31 @@ void gv11b_syncpt_add_incr_cmd(struct gk20a *g,
 		struct priv_cmd_entry *cmd,
 		u32 id, u64 gpu_va, bool wfi)
 {
-	u32 off = cmd->off;
+	u32 data[] = {
+		/* sema_addr_lo */
+		0x20010017,
+		nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU),
+
+		/* sema_addr_hi */
+		0x20010018,
+		nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU),
+
+		/* payload_lo */
+		0x20010019,
+		0,
+
+		/* payload_hi : ignored */
+		0x2001001a,
+		0,
+
+		/* sema_execute : release | wfi | 32bit */
+		0x2001001b,
+		(0x1U | ((u32)(wfi ? 0x1U : 0x0U) << 20U)),
+	};

 	nvgpu_log_fn(g, " ");

-	/* sema_addr_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010017);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32(gpu_va & 0xffffffffU));
-
-	/* sema_addr_hi */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010018);
-	nvgpu_mem_wr32(g, cmd->mem, off++,
-			nvgpu_safe_cast_u64_to_u32((gpu_va >> 32U) & 0xffU));
-
-	/* payload_lo */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010019);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
-
-	/* payload_hi : ignored */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001a);
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0);
-
-	/* sema_execute : release | wfi | 32bit */
-	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001b);
-	nvgpu_mem_wr32(g, cmd->mem, off, (0x1U |
-					((u32)(wfi ? 0x1U : 0x0U) << 20U)));
+	nvgpu_priv_cmdbuf_append(g, cmd, data, ARRAY_SIZE(data));
 }

 u32 gv11b_syncpt_get_incr_cmd_size(bool wfi_cmd)
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gv11b.h
@@ -35,7 +35,7 @@ struct vm_gk20a;

 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 void gv11b_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base);
 u32 gv11b_syncpt_get_wait_cmd_size(void);
 u32 gv11b_syncpt_get_incr_per_release(void);
@@ -58,7 +58,7 @@ int gv11b_syncpt_get_sync_ro_map(struct vm_gk20a *vm,

 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 static inline void gv11b_syncpt_add_wait_cmd(struct gk20a *g,
-		struct priv_cmd_entry *cmd, u32 off,
+		struct priv_cmd_entry *cmd,
 		u32 id, u32 thresh, u64 gpu_va_base)
 {
 }
--- a/drivers/gpu/nvgpu/include/nvgpu/gops_sync.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops_sync.h
@@ -76,7 +76,7 @@ struct gops_sync {
 				struct nvgpu_mem *syncpt_buf);
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 		void (*add_wait_cmd)(struct gk20a *g,
-				struct priv_cmd_entry *cmd, u32 off,
+				struct priv_cmd_entry *cmd,
 				u32 id, u32 thresh, u64 gpu_va_base);
 		u32 (*get_wait_cmd_size)(void);
 		void (*add_incr_cmd)(struct gk20a *g,
@@ -97,7 +97,7 @@ struct gops_sync {
 		u32 (*get_wait_cmd_size)(void);
 		u32 (*get_incr_cmd_size)(void);
 		void (*add_wait_cmd)(struct gk20a *g,
-			struct priv_cmd_entry *cmd, u32 off,
+			struct priv_cmd_entry *cmd,
 			struct nvgpu_semaphore *s, u64 sema_va);
 		void (*add_incr_cmd)(struct gk20a *g,
 			struct priv_cmd_entry *cmd,
--- a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
@@ -33,6 +33,7 @@ struct priv_cmd_entry {
 	bool valid;
 	struct nvgpu_mem *mem;
 	u32 off;	/* offset in mem, in u32 entries */
+	u32 fill_off;	/* write offset from off, in u32 entries */
 	u64 gva;
 	u32 get;	/* start of entry in queue */
 	u32 size;	/* in words */
@@ -48,4 +49,9 @@ void nvgpu_channel_free_priv_cmd_entry(struct nvgpu_channel *c,
 void nvgpu_channel_update_priv_cmd_q_and_free_entry(struct nvgpu_channel *ch,
 		struct priv_cmd_entry *e);

+void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 *data, u32 entries);
+void nvgpu_priv_cmdbuf_append_zeros(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 entries);
+
 #endif