From 05df07945aceb527eb501f790091aca5d5d0e0c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Konsta=20H=C3=B6ltt=C3=A4?= <kholtta@nvidia.com>
Date: Wed, 15 Apr 2020 12:19:42 +0300
Subject: [PATCH] gpu: nvgpu: avoid channel dependency in priv cmdbuf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The priv cmdbuf queue needs only the vm_gk20a of the channel that owns
it. Pass the vm to the queue constructor and have the channel code store
the queue to itself instead of poking at the channel from the queue
code. Adjust the cmdbuf queue api to take the queue, not the channel.

Move the inflight job fallback calculation to the channel code. The size
of the channel gpfifo isn't needed in the queue; just the job count is.

[not part of the cherry-pick: a bunch of MISRA mitigations.]

Jira NVGPU-4548

Change-Id: I4277dc67bb50380cb157f3aa3c5d57b162a8f0ba
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2329659
(cherry picked from commit 83b2276f7bea563602eee20ce24b70ce70c8475a)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2332508
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/fifo/channel.c       | 28 +++++-
 drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c   | 95 ++++++++-----------
 drivers/gpu/nvgpu/common/fifo/submit.c        |  2 +-
 .../common/sync/channel_sync_semaphore.c      |  6 +-
 .../nvgpu/common/sync/channel_sync_syncpt.c   |  8 +-
 .../gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h  |  1 +
 drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h | 14 +--
 7 files changed, 77 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index f915f863e..b30e8535b 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -217,7 +217,10 @@ static void channel_kernelmode_deinit(struct nvgpu_channel *ch)
 #endif
 	(void) memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
 
-	nvgpu_priv_cmdbuf_queue_free(ch);
+	if (ch->priv_cmd_q != NULL) {
+		nvgpu_priv_cmdbuf_queue_free(ch->priv_cmd_q);
+		ch->priv_cmd_q = NULL;
+	}
 
 	/* free pre-allocated resources, if applicable */
 	if (nvgpu_channel_is_prealloc_enabled(ch)) {
@@ -304,6 +307,7 @@ static int channel_setup_kernelmode(struct nvgpu_channel *c,
 {
 	u32 gpfifo_size, gpfifo_entry_size;
 	u64 gpfifo_gpu_va;
+	u32 priv_cmd_jobs;
 
 	int err = 0;
 	struct gk20a *g = c->g;
@@ -375,7 +379,20 @@ static int channel_setup_kernelmode(struct nvgpu_channel *c,
 		}
 	}
 
-	err = nvgpu_priv_cmdbuf_queue_alloc(c, args->num_inflight_jobs);
+	/*
+	 * Allocate priv cmdbuf space for pre and post fences. If the inflight
+	 * job count isn't specified, we base it on the gpfifo count. We
+	 * multiply by a factor of 1/3 because at most a third of the GPFIFO
+	 * entries can be used for user-submitted jobs; another third goes to
+	 * wait entries, and the final third to incr entries. There will be one
+	 * pair of acq and incr commands for each job.
+	 */
+	priv_cmd_jobs = args->num_inflight_jobs;
+	if (priv_cmd_jobs == 0U) {
+		priv_cmd_jobs = c->gpfifo.entry_num / 3U;
+	}
+
+	err = nvgpu_priv_cmdbuf_queue_alloc(c->vm, priv_cmd_jobs, &c->priv_cmd_q);
 	if (err != 0) {
 		goto clean_up_prealloc;
 	}
@@ -388,7 +405,8 @@ static int channel_setup_kernelmode(struct nvgpu_channel *c,
 	return 0;
 
 clean_up_priv_cmd:
-	nvgpu_priv_cmdbuf_queue_free(c);
+	nvgpu_priv_cmdbuf_queue_free(c->priv_cmd_q);
+	c->priv_cmd_q = NULL;
 clean_up_prealloc:
 	if (nvgpu_channel_is_deterministic(c) &&
 			args->num_inflight_jobs != 0U) {
@@ -999,9 +1017,9 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
 		 * then incr_cmd i.e. order of allocation)
 		 */
 		if (job->wait_cmd != NULL) {
-			nvgpu_priv_cmdbuf_free(c, job->wait_cmd);
+			nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->wait_cmd);
 		}
-		nvgpu_priv_cmdbuf_free(c, job->incr_cmd);
+		nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->incr_cmd);
 
 		/*
 		 * ensure all pending writes complete before freeing up the job.
diff --git a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
index 76ff5392a..c1c2704f3 100644
--- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
+++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
@@ -27,7 +27,7 @@
 #include <nvgpu/dma.h>
 #include <nvgpu/nvgpu_mem.h>
 #include <nvgpu/kmem.h>
-#include <nvgpu/channel.h>
+#include <nvgpu/vm.h>
 #include <nvgpu/priv_cmdbuf.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/trace.h>
@@ -44,6 +44,7 @@ struct priv_cmd_entry {
 };
 
 struct priv_cmd_queue {
+	struct vm_gk20a *vm;
 	struct nvgpu_mem mem; /* pushbuf */
 	u32 size;	/* allocated length in words */
 	u32 put;	/* next entry will begin here */
@@ -58,15 +59,15 @@ struct priv_cmd_queue {
 
 /* allocate private cmd buffer queue.
    used for inserting commands before/after user submitted buffers. */
-int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
-	u32 num_in_flight)
+int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm,
+	u32 job_count, struct priv_cmd_queue **queue)
 {
-	struct gk20a *g = ch->g;
-	struct vm_gk20a *ch_vm = ch->vm;
+	struct gk20a *g = vm->mm->g;
 	struct priv_cmd_queue *q;
 	u64 size, tmp_size;
 	int err = 0;
 	u32 wait_size, incr_size;
+	u32 mem_per_job;
 
 	/*
 	 * sema size is at least as much as syncpt size, but semas may not be
@@ -96,28 +97,18 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
 	 * another 2 words. In reality these numbers vary by chip but we'll use
 	 * 8 and 10 as examples.
 	 *
-	 * We have two cases to consider: the first is we base the size of the
-	 * queue on the gpfifo count. Here we multiply by a factor of 1/3
-	 * because at most a third of the GPFIFO entries can be used for
-	 * user-submitted jobs; another third goes to wait entries, and the
-	 * final third to incr entries. There will be one pair of acq and incr
-	 * commands for each job.
-	 *
-	 *   gpfifo entry num * (1 / 3) * (8 + 10) * 4 bytes
-	 *
-	 * If instead num_in_flight is specified then we will use that to size
-	 * the queue instead of a third of the gpfifo entry count. The worst
-	 * case is still both sync commands (one ACQ and one INCR) per submit so
-	 * we have a queue size of:
+	 * Given the job count, cmdbuf space is allocated such that each job
+	 * can get one wait command and one increment command:
 	 *
 	 *   num_in_flight * (8 + 10) * 4 bytes
+	 *
+	 * These cmdbufs are inserted as gpfifo entries right before and after
+	 * the user submitted gpfifo entries per submit.
 	 */
-	if (num_in_flight == 0U) {
-		/* round down to ensure space for all priv cmds */
-		num_in_flight = ch->gpfifo.entry_num / 3U;
-	}
-
-	size = num_in_flight * (wait_size + incr_size) * sizeof(u32);
+	mem_per_job = nvgpu_safe_mult_u32(nvgpu_safe_add_u32(wait_size,
+				incr_size), (u32)sizeof(u32));
+	/* both 32 bit and mem_per_job is small */
+	size = nvgpu_safe_mult_u64((u64)job_count, (u64)mem_per_job);
 
 	tmp_size = PAGE_ALIGN(roundup_pow_of_two(size));
 	if (tmp_size > U32_MAX) {
@@ -130,12 +121,14 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
 		return -ENOMEM;
 	}
 
-	if (num_in_flight > U32_MAX / 2U) {
+	q->vm = vm;
+
+	if (job_count > U32_MAX / 2U) {
 		err = -ERANGE;
 		goto err_free_queue;
 	}
 
-	q->entries_len = 2U * num_in_flight;
+	q->entries_len = 2U * job_count;
 	q->entries = nvgpu_vzalloc(g,
 			nvgpu_safe_mult_u64((u64)q->entries_len,
 				sizeof(*q->entries)));
@@ -144,7 +137,7 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
 		goto err_free_queue;
 	}
 
-	err = nvgpu_dma_alloc_map_sys(ch_vm, size, &q->mem);
+	err = nvgpu_dma_alloc_map_sys(vm, size, &q->mem);
 	if (err != 0) {
 		nvgpu_err(g, "%s: memory allocation failed", __func__);
 		goto err_free_entries;
@@ -154,8 +147,7 @@ int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
 	nvgpu_assert(tmp_size <= U32_MAX);
 	q->size = (u32)tmp_size;
 
-	ch->priv_cmd_q = q;
-
+	*queue = q;
 	return 0;
 err_free_entries:
 	nvgpu_vfree(g, q->entries);
@@ -164,32 +156,25 @@ err_free_queue:
 	return err;
 }
 
-void nvgpu_priv_cmdbuf_queue_free(struct nvgpu_channel *ch)
+void nvgpu_priv_cmdbuf_queue_free(struct priv_cmd_queue *q)
 {
-	struct vm_gk20a *ch_vm = ch->vm;
-	struct priv_cmd_queue *q = ch->priv_cmd_q;
-	struct gk20a *g = ch->g;
+	struct vm_gk20a *vm = q->vm;
+	struct gk20a *g = vm->mm->g;
 
-	if (q == NULL) {
-		return;
-	}
-
-	nvgpu_dma_unmap_free(ch_vm, &q->mem);
+	nvgpu_dma_unmap_free(vm, &q->mem);
 	nvgpu_vfree(g, q->entries);
 	nvgpu_kfree(g, q);
-
-	ch->priv_cmd_q = NULL;
 }
 
 /* allocate a cmd buffer with given size. size is number of u32 entries */
-static int nvgpu_priv_cmdbuf_alloc_buf(struct nvgpu_channel *c, u32 orig_size,
+static int nvgpu_priv_cmdbuf_alloc_buf(struct priv_cmd_queue *q, u32 orig_size,
 			     struct priv_cmd_entry *e)
 {
-	struct priv_cmd_queue *q = c->priv_cmd_q;
+	struct gk20a *g = q->vm->mm->g;
 	u32 size = orig_size;
 	u32 free_count;
 
-	nvgpu_log_fn(c->g, "size %d", orig_size);
+	nvgpu_log_fn(g, "size %d", orig_size);
 
 	/*
 	 * If free space in the end is less than requested, increase the size
@@ -206,8 +191,8 @@ static int nvgpu_priv_cmdbuf_alloc_buf(struct nvgpu_channel *c, u32 orig_size,
 		size = orig_size + (q->size - q->put);
 	}
 
-	nvgpu_log_info(c->g, "ch %d: priv cmd queue get:put %d:%d",
-			c->chid, q->get, q->put);
+	nvgpu_log_info(g, "priv cmd queue get:put %d:%d",
+			q->get, q->put);
 
 	nvgpu_assert(q->put < q->size);
 	nvgpu_assert(q->get < q->size);
@@ -250,15 +235,14 @@ static int nvgpu_priv_cmdbuf_alloc_buf(struct nvgpu_channel *c, u32 orig_size,
 	nvgpu_smp_wmb();
 
 	e->valid = true;
-	nvgpu_log_fn(c->g, "done");
+	nvgpu_log_fn(g, "done");
 
 	return 0;
 }
 
-int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
+int nvgpu_priv_cmdbuf_alloc(struct priv_cmd_queue *q, u32 size,
 			     struct priv_cmd_entry **e)
 {
-	struct priv_cmd_queue *q = c->priv_cmd_q;
 	u32 next_put = nvgpu_safe_add_u32(q->entry_put, 1U) % q->entries_len;
 	struct priv_cmd_entry *entry;
 	int err;
@@ -268,7 +252,7 @@ int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
 	}
 	entry = &q->entries[q->entry_put];
 
-	err = nvgpu_priv_cmdbuf_alloc_buf(c, size, entry);
+	err = nvgpu_priv_cmdbuf_alloc_buf(q, size, entry);
 	if (err != 0) {
 		return err;
 	}
@@ -279,11 +263,9 @@ int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
 	return 0;
 }
 
-void nvgpu_priv_cmdbuf_rollback(struct nvgpu_channel *ch,
+void nvgpu_priv_cmdbuf_rollback(struct priv_cmd_queue *q,
 		struct priv_cmd_entry *e)
 {
-	struct priv_cmd_queue *q = ch->priv_cmd_q;
-
 	nvgpu_assert(q->put < q->size);
 	nvgpu_assert(q->size > 0U);
 	nvgpu_assert(e->alloc_size <= q->size);
@@ -297,18 +279,15 @@ void nvgpu_priv_cmdbuf_rollback(struct nvgpu_channel *ch,
 		% q->entries_len;
 }
 
-void nvgpu_priv_cmdbuf_free(struct nvgpu_channel *ch,
-		struct priv_cmd_entry *e)
+void nvgpu_priv_cmdbuf_free(struct priv_cmd_queue *q, struct priv_cmd_entry *e)
 {
-	struct priv_cmd_queue *q = ch->priv_cmd_q;
-	struct gk20a *g = ch->g;
+	struct gk20a *g = q->vm->mm->g;
 
 	if (e->valid) {
 		/* read the entry's valid flag before reading its contents */
 		nvgpu_smp_rmb();
 		if ((q->get != e->off) && e->off != 0U) {
-			nvgpu_err(g, "requests out-of-order, ch=%d",
-				  ch->chid);
+			nvgpu_err(g, "priv cmdbuf requests out-of-order");
 		}
 		nvgpu_assert(q->size > 0U);
 		q->get = nvgpu_safe_add_u32(e->off, e->size) & (q->size - 1U);
diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
index 70dfb7366..9122ee3a2 100644
--- a/drivers/gpu/nvgpu/common/fifo/submit.c
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -159,7 +159,7 @@ clean_up_post_fence:
 	job->post_fence = NULL;
 clean_up_wait_cmd:
 	if (job->wait_cmd != NULL) {
-		nvgpu_priv_cmdbuf_rollback(c, job->wait_cmd);
+		nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, job->wait_cmd);
 	}
 	job->wait_cmd = NULL;
 clean_up_unlock:
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
index aea38adb1..3aa724199 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_semaphore.c
@@ -148,7 +148,7 @@ static int channel_sync_semaphore_wait_fd(
 	}
 
 	wait_cmd_size = c->g->ops.sync.sema.get_wait_cmd_size();
-	err = nvgpu_priv_cmdbuf_alloc(c,
+	err = nvgpu_priv_cmdbuf_alloc(c->priv_cmd_q,
 		wait_cmd_size * num_fences, entry);
 	if (err != 0) {
 		goto cleanup;
@@ -188,7 +188,7 @@ static int channel_sync_semaphore_incr_common(
 	}
 
 	incr_cmd_size = c->g->ops.sync.sema.get_incr_cmd_size();
-	err = nvgpu_priv_cmdbuf_alloc(c, incr_cmd_size, incr_cmd);
+	err = nvgpu_priv_cmdbuf_alloc(c->priv_cmd_q, incr_cmd_size, incr_cmd);
 	if (err != 0) {
 		goto clean_up_sema;
 	}
@@ -218,7 +218,7 @@ clean_up_os_fence:
 		os_fence.ops->drop_ref(&os_fence);
 	}
 clean_up_cmdbuf:
-	nvgpu_priv_cmdbuf_rollback(c, *incr_cmd);
+	nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, *incr_cmd);
 clean_up_sema:
 	nvgpu_semaphore_put(semaphore);
 	return err;
diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
index 4a9f42c43..29a72a130 100644
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
@@ -77,7 +77,7 @@ static int channel_sync_syncpt_wait_raw(struct nvgpu_channel_sync_syncpt *s,
 		return -EINVAL;
 	}
 
-	err = nvgpu_priv_cmdbuf_alloc(c,
+	err = nvgpu_priv_cmdbuf_alloc(c->priv_cmd_q,
 		c->g->ops.sync.syncpt.get_wait_cmd_size(),
 		wait_cmd);
 	if (err != 0) {
@@ -135,7 +135,7 @@ static int channel_sync_syncpt_wait_fd(struct nvgpu_channel_sync *s, int fd,
 	}
 
 	wait_cmd_size = c->g->ops.sync.syncpt.get_wait_cmd_size();
-	err = nvgpu_priv_cmdbuf_alloc(c,
+	err = nvgpu_priv_cmdbuf_alloc(c->priv_cmd_q,
 		wait_cmd_size * num_fences, wait_cmd);
 	if (err != 0) {
 		goto cleanup;
@@ -177,7 +177,7 @@ static int channel_sync_syncpt_incr_common(struct nvgpu_channel_sync *s,
 	struct nvgpu_channel *c = sp->c;
 	struct nvgpu_os_fence os_fence = {0};
 
-	err = nvgpu_priv_cmdbuf_alloc(c,
+	err = nvgpu_priv_cmdbuf_alloc(c->priv_cmd_q,
 			c->g->ops.sync.syncpt.get_incr_cmd_size(wfi_cmd),
 			incr_cmd);
 	if (err != 0) {
@@ -241,7 +241,7 @@ static int channel_sync_syncpt_incr_common(struct nvgpu_channel_sync *s,
 	return 0;
 
 clean_up_priv_cmd:
-	nvgpu_priv_cmdbuf_rollback(c, *incr_cmd);
+	nvgpu_priv_cmdbuf_rollback(c->priv_cmd_q, *incr_cmd);
 	return err;
 }
 
diff --git a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
index b0bfe4fd4..b1260b748 100644
--- a/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/sync/syncpt_cmdbuf_gk20a.h
@@ -27,6 +27,7 @@
 struct gk20a;
 struct priv_cmd_entry;
 struct nvgpu_mem;
+struct nvgpu_channel;
 
 #ifdef CONFIG_TEGRA_GK20A_NVHOST
 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
index 33fe6e4de..faa271681 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/priv_cmdbuf.h
@@ -26,17 +26,19 @@
 #include <nvgpu/types.h>
 
 struct gk20a;
-struct nvgpu_channel;
+struct vm_gk20a;
 struct priv_cmd_entry;
+struct priv_cmd_queue;
 
-int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch, u32 num_in_flight);
-void nvgpu_priv_cmdbuf_queue_free(struct nvgpu_channel *ch);
+int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm,
+		u32 job_count, struct priv_cmd_queue **queue);
+void nvgpu_priv_cmdbuf_queue_free(struct priv_cmd_queue *q);
 
-int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
+int nvgpu_priv_cmdbuf_alloc(struct priv_cmd_queue *q, u32 size,
 		struct priv_cmd_entry **e);
-void nvgpu_priv_cmdbuf_rollback(struct nvgpu_channel *ch,
+void nvgpu_priv_cmdbuf_rollback(struct priv_cmd_queue *q,
 		struct priv_cmd_entry *e);
-void nvgpu_priv_cmdbuf_free(struct nvgpu_channel *ch,
+void nvgpu_priv_cmdbuf_free(struct priv_cmd_queue *q,
 		struct priv_cmd_entry *e);
 
 void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,