From 116c3850899f0dbd6bbd8255c78d1b9aa45e9827 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Konsta=20H=C3=B6ltt=C3=A4?= <kholtta@nvidia.com>
Date: Fri, 3 Apr 2020 08:58:06 +0300
Subject: [PATCH] gpu: nvgpu: alloc priv cmdbuf based on chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The semaphore wait and incr command sizes are not 8 and 10 words from
gv11b onwards. Use the chip-specific HAL API to retrieve their sizes and
compute the priv cmdbuf queue size based on them instead of the values
that are only valid up to gp10b.

We likely haven't run out of space so far, for several reasons:

1) userspace may not need both pre and post fences for every single
   submitted job
2) submitted jobs may consist of more than one gpfifo entry, reducing
   the relative required sync capacity
3) the queue size is rounded up to the next power of two which leaves
   some margin for error in this calculation
4) the gpfifo-size-based num-in-flight guess has been twice as big as it
   needs to be (fixed in a follow-up patch)

Jira NVGPU-4548

Change-Id: I172b5c0d8bb7d2231cc45cbed5e1e8b60ce7c707
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2323148
(cherry picked from commit 03fb194d105242c3eb20a9857a22743f5f64b9b9)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2328412
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c | 34 +++++++++++++++------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
index 3ced083f3..a8356acc0 100644
--- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
+++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
@@ -29,6 +29,7 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/channel.h>
 #include <nvgpu/priv_cmdbuf.h>
+#include <nvgpu/gk20a.h>
 
 struct priv_cmd_queue {
 	struct nvgpu_mem mem;
@@ -47,23 +48,38 @@ int nvgpu_alloc_priv_cmdbuf_queue(struct nvgpu_channel *ch,
 	struct priv_cmd_queue *q;
 	u64 size, tmp_size;
 	int err = 0;
+	u32 wait_size, incr_size;
 	bool gpfifo_based = false;
 
+	/*
+	 * sema size is at least as much as syncpt size, but semas may not be
+	 * enabled in the build. If neither semas nor syncpts are enabled, priv
+	 * cmdbufs and as such kernel mode submits with job tracking won't be
+	 * supported.
+	 */
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
+	wait_size = g->ops.sync.sema.get_wait_cmd_size();
+	incr_size = g->ops.sync.sema.get_incr_cmd_size();
+#else
+	wait_size = g->ops.sync.syncpt.get_wait_cmd_size();
+	incr_size = g->ops.sync.syncpt.get_incr_cmd_size(true);
+#endif
 	if (num_in_flight == 0U) {
 		num_in_flight = ch->gpfifo.entry_num;
 		gpfifo_based = true;
 	}
 
 	/*
-	 * Compute the amount of priv_cmdbuf space we need. In general the worst
-	 * case is the kernel inserts both a semaphore pre-fence and post-fence.
-	 * Any sync-pt fences will take less memory so we can ignore them for
-	 * now.
+	 * Compute the amount of priv_cmdbuf space we need. In general the
+	 * worst case is the kernel inserts both a semaphore pre-fence and
+	 * post-fence. Any sync-pt fences will take less memory so we can
+	 * ignore them unless they're the only supported type.
 	 *
 	 * A semaphore ACQ (fence-wait) is 8 words: semaphore_a, semaphore_b,
-	 * semaphore_c, and semaphore_d. A semaphore INCR (fence-get) will be 10
-	 * words: all the same as an ACQ plus a non-stalling intr which is
-	 * another 2 words.
+	 * semaphore_c, and semaphore_d. A semaphore INCR (fence-get) will be
+	 * 10 words: all the same as an ACQ plus a non-stalling intr which is
+	 * another 2 words. In reality these numbers vary by chip but we'll use
+	 * 8 and 10 as examples.
 	 *
 	 * We have two cases to consider: the first is we base the size of the
 	 * priv_cmd_buf on the gpfifo count. Here we multiply by a factor of
@@ -73,12 +89,12 @@ int nvgpu_alloc_priv_cmdbuf_queue(struct nvgpu_channel *ch,
 	 *   nr_gpfifos * (2 / 3) * (8 + 10) * 4 bytes
 	 *
 	 * If instead num_in_flight is specified then we will use that to size
-	 * the priv_cmd_buf. The worst case is two sync commands (one ACQ and
+	 * the priv_cmd_buf. The worst case is both sync commands (one ACQ and
 	 * one INCR) per submit so we have a priv_cmd_buf size of:
 	 *
 	 *   num_in_flight * (8 + 10) * 4 bytes
 	 */
-	size = num_in_flight * 18UL * sizeof(u32);
+	size = num_in_flight * (wait_size + incr_size) * sizeof(u32);
 	if (gpfifo_based) {
 		size = 2U * size / 3U;
 	}