Open source GPL/LGPL release

2025-12-24 10:34:43 +03:00 · 2022-02-15 09:37:14 -08:00
commit d3b5c6a57c
2490 changed files with 683529 additions and 0 deletions
--- a/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
+++ b/drivers/gpu/nvgpu/common/fifo/priv_cmdbuf.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/log.h>
+#include <nvgpu/utils.h>
+#include <nvgpu/log2.h>
+#include <nvgpu/barrier.h>
+#include <nvgpu/dma.h>
+#include <nvgpu/nvgpu_mem.h>
+#include <nvgpu/kmem.h>
+#include <nvgpu/vm.h>
+#include <nvgpu/priv_cmdbuf.h>
+#include <nvgpu/gk20a.h>
+#include <nvgpu/trace.h>
+#include <nvgpu/circ_buf.h>
+
+struct priv_cmd_entry {
+	struct nvgpu_mem *mem;
+	u32 off;	/* offset in mem, in u32 entries */
+	u32 fill_off;	/* write offset from off, in u32 entries */
+	u32 size;	/* in words */
+	u32 alloc_size;
+};
+
+struct priv_cmd_queue {
+	struct vm_gk20a *vm;
+	struct nvgpu_mem mem; /* pushbuf */
+	u32 size;	/* allocated length in words */
+	u32 put;	/* next entry will begin here */
+	u32 get;	/* next entry to free begins here */
+
+	/* an entry is a fragment of the pushbuf memory */
+	struct priv_cmd_entry *entries;
+	u32 entries_len; /* allocated length */
+	u32 entry_put;
+	u32 entry_get;
+};
+
+/* allocate private cmd buffer queue.
+   used for inserting commands before/after user submitted buffers. */
+int nvgpu_priv_cmdbuf_queue_alloc(struct vm_gk20a *vm,
+	u32 job_count, struct priv_cmd_queue **queue)
+{
+	struct gk20a *g = vm->mm->g;
+	struct priv_cmd_queue *q;
+	u64 size, tmp_size;
+	int err = 0;
+	u32 wait_size, incr_size;
+	u32 mem_per_job;
+
+	/*
+	 * sema size is at least as much as syncpt size, but semas may not be
+	 * enabled in the build. If neither semas nor syncpts are enabled, priv
+	 * cmdbufs and as such kernel mode submits with job tracking won't be
+	 * supported.
+	 */
+#ifdef CONFIG_NVGPU_SW_SEMAPHORE
+	wait_size = g->ops.sync.sema.get_wait_cmd_size();
+	incr_size = g->ops.sync.sema.get_incr_cmd_size();
+#else
+	wait_size = g->ops.sync.syncpt.get_wait_cmd_size();
+	incr_size = g->ops.sync.syncpt.get_incr_cmd_size(true);
+#endif
+
+	/*
+	 * Compute the amount of priv_cmdbuf space we need. In general the
+	 * worst case is the kernel inserts both a semaphore pre-fence and
+	 * post-fence. Any sync-pt fences will take less memory so we can
+	 * ignore them unless they're the only supported type. Jobs can also
+	 * have more than one pre-fence but that's abnormal and we'll -EAGAIN
+	 * if such jobs would fill the queue.
+	 *
+	 * A semaphore ACQ (fence-wait) is 8 words: semaphore_a, semaphore_b,
+	 * semaphore_c, and semaphore_d. A semaphore INCR (fence-get) will be
+	 * 10 words: all the same as an ACQ plus a non-stalling intr which is
+	 * another 2 words. In reality these numbers vary by chip but we'll use
+	 * 8 and 10 as examples.
+	 *
+	 * Given the job count, cmdbuf space is allocated such that each job
+	 * can get one wait command and one increment command:
+	 *
+	 *   job_count * (8 + 10) * 4 bytes
+	 *
+	 * These cmdbufs are inserted as gpfifo entries right before and after
+	 * the user submitted gpfifo entries per submit.
+	 *
+	 * One extra slot is added to the queue length so that the requested
+	 * job count can actually be allocated. This ring buffer implementation
+	 * is full when the number of consumed entries is one less than the
+	 * allocation size:
+	 *
+	 * alloc bytes = job_count * (wait + incr + 1) * slot in bytes
+	 */
+	mem_per_job = nvgpu_safe_mult_u32(
+			nvgpu_safe_add_u32(
+				nvgpu_safe_add_u32(wait_size, incr_size),
+				1U),
+			(u32)sizeof(u32));
+	/* both 32 bit and mem_per_job is small */
+	size = nvgpu_safe_mult_u64((u64)job_count, (u64)mem_per_job);
+
+	tmp_size = PAGE_ALIGN(roundup_pow_of_two(size));
+	if (tmp_size > U32_MAX) {
+		return -ERANGE;
+	}
+	size = (u32)tmp_size;
+
+	q = nvgpu_kzalloc(g, sizeof(*q));
+	if (q == NULL) {
+		return -ENOMEM;
+	}
+
+	q->vm = vm;
+
+	if (job_count > U32_MAX / 2U - 1U) {
+		err = -ERANGE;
+		goto err_free_queue;
+	}
+
+	/* One extra to account for the full condition: 2 * job_count + 1 */
+	q->entries_len = nvgpu_safe_mult_u32(2U,
+			nvgpu_safe_add_u32(job_count, 1U));
+	q->entries = nvgpu_vzalloc(g,
+			nvgpu_safe_mult_u64((u64)q->entries_len,
+				sizeof(*q->entries)));
+	if (q->entries == NULL) {
+		err = -ENOMEM;
+		goto err_free_queue;
+	}
+
+	err = nvgpu_dma_alloc_map_sys(vm, size, &q->mem);
+	if (err != 0) {
+		nvgpu_err(g, "%s: memory allocation failed", __func__);
+		goto err_free_entries;
+	}
+
+	tmp_size = q->mem.size / sizeof(u32);
+	nvgpu_assert(tmp_size <= U32_MAX);
+	q->size = (u32)tmp_size;
+
+	*queue = q;
+	return 0;
+err_free_entries:
+	nvgpu_vfree(g, q->entries);
+err_free_queue:
+	nvgpu_kfree(g, q);
+	return err;
+}
+
+void nvgpu_priv_cmdbuf_queue_free(struct priv_cmd_queue *q)
+{
+	struct vm_gk20a *vm = q->vm;
+	struct gk20a *g = vm->mm->g;
+
+	nvgpu_dma_unmap_free(vm, &q->mem);
+	nvgpu_vfree(g, q->entries);
+	nvgpu_kfree(g, q);
+}
+
+/* allocate a cmd buffer with given size. size is number of u32 entries */
+static int nvgpu_priv_cmdbuf_alloc_buf(struct priv_cmd_queue *q, u32 orig_size,
+			     struct priv_cmd_entry *e)
+{
+	struct gk20a *g = q->vm->mm->g;
+	u32 size = orig_size;
+	u32 free_count;
+
+	nvgpu_log_fn(g, "size %d", orig_size);
+
+	/*
+	 * If free space in the end is less than requested, increase the size
+	 * to make the real allocated space start from beginning. The hardware
+	 * expects each cmdbuf to be contiguous in the dma space.
+	 *
+	 * This too small extra space in the end may happen because the
+	 * requested wait and incr command buffers do not necessarily align
+	 * with the whole buffer capacity. They don't always align because the
+	 * buffer size is rounded to the next power of two and because not all
+	 * jobs necessarily use exactly one wait command.
+	 */
+	if (nvgpu_safe_add_u32(q->put, size) > q->size) {
+		size = orig_size + (q->size - q->put);
+	}
+
+	nvgpu_log_info(g, "priv cmd queue get:put %d:%d",
+			q->get, q->put);
+
+	nvgpu_assert(q->put < q->size);
+	nvgpu_assert(q->get < q->size);
+	nvgpu_assert(q->size > 0U);
+	free_count = (q->size - q->put + q->get - 1U) & (q->size - 1U);
+
+	if (size > free_count) {
+		return -EAGAIN;
+	}
+
+	e->fill_off = 0;
+	e->size = orig_size;
+	e->alloc_size = size;
+	e->mem = &q->mem;
+
+	/*
+	 * if we have increased size to skip free space in the end, set put
+	 * to beginning of cmd buffer + size, as if the prev put was at
+	 * position 0.
+	 */
+	if (size != orig_size) {
+		e->off = 0;
+		q->put = orig_size;
+	} else {
+		e->off = q->put;
+		q->put = (q->put + orig_size) & (q->size - 1U);
+	}
+
+	/* we already handled q->put + size > q->size so BUG_ON this */
+	BUG_ON(q->put > q->size);
+
+	nvgpu_log_fn(g, "done");
+
+	return 0;
+}
+
+int nvgpu_priv_cmdbuf_alloc(struct priv_cmd_queue *q, u32 size,
+			     struct priv_cmd_entry **e)
+{
+	u32 next_put = nvgpu_safe_add_u32(q->entry_put, 1U) % q->entries_len;
+	struct priv_cmd_entry *entry;
+	int err;
+
+	if (next_put == q->entry_get) {
+		return -EAGAIN;
+	}
+	entry = &q->entries[q->entry_put];
+
+	err = nvgpu_priv_cmdbuf_alloc_buf(q, size, entry);
+	if (err != 0) {
+		return err;
+	}
+
+	q->entry_put = next_put;
+	*e = entry;
+
+	return 0;
+}
+
+void nvgpu_priv_cmdbuf_rollback(struct priv_cmd_queue *q,
+		struct priv_cmd_entry *e)
+{
+	nvgpu_assert(q->put < q->size);
+	nvgpu_assert(q->size > 0U);
+	nvgpu_assert(e->alloc_size <= q->size);
+	q->put = (q->put + q->size - e->alloc_size) & (q->size - 1U);
+
+	(void)memset(e, 0, sizeof(*e));
+
+	nvgpu_assert(q->entry_put < q->entries_len);
+	nvgpu_assert(q->entries_len > 0U);
+	q->entry_put = (q->entry_put + q->entries_len - 1U)
+		% q->entries_len;
+}
+
+void nvgpu_priv_cmdbuf_free(struct priv_cmd_queue *q, struct priv_cmd_entry *e)
+{
+	struct gk20a *g = q->vm->mm->g;
+
+	if ((q->get != e->off) && e->off != 0U) {
+		nvgpu_err(g, "priv cmdbuf requests out-of-order");
+	}
+	nvgpu_assert(q->size > 0U);
+	q->get = nvgpu_safe_add_u32(e->off, e->size) & (q->size - 1U);
+	q->entry_get = nvgpu_safe_add_u32(q->entry_get, 1U) % q->entries_len;
+
+	(void)memset(e, 0, sizeof(*e));
+}
+
+void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 *data, u32 entries)
+{
+	nvgpu_assert(e->fill_off + entries <= e->size);
+	nvgpu_mem_wr_n(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
+			data, entries * sizeof(u32));
+	e->fill_off += entries;
+}
+
+void nvgpu_priv_cmdbuf_append_zeros(struct gk20a *g, struct priv_cmd_entry *e,
+		u32 entries)
+{
+	nvgpu_assert(e->fill_off + entries <= e->size);
+	nvgpu_memset(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
+			0, entries * sizeof(u32));
+	e->fill_off += entries;
+}
+
+void nvgpu_priv_cmdbuf_finish(struct gk20a *g, struct priv_cmd_entry *e,
+		u64 *gva, u32 *size)
+{
+	/*
+	 * The size is written to the pushbuf entry, so make sure this buffer
+	 * is complete at this point. The responsibility of the channel sync is
+	 * to be consistent in allocation and usage, and the matching size and
+	 * add gops (e.g., get_wait_cmd_size, add_wait_cmd) help there.
+	 */
+	nvgpu_assert(e->fill_off == e->size);
+
+#ifdef CONFIG_NVGPU_TRACE
+	if (e->mem->aperture == APERTURE_SYSMEM) {
+		trace_gk20a_push_cmdbuf(g->name, 0, e->size, 0,
+				(u32 *)e->mem->cpu_va + e->off);
+	}
+#endif
+	*gva = nvgpu_safe_add_u64(e->mem->gpu_va,
+			nvgpu_safe_mult_u64((u64)e->off, sizeof(u32)));
+	*size = e->size;
+}