Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git (synced 2025-12-22 17:36:20 +03:00)
Move preallocation of priv cmdbuf metadata structs to the priv cmdbuf level and do it always, not only on deterministic channels. This makes job tracking simpler and loosens the dependencies from jobs to cmdbuf internals. The underlying DMA memory for the cmdbuf data has always been preallocated.

Rename the priv cmdbuf functions to have a consistent prefix.

Refactor the channel sync wait and incr ops to free any priv cmdbufs they allocate. They have been depending on the caller to free their resources even on error conditions, requiring the caller to know how they work.

The error paths that could occur after a priv cmdbuf has been allocated have likely been wrong for a long time. The cmdbuf queue normally allows allocating only from one end and freeing only from the other end, as is natural with the hardware job queue. However, in error conditions the most recently allocated entries need to be put back. Improve the interface for this.

[not part of the cherry-pick:] Delete the error prints about not enough priv cmd buffer space. That is not an error: when obeying the user-provided job sizes more strictly, momentarily running out of job tracking resources is possible when the job cleanup thread does not catch up quickly enough. In such a case the number of in-flight jobs on the hardware could be less than the maximum, but the in-flight job count that nvgpu sees via the consumed resources could reach the maximum. Also remove the incorrect translation of err to -EINVAL in one call to nvgpu_priv_cmdbuf_alloc(); the -EAGAIN from the failed allocation is important.

[not part of the cherry-pick: a bunch of MISRA mitigations.]

Jira NVGPU-4548
Change-Id: I09d02bd44d50a5451500d09605f906d74009a8a4
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2329657
(cherry picked from commit 25412412f31436688c6b45684886f7552075da83)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2332506
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
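For context, the alloc/rollback pairing described above looks roughly like this on the caller side. This is a minimal sketch only: the channel pointer ch and the wait_size/incr_size values stand in for the real channel sync code and are illustrative; only nvgpu_priv_cmdbuf_alloc() and nvgpu_priv_cmdbuf_rollback() are the functions defined in this file.

	struct priv_cmd_entry *wait_cmd = NULL;
	struct priv_cmd_entry *incr_cmd = NULL;
	int err;

	/* each allocation reserves a pushbuf fragment plus a metadata entry */
	err = nvgpu_priv_cmdbuf_alloc(ch, wait_size, &wait_cmd);
	if (err != 0) {
		return err; /* keep -EAGAIN as-is; do not translate to -EINVAL */
	}

	err = nvgpu_priv_cmdbuf_alloc(ch, incr_size, &incr_cmd);
	if (err != 0) {
		/* put back the most recent allocation before bailing out */
		nvgpu_priv_cmdbuf_rollback(ch, wait_cmd);
		return err;
	}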
350 lines
10 KiB
C
/*
 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/log.h>
#include <nvgpu/utils.h>
#include <nvgpu/log2.h>
#include <nvgpu/barrier.h>
#include <nvgpu/dma.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/kmem.h>
#include <nvgpu/channel.h>
#include <nvgpu/priv_cmdbuf.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/trace.h>
#include <nvgpu/circ_buf.h>

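/*
 * The queue tracks two rings that advance together: the pushbuf words in
 * mem (put/get, counted in u32 units) and the preallocated entry metadata
 * in entries (entry_put/entry_get), one metadata slot per allocation.
 */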
struct priv_cmd_queue {
	struct nvgpu_mem mem; /* pushbuf */
	u32 size;	/* allocated length in words */
	u32 put;	/* next entry will begin here */
	u32 get;	/* next entry to free begins here */

	/* an entry is a fragment of the pushbuf memory */
	struct priv_cmd_entry *entries;
	u32 entries_len; /* allocated length */
	u32 entry_put;
	u32 entry_get;
};

/* allocate private cmd buffer queue.
   used for inserting commands before/after user submitted buffers. */
int nvgpu_priv_cmdbuf_queue_alloc(struct nvgpu_channel *ch,
		u32 num_in_flight)
{
	struct gk20a *g = ch->g;
	struct vm_gk20a *ch_vm = ch->vm;
	struct priv_cmd_queue *q;
	u64 size, tmp_size;
	int err = 0;
	u32 wait_size, incr_size;

	/*
	 * sema size is at least as much as syncpt size, but semas may not be
	 * enabled in the build. If neither semas nor syncpts are enabled, priv
	 * cmdbufs and as such kernel mode submits with job tracking won't be
	 * supported.
	 */
#ifdef CONFIG_NVGPU_SW_SEMAPHORE
	wait_size = g->ops.sync.sema.get_wait_cmd_size();
	incr_size = g->ops.sync.sema.get_incr_cmd_size();
#else
	wait_size = g->ops.sync.syncpt.get_wait_cmd_size();
	incr_size = g->ops.sync.syncpt.get_incr_cmd_size(true);
#endif

	/*
	 * Compute the amount of priv_cmdbuf space we need. In general the
	 * worst case is the kernel inserts both a semaphore pre-fence and
	 * post-fence. Any sync-pt fences will take less memory so we can
	 * ignore them unless they're the only supported type. Jobs can also
	 * have more than one pre-fence but that's abnormal and we'll -EAGAIN
	 * if such jobs would fill the queue.
	 *
	 * A semaphore ACQ (fence-wait) is 8 words: semaphore_a, semaphore_b,
	 * semaphore_c, and semaphore_d. A semaphore INCR (fence-get) will be
	 * 10 words: all the same as an ACQ plus a non-stalling intr which is
	 * another 2 words. In reality these numbers vary by chip but we'll
	 * use 8 and 10 as examples.
	 *
	 * We have two cases to consider: the first is we base the size of the
	 * queue on the gpfifo count. Here we multiply by a factor of 1/3
	 * because at most a third of the GPFIFO entries can be used for
	 * user-submitted jobs; another third goes to wait entries, and the
	 * final third to incr entries. There will be one pair of acq and incr
	 * commands for each job.
	 *
	 *   gpfifo entry num * (1 / 3) * (8 + 10) * 4 bytes
	 *
	 * If instead num_in_flight is specified then we will use that to size
	 * the queue instead of a third of the gpfifo entry count. The worst
	 * case is still both sync commands (one ACQ and one INCR) per submit,
	 * so we have a queue size of:
	 *
	 *   num_in_flight * (8 + 10) * 4 bytes
	 */
	if (num_in_flight == 0U) {
		/* round down to ensure space for all priv cmds */
		num_in_flight = ch->gpfifo.entry_num / 3U;
	}

	size = num_in_flight * (wait_size + incr_size) * sizeof(u32);

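	/*
	 * Illustrative numbers only: with the example sizes above (8-word
	 * wait, 10-word incr) and a 1024-entry gpfifo with num_in_flight == 0,
	 * this is (1024 / 3) * (8 + 10) * 4 = 24552 bytes, which the rounding
	 * below turns into 32 KiB (assuming 4 KiB pages).
	 */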
	tmp_size = PAGE_ALIGN(roundup_pow_of_two(size));
	if (tmp_size > U32_MAX) {
		return -ERANGE;
	}
	size = (u32)tmp_size;

	q = nvgpu_kzalloc(g, sizeof(*q));
	if (q == NULL) {
		return -ENOMEM;
	}

	if (num_in_flight > U32_MAX / 2U) {
		err = -ERANGE;
		goto err_free_queue;
	}

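	/*
	 * In the common case each job needs up to two priv cmd entries, one
	 * wait and one incr (see the sizing comment above), so reserve
	 * metadata for twice the number of in-flight jobs.
	 */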
	q->entries_len = 2U * num_in_flight;
	q->entries = nvgpu_vzalloc(g,
			nvgpu_safe_mult_u64((u64)q->entries_len,
				sizeof(*q->entries)));
	if (q->entries == NULL) {
		err = -ENOMEM;
		goto err_free_queue;
	}

	err = nvgpu_dma_alloc_map_sys(ch_vm, size, &q->mem);
	if (err != 0) {
		nvgpu_err(g, "%s: memory allocation failed", __func__);
		goto err_free_entries;
	}

	tmp_size = q->mem.size / sizeof(u32);
	nvgpu_assert(tmp_size <= U32_MAX);
	q->size = (u32)tmp_size;

	ch->priv_cmd_q = q;

	return 0;

err_free_entries:
	nvgpu_vfree(g, q->entries);
err_free_queue:
	nvgpu_kfree(g, q);
	return err;
}

void nvgpu_priv_cmdbuf_queue_free(struct nvgpu_channel *ch)
{
	struct vm_gk20a *ch_vm = ch->vm;
	struct priv_cmd_queue *q = ch->priv_cmd_q;
	struct gk20a *g = ch->g;

	if (q == NULL) {
		return;
	}

	nvgpu_dma_unmap_free(ch_vm, &q->mem);
	nvgpu_vfree(g, q->entries);
	nvgpu_kfree(g, q);

	ch->priv_cmd_q = NULL;
}

/* allocate a cmd buffer with given size. size is number of u32 entries */
static int nvgpu_priv_cmdbuf_alloc_buf(struct nvgpu_channel *c, u32 orig_size,
		struct priv_cmd_entry *e)
{
	struct priv_cmd_queue *q = c->priv_cmd_q;
	u32 size = orig_size;
	u32 free_count;

	nvgpu_log_fn(c->g, "size %d", orig_size);

	/*
	 * If free space in the end is less than requested, increase the size
	 * to make the real allocated space start from beginning. The hardware
	 * expects each cmdbuf to be contiguous in the dma space.
	 *
	 * The leftover space at the end may be too small because the
	 * requested wait and incr command buffers do not necessarily align
	 * with the whole buffer capacity. They don't always align because the
	 * buffer size is rounded to the next power of two and because not all
	 * jobs necessarily use exactly one wait command.
	 */
	if (nvgpu_safe_add_u32(q->put, size) > q->size) {
		size = orig_size + (q->size - q->put);
	}

	nvgpu_log_info(c->g, "ch %d: priv cmd queue get:put %d:%d",
			c->chid, q->get, q->put);

	nvgpu_assert(q->put < q->size);
	nvgpu_assert(q->get < q->size);
	nvgpu_assert(q->size > 0U);
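	/*
	 * Standard power-of-two ring buffer arithmetic: q->size is expected
	 * to be a power of two (see the rounding in queue_alloc), and one
	 * word is left unused so that a full queue can be distinguished from
	 * an empty one.
	 */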
	free_count = (q->size - q->put + q->get - 1U) & (q->size - 1U);

	if (size > free_count) {
		return -EAGAIN;
	}

	e->fill_off = 0;
	e->size = orig_size;
	e->alloc_size = size;
	e->mem = &q->mem;

	/*
	 * if we have increased size to skip free space in the end, set put
	 * to beginning of cmd buffer + size, as if the prev put was at
	 * position 0.
	 */
	if (size != orig_size) {
		e->off = 0;
		e->gva = q->mem.gpu_va;
		q->put = orig_size;
	} else {
		e->off = q->put;
		e->gva = nvgpu_safe_add_u64(q->mem.gpu_va,
				nvgpu_safe_mult_u64((u64)q->put, sizeof(u32)));
		q->put = (q->put + orig_size) & (q->size - 1U);
	}

	/* we already handled q->put + size > q->size so BUG_ON this */
	BUG_ON(q->put > q->size);

	/*
	 * commit the previous writes before making the entry valid.
	 * see the corresponding nvgpu_smp_rmb() in
	 * nvgpu_priv_cmdbuf_free().
	 */
	nvgpu_smp_wmb();

	e->valid = true;
	nvgpu_log_fn(c->g, "done");

	return 0;
}

int nvgpu_priv_cmdbuf_alloc(struct nvgpu_channel *c, u32 size,
		struct priv_cmd_entry **e)
{
	struct priv_cmd_queue *q = c->priv_cmd_q;
	u32 next_put = nvgpu_safe_add_u32(q->entry_put, 1U) % q->entries_len;
	struct priv_cmd_entry *entry;
	int err;

	if (next_put == q->entry_get) {
		return -EAGAIN;
	}
	entry = &q->entries[q->entry_put];

	err = nvgpu_priv_cmdbuf_alloc_buf(c, size, entry);
	if (err != 0) {
		return err;
	}

	q->entry_put = next_put;
	*e = entry;

	return 0;
}

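/*
 * Put back the most recently allocated entry. Allocations are undone in
 * reverse order; this is intended for error paths that run before the job
 * gets submitted.
 */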
void nvgpu_priv_cmdbuf_rollback(struct nvgpu_channel *ch,
		struct priv_cmd_entry *e)
{
	struct priv_cmd_queue *q = ch->priv_cmd_q;

	nvgpu_assert(q->put < q->size);
	nvgpu_assert(q->size > 0U);
	nvgpu_assert(e->alloc_size <= q->size);
	q->put = (q->put + q->size - e->alloc_size) & (q->size - 1U);

	(void)memset(e, 0, sizeof(*e));

	nvgpu_assert(q->entry_put < q->entries_len);
	nvgpu_assert(q->entries_len > 0U);
	q->entry_put = (q->entry_put + q->entries_len - 1U)
		% q->entries_len;
}

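/*
 * Release the oldest in-flight entry when its job completes. Entries are
 * expected to be freed in allocation order; the out-of-order check below
 * flags violations.
 */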
void nvgpu_priv_cmdbuf_free(struct nvgpu_channel *ch,
		struct priv_cmd_entry *e)
{
	struct priv_cmd_queue *q = ch->priv_cmd_q;
	struct gk20a *g = ch->g;

	if (e->valid) {
		/* read the entry's valid flag before reading its contents */
		nvgpu_smp_rmb();
		if ((q->get != e->off) && e->off != 0U) {
			nvgpu_err(g, "requests out-of-order, ch=%d",
					ch->chid);
		}
		nvgpu_assert(q->size > 0U);
		q->get = nvgpu_safe_add_u32(e->off, e->size) & (q->size - 1U);
		q->entry_get = nvgpu_safe_add_u32(q->entry_get, 1U)
			% q->entries_len;
	}

	(void)memset(e, 0, sizeof(*e));
}

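/*
 * An entry is filled with nvgpu_priv_cmdbuf_append() and
 * nvgpu_priv_cmdbuf_append_zeros() until fill_off reaches the requested
 * size, and then nvgpu_priv_cmdbuf_finish() provides the GPU VA and size
 * to write into the pushbuf entry.
 */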
void nvgpu_priv_cmdbuf_append(struct gk20a *g, struct priv_cmd_entry *e,
		u32 *data, u32 entries)
{
	nvgpu_assert(e->fill_off + entries <= e->size);
	nvgpu_mem_wr_n(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
			data, entries * sizeof(u32));
	e->fill_off += entries;
}

void nvgpu_priv_cmdbuf_append_zeros(struct gk20a *g, struct priv_cmd_entry *e,
		u32 entries)
{
	nvgpu_assert(e->fill_off + entries <= e->size);
	nvgpu_memset(g, e->mem, (e->off + e->fill_off) * sizeof(u32),
			0, entries * sizeof(u32));
	e->fill_off += entries;
}

void nvgpu_priv_cmdbuf_finish(struct gk20a *g, struct priv_cmd_entry *e,
		u64 *gva, u32 *size)
{
	/*
	 * The size is written to the pushbuf entry, so make sure this buffer
	 * is complete at this point. The responsibility of the channel sync
	 * is to be consistent in allocation and usage, and the matching size
	 * and add gops (e.g., get_wait_cmd_size, add_wait_cmd) help there.
	 */
	nvgpu_assert(e->fill_off == e->size);

#ifdef CONFIG_NVGPU_TRACE
	if (e->mem->aperture == APERTURE_SYSMEM) {
		trace_gk20a_push_cmdbuf(g->name, 0, e->size, 0,
				(u32 *)e->mem->cpu_va + e->off);
	}
#endif

	*gva = e->gva;
	*size = e->size;
}