gpu: nvgpu: move submit path to linux

The nvgpu submit path has a lot of dependencies on the Linux framework,
e.g. use of copy_from_user, use of structures defined in uapi/nvgpu
headers, dma_buf_* calls for trace support, etc.

Hence, to keep the common code independent of Linux code, move the
submit path to the Linux directory.

Move the following APIs to common/linux/channel.c:
trace_write_pushbuffer()
trace_write_pushbuffer_range()
gk20a_submit_prepare_syncs()
gk20a_submit_append_priv_cmdbuf()
gk20a_submit_append_gpfifo()
gk20a_submit_channel_gpfifo()

Move the following API to common/linux/ce2.c:
gk20a_ce_execute_ops()

Define gk20a_ce_execute_ops() in common/linux/ce2.c, and declare it in
gk20a/ce2_gk20a.h since it is needed in common/mm code too.
Each OS needs to implement this API separately.
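
A rough sketch of the resulting split (the prototype below is copied from
the Linux definition in the diff; treat the exact placement as illustrative):

    /* gk20a/ce2_gk20a.h: declared once for common code (e.g. common/mm),
     * defined per OS; the Linux definition lives in common/linux/ce2.c. */
    int gk20a_ce_execute_ops(struct gk20a *g,
                    u32 ce_ctx_id,
                    u64 src_buf, u64 dst_buf, u64 size,
                    unsigned int payload,
                    int launch_flags, int request_operation,
                    struct gk20a_fence *gk20a_fence_in,
                    u32 submit_flags,
                    struct gk20a_fence **gk20a_fence_out);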

gk20a_channel_alloc_gpfifo() uses sizeof(nvgpu_gpfifo) to get the size
of one gpfifo entry, but the structure nvgpu_gpfifo is Linux-specific.
Define a new nvgpu_get_gpfifo_entry_size() in Linux-specific code and
use it in gk20a_channel_alloc_gpfifo() to get the gpfifo entry size.
Each OS needs to implement this API separately.
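
As added in this change, the Linux implementation in common/linux/channel.c
simply wraps sizeof() of the Linux-specific struct, and the common allocator
uses the returned size instead of sizeof(struct nvgpu_gpfifo):

    u32 nvgpu_get_gpfifo_entry_size(void)
    {
            return sizeof(struct nvgpu_gpfifo);
    }

    /* in gk20a_channel_alloc_gpfifo() */
    gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
    err = nvgpu_dma_alloc_map_sys(ch_vm,
                    gpfifo_size * gpfifo_entry_size,
                    &c->gpfifo.mem);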

Export some APIs from gk20a/ce2_gk20a.h and gk20a/channel_gk20a.h that
are needed in Linux code.
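
For reference, the declarations newly exposed from gk20a/channel_gk20a.h in
this change include (see the header diff at the end):

    int channel_gk20a_alloc_job(struct channel_gk20a *c,
                    struct channel_gk20a_job **job_out);
    void channel_gk20a_free_job(struct channel_gk20a *c,
                    struct channel_gk20a_job *job);
    u32 nvgpu_gp_free_count(struct channel_gk20a *c);
    u32 nvgpu_get_gp_free_count(struct channel_gk20a *c);
    void gk20a_channel_clean_up_jobs(struct channel_gk20a *c, bool clean_all);
    u32 nvgpu_get_gpfifo_entry_size(void);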

Jira NVGPU-259
Jira NVGPU-313

Change-Id: I360c6cb8ce4494b1e50c66af334a2a379f0d2dc4
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1586277
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Deepak Nibade
2017-10-26 08:29:56 -07:00
committed by mobile promotions
parent 5f8cfaa250
commit 23c7903eff
10 changed files with 917 additions and 827 deletions


@@ -54,6 +54,8 @@ nvgpu-y := \
common/linux/comptags.o \
common/linux/dmabuf.o \
common/linux/sched.o \
common/linux/channel.o \
common/linux/ce2.o \
common/mm/nvgpu_allocator.o \
common/mm/bitmap_allocator.o \
common/mm/buddy_allocator.o \


@@ -42,6 +42,7 @@
#include "cde.h" #include "cde.h"
#include "os_linux.h" #include "os_linux.h"
#include "dmabuf.h" #include "dmabuf.h"
#include "channel.h"
#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h> #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h> #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>


@@ -0,0 +1,185 @@
/*
* Copyright (c) 2017, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <nvgpu/types.h>
#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
#include "gk20a/ce2_gk20a.h"
#include "gk20a/gk20a.h"
#include "channel.h"
static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
{
/* there is no local memory available,
don't allow local memory related CE flags */
if (!g->mm.vidmem.size) {
launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
NVGPU_CE_DST_LOCATION_LOCAL_FB);
}
return launch_flags;
}
int gk20a_ce_execute_ops(struct gk20a *g,
u32 ce_ctx_id,
u64 src_buf,
u64 dst_buf,
u64 size,
unsigned int payload,
int launch_flags,
int request_operation,
struct gk20a_fence *gk20a_fence_in,
u32 submit_flags,
struct gk20a_fence **gk20a_fence_out)
{
int ret = -EPERM;
struct gk20a_ce_app *ce_app = &g->ce_app;
struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
bool found = false;
u32 *cmd_buf_cpu_va;
u64 cmd_buf_gpu_va = 0;
u32 methodSize;
u32 cmd_buf_read_offset;
u32 fence_index;
struct nvgpu_gpfifo gpfifo;
struct nvgpu_fence fence = {0,0};
struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
goto end;
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, gk20a_gpu_ctx, list) {
if (ce_ctx->ctx_id == ce_ctx_id) {
found = true;
break;
}
}
nvgpu_mutex_release(&ce_app->app_mutex);
if (!found) {
ret = -EINVAL;
goto end;
}
if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
ret = -ENODEV;
goto end;
}
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
/* at end of command buffer has gk20a_fence for command buffer sync */
fence_index = (cmd_buf_read_offset +
((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
ret = -ENOMEM;
goto noop;
}
cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
/* 0 is treated as invalid pre-sync */
if (cmd_buf_cpu_va[fence_index]) {
struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
memcpy((void *)&ce_cmd_buf_fence_in,
(void *)(cmd_buf_cpu_va + fence_index),
sizeof(struct gk20a_fence *));
ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in,
gk20a_get_gr_idle_timeout(g));
gk20a_fence_put(ce_cmd_buf_fence_in);
/* Reset the stored last pre-sync */
memset((void *)(cmd_buf_cpu_va + fence_index),
0,
NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
if (ret)
goto noop;
}
cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
methodSize = gk20a_ce_prepare_submit(src_buf,
dst_buf,
size,
&cmd_buf_cpu_va[cmd_buf_read_offset],
NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
payload,
gk20a_get_valid_launch_flags(g, launch_flags),
request_operation,
gpu_capability->dma_copy_class,
gk20a_fence_in);
if (methodSize) {
/* TODO: Remove CPU pre-fence wait */
if (gk20a_fence_in) {
ret = gk20a_fence_wait(g, gk20a_fence_in,
gk20a_get_gr_idle_timeout(g));
gk20a_fence_put(gk20a_fence_in);
if (ret)
goto noop;
}
/* store the element into gpfifo */
gpfifo.entry0 =
u64_lo32(cmd_buf_gpu_va);
gpfifo.entry1 =
(u64_hi32(cmd_buf_gpu_va) |
pbdma_gp_entry1_length_f(methodSize));
/* take always the postfence as it is needed for protecting the ce context */
submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
nvgpu_smp_wmb();
ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
1, submit_flags, &fence,
&ce_cmd_buf_fence_out, false, NULL);
if (!ret) {
memcpy((void *)(cmd_buf_cpu_va + fence_index),
(void *)&ce_cmd_buf_fence_out,
sizeof(struct gk20a_fence *));
if (gk20a_fence_out) {
gk20a_fence_get(ce_cmd_buf_fence_out);
*gk20a_fence_out = ce_cmd_buf_fence_out;
}
/* Next available command buffer queue Index */
++ce_ctx->cmd_buf_read_queue_offset;
++ce_ctx->submitted_seq_number;
}
} else {
ret = -ENOMEM;
}
noop:
nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
end:
return ret;
}


@@ -0,0 +1,648 @@
/*
* Copyright (c) 2017, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <nvgpu/enabled.h>
#include <nvgpu/debug.h>
#include <nvgpu/ltc.h>
/*
* This is required for nvgpu_vm_find_buf() which is used in the tracing
* code. Once we can get and access userspace buffers without requiring
* direct dma_buf usage this can be removed.
*/
#include <nvgpu/linux/vm.h>
#include "gk20a/gk20a.h"
#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
#include <linux/uaccess.h>
#include <linux/dma-buf.h>
#include <trace/events/gk20a.h>
u32 nvgpu_get_gpfifo_entry_size(void)
{
return sizeof(struct nvgpu_gpfifo);
}
#ifdef CONFIG_DEBUG_FS
static void trace_write_pushbuffer(struct channel_gk20a *c,
struct nvgpu_gpfifo *g)
{
void *mem = NULL;
unsigned int words;
u64 offset;
struct dma_buf *dmabuf = NULL;
if (gk20a_debug_trace_cmdbuf) {
u64 gpu_va = (u64)g->entry0 |
(u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
int err;
words = pbdma_gp_entry1_length_v(g->entry1);
err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset);
if (!err)
mem = dma_buf_vmap(dmabuf);
}
if (mem) {
u32 i;
/*
* Write in batches of 128 as there seems to be a limit
* of how much you can output to ftrace at once.
*/
for (i = 0; i < words; i += 128U) {
trace_gk20a_push_cmdbuf(
c->g->name,
0,
min(words - i, 128U),
offset + i * sizeof(u32),
mem);
}
dma_buf_vunmap(dmabuf, mem);
}
}
#endif
static void trace_write_pushbuffer_range(struct channel_gk20a *c,
struct nvgpu_gpfifo *g,
struct nvgpu_gpfifo __user *user_gpfifo,
int offset,
int count)
{
#ifdef CONFIG_DEBUG_FS
u32 size;
int i;
struct nvgpu_gpfifo *gp;
bool gpfifo_allocated = false;
if (!gk20a_debug_trace_cmdbuf)
return;
if (!g && !user_gpfifo)
return;
if (!g) {
size = count * sizeof(struct nvgpu_gpfifo);
if (size) {
g = nvgpu_big_malloc(c->g, size);
if (!g)
return;
if (copy_from_user(g, user_gpfifo, size)) {
nvgpu_big_free(c->g, g);
return;
}
}
gpfifo_allocated = true;
}
gp = g + offset;
for (i = 0; i < count; i++, gp++)
trace_write_pushbuffer(c, gp);
if (gpfifo_allocated)
nvgpu_big_free(c->g, g);
#endif
}
/*
* Handle the submit synchronization - pre-fences and post-fences.
*/
static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
struct nvgpu_fence *fence,
struct channel_gk20a_job *job,
struct priv_cmd_entry **wait_cmd,
struct priv_cmd_entry **incr_cmd,
struct gk20a_fence **pre_fence,
struct gk20a_fence **post_fence,
bool force_need_sync_fence,
bool register_irq,
u32 flags)
{
struct gk20a *g = c->g;
bool need_sync_fence = false;
bool new_sync_created = false;
int wait_fence_fd = -1;
int err = 0;
bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
/*
* If user wants to always allocate sync_fence_fds then respect that;
* otherwise, allocate sync_fence_fd based on user flags.
*/
if (force_need_sync_fence)
need_sync_fence = true;
if (g->aggressive_sync_destroy_thresh) {
nvgpu_mutex_acquire(&c->sync_lock);
if (!c->sync) {
c->sync = gk20a_channel_sync_create(c);
if (!c->sync) {
err = -ENOMEM;
nvgpu_mutex_release(&c->sync_lock);
goto fail;
}
new_sync_created = true;
}
nvgpu_atomic_inc(&c->sync->refcount);
nvgpu_mutex_release(&c->sync_lock);
}
if (g->ops.fifo.resetup_ramfc && new_sync_created) {
err = g->ops.fifo.resetup_ramfc(c);
if (err)
goto fail;
}
/*
* Optionally insert syncpt wait in the beginning of gpfifo submission
* when user requested and the wait hasn't expired. Validate that the id
* makes sense, elide if not. The only reason this isn't being
* unceremoniously killed is to keep running some tests which trigger
* this condition.
*/
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
job->pre_fence = gk20a_alloc_fence(c);
if (!job->pre_fence) {
err = -ENOMEM;
goto fail;
}
if (!pre_alloc_enabled)
job->wait_cmd = nvgpu_kzalloc(g,
sizeof(struct priv_cmd_entry));
if (!job->wait_cmd) {
err = -ENOMEM;
goto clean_up_pre_fence;
}
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
wait_fence_fd = fence->id;
err = c->sync->wait_fd(c->sync, wait_fence_fd,
job->wait_cmd, job->pre_fence);
} else {
err = c->sync->wait_syncpt(c->sync, fence->id,
fence->value, job->wait_cmd,
job->pre_fence);
}
if (!err) {
if (job->wait_cmd->valid)
*wait_cmd = job->wait_cmd;
*pre_fence = job->pre_fence;
} else
goto clean_up_wait_cmd;
}
if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
need_sync_fence = true;
/*
* Always generate an increment at the end of a GPFIFO submission. This
* is used to keep track of method completion for idle railgating. The
* sync_pt/semaphore PB is added to the GPFIFO later on in submit.
*/
job->post_fence = gk20a_alloc_fence(c);
if (!job->post_fence) {
err = -ENOMEM;
goto clean_up_wait_cmd;
}
if (!pre_alloc_enabled)
job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
if (!job->incr_cmd) {
err = -ENOMEM;
goto clean_up_post_fence;
}
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
job->post_fence, need_wfi, need_sync_fence,
register_irq);
else
err = c->sync->incr(c->sync, job->incr_cmd,
job->post_fence, need_sync_fence,
register_irq);
if (!err) {
*incr_cmd = job->incr_cmd;
*post_fence = job->post_fence;
} else
goto clean_up_incr_cmd;
return 0;
clean_up_incr_cmd:
free_priv_cmdbuf(c, job->incr_cmd);
if (!pre_alloc_enabled)
job->incr_cmd = NULL;
clean_up_post_fence:
gk20a_fence_put(job->post_fence);
job->post_fence = NULL;
clean_up_wait_cmd:
free_priv_cmdbuf(c, job->wait_cmd);
if (!pre_alloc_enabled)
job->wait_cmd = NULL;
clean_up_pre_fence:
gk20a_fence_put(job->pre_fence);
job->pre_fence = NULL;
fail:
*wait_cmd = NULL;
*pre_fence = NULL;
return err;
}
static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *cmd)
{
struct gk20a *g = c->g;
struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
struct nvgpu_gpfifo x = {
.entry0 = u64_lo32(cmd->gva),
.entry1 = u64_hi32(cmd->gva) |
pbdma_gp_entry1_length_f(cmd->size)
};
nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
&x, sizeof(x));
if (cmd->mem->aperture == APERTURE_SYSMEM)
trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
cmd->mem->cpu_va + cmd->off * sizeof(u32));
c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
}
/*
* Copy source gpfifo entries into the gpfifo ring buffer, potentially
* splitting into two memcpys to handle wrap-around.
*/
static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
struct nvgpu_gpfifo *kern_gpfifo,
struct nvgpu_gpfifo __user *user_gpfifo,
u32 num_entries)
{
/* byte offsets */
u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
u32 end = start + len; /* exclusive */
struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
struct nvgpu_gpfifo *cpu_src;
int err;
if (user_gpfifo && !c->gpfifo.pipe) {
/*
* This path (from userspace to sysmem) is special in order to
* avoid two copies unnecessarily (from user to pipe, then from
* pipe to gpu sysmem buffer).
*
* As a special case, the pipe buffer exists if PRAMIN writes
* are forced, although the buffers may not be in vidmem in
* that case.
*/
if (end > gpfifo_size) {
/* wrap-around */
int length0 = gpfifo_size - start;
int length1 = len - length0;
void __user *user2 = (u8 __user *)user_gpfifo + length0;
err = copy_from_user(gpfifo_mem->cpu_va + start,
user_gpfifo, length0);
if (err)
return err;
err = copy_from_user(gpfifo_mem->cpu_va,
user2, length1);
if (err)
return err;
} else {
err = copy_from_user(gpfifo_mem->cpu_va + start,
user_gpfifo, len);
if (err)
return err;
}
trace_write_pushbuffer_range(c, NULL, user_gpfifo,
0, num_entries);
goto out;
} else if (user_gpfifo) {
/* from userspace to vidmem or sysmem when pramin forced, use
* the common copy path below */
err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
if (err)
return err;
cpu_src = c->gpfifo.pipe;
} else {
/* from kernel to either sysmem or vidmem, don't need
* copy_from_user so use the common path below */
cpu_src = kern_gpfifo;
}
if (end > gpfifo_size) {
/* wrap-around */
int length0 = gpfifo_size - start;
int length1 = len - length0;
void *src2 = (u8 *)cpu_src + length0;
nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
} else {
nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
}
trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
out:
c->gpfifo.put = (c->gpfifo.put + num_entries) &
(c->gpfifo.entry_num - 1);
return 0;
}
int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_gpfifo *gpfifo,
struct nvgpu_submit_gpfifo_args *args,
u32 num_entries,
u32 flags,
struct nvgpu_fence *fence,
struct gk20a_fence **fence_out,
bool force_need_sync_fence,
struct fifo_profile_gk20a *profile)
{
struct gk20a *g = c->g;
struct priv_cmd_entry *wait_cmd = NULL;
struct priv_cmd_entry *incr_cmd = NULL;
struct gk20a_fence *pre_fence = NULL;
struct gk20a_fence *post_fence = NULL;
struct channel_gk20a_job *job = NULL;
/* we might need two extra gpfifo entries - one for pre fence
* and one for post fence. */
const int extra_entries = 2;
bool skip_buffer_refcounting = (flags &
NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
int err = 0;
bool need_job_tracking;
bool need_deferred_cleanup = false;
struct nvgpu_gpfifo __user *user_gpfifo = args ?
(struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
return -ENODEV;
if (c->has_timedout)
return -ETIMEDOUT;
if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
return -ENOMEM;
/* fifo not large enough for request. Return error immediately.
* Kernel can insert gpfifo entries before and after user gpfifos.
* So, add extra_entries in user request. Also, HW with fifo size N
* can accept only N-1 entreis and so the below condition */
if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
nvgpu_err(g, "not enough gpfifo space allocated");
return -ENOMEM;
}
if (!gpfifo && !args)
return -EINVAL;
if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
!fence)
return -EINVAL;
/* an address space needs to have been bound at this point. */
if (!gk20a_channel_as_bound(c)) {
nvgpu_err(g,
"not bound to an address space at time of gpfifo"
" submission.");
return -EINVAL;
}
if (profile)
profile->timestamp[PROFILE_ENTRY] = sched_clock();
/* update debug settings */
nvgpu_ltc_sync_enabled(g);
gk20a_dbg_info("channel %d", c->chid);
/*
* Job tracking is necessary for any of the following conditions:
* - pre- or post-fence functionality
* - channel wdt
* - GPU rail-gating with non-deterministic channels
* - buffer refcounting
*
* If none of the conditions are met, then job tracking is not
* required and a fast submit can be done (ie. only need to write
* out userspace GPFIFO entries and update GP_PUT).
*/
need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
c->wdt_enabled ||
(g->can_railgate && !c->deterministic) ||
!skip_buffer_refcounting;
if (need_job_tracking) {
bool need_sync_framework = false;
/*
* If the channel is to have deterministic latency and
* job tracking is required, the channel must have
* pre-allocated resources. Otherwise, we fail the submit here
*/
if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
return -EINVAL;
need_sync_framework = force_need_sync_fence ||
gk20a_channel_sync_needs_sync_framework(g) ||
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
/*
* Deferred clean-up is necessary for any of the following
* conditions:
* - channel's deterministic flag is not set
* - dependency on sync framework, which could make the
* behavior of the clean-up operation non-deterministic
* (should not be performed in the submit path)
* - channel wdt
* - GPU rail-gating with non-deterministic channels
* - buffer refcounting
*
* If none of the conditions are met, then deferred clean-up
* is not required, and we clean-up one job-tracking
* resource in the submit path.
*/
need_deferred_cleanup = !c->deterministic ||
need_sync_framework ||
c->wdt_enabled ||
(g->can_railgate &&
!c->deterministic) ||
!skip_buffer_refcounting;
/*
* For deterministic channels, we don't allow deferred clean_up
* processing to occur. In cases we hit this, we fail the submit
*/
if (c->deterministic && need_deferred_cleanup)
return -EINVAL;
if (!c->deterministic) {
/*
* Get a power ref unless this is a deterministic
* channel that holds them during the channel lifetime.
* This one is released by gk20a_channel_clean_up_jobs,
* via syncpt or sema interrupt, whichever is used.
*/
err = gk20a_busy(g);
if (err) {
nvgpu_err(g,
"failed to host gk20a to submit gpfifo, process %s",
current->comm);
return err;
}
}
if (!need_deferred_cleanup) {
/* clean up a single job */
gk20a_channel_clean_up_jobs(c, false);
}
}
/* Grab access to HW to deal with do_idle */
if (c->deterministic)
nvgpu_rwsem_down_read(&g->deterministic_busy);
trace_gk20a_channel_submit_gpfifo(g->name,
c->chid,
num_entries,
flags,
fence ? fence->id : 0,
fence ? fence->value : 0);
gk20a_dbg_info("pre-submit put %d, get %d, size %d",
c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
/*
* Make sure we have enough space for gpfifo entries. Check cached
* values first and then read from HW. If no space, return EAGAIN
* and let userpace decide to re-try request or not.
*/
if (nvgpu_gp_free_count(c) < num_entries + extra_entries) {
if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) {
err = -EAGAIN;
goto clean_up;
}
}
if (c->has_timedout) {
err = -ETIMEDOUT;
goto clean_up;
}
if (need_job_tracking) {
err = channel_gk20a_alloc_job(c, &job);
if (err)
goto clean_up;
err = gk20a_submit_prepare_syncs(c, fence, job,
&wait_cmd, &incr_cmd,
&pre_fence, &post_fence,
force_need_sync_fence,
need_deferred_cleanup,
flags);
if (err)
goto clean_up_job;
}
if (profile)
profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
if (wait_cmd)
gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
if (gpfifo || user_gpfifo)
err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
num_entries);
if (err)
goto clean_up_job;
/*
* And here's where we add the incr_cmd we generated earlier. It should
* always run!
*/
if (incr_cmd)
gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
if (fence_out)
*fence_out = gk20a_fence_get(post_fence);
if (need_job_tracking)
/* TODO! Check for errors... */
gk20a_channel_add_job(c, job, skip_buffer_refcounting);
if (profile)
profile->timestamp[PROFILE_APPEND] = sched_clock();
g->ops.fifo.userd_gp_put(g, c);
if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) &&
g->ops.fifo.reschedule_runlist)
g->ops.fifo.reschedule_runlist(g, c->runlist_id);
/* No hw access beyond this point */
if (c->deterministic)
nvgpu_rwsem_up_read(&g->deterministic_busy);
trace_gk20a_channel_submitted_gpfifo(g->name,
c->chid,
num_entries,
flags,
post_fence ? post_fence->syncpt_id : 0,
post_fence ? post_fence->syncpt_value : 0);
gk20a_dbg_info("post-submit put %d, get %d, size %d",
c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
if (profile)
profile->timestamp[PROFILE_END] = sched_clock();
gk20a_dbg_fn("done");
return err;
clean_up_job:
channel_gk20a_free_job(c, job);
clean_up:
gk20a_dbg_fn("fail");
gk20a_fence_put(pre_fence);
gk20a_fence_put(post_fence);
if (c->deterministic)
nvgpu_rwsem_up_read(&g->deterministic_busy);
else if (need_deferred_cleanup)
gk20a_idle(g);
return err;
}


@@ -0,0 +1,38 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __NVGPU_CHANNEL_H__
#define __NVGPU_CHANNEL_H__
#include <nvgpu/types.h>
struct channel_gk20a;
struct nvgpu_gpfifo;
struct nvgpu_submit_gpfifo_args;
struct nvgpu_fence;
struct gk20a_fence;
struct fifo_profile_gk20a;
int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_gpfifo *gpfifo,
struct nvgpu_submit_gpfifo_args *args,
u32 num_entries,
u32 flags,
struct nvgpu_fence *fence,
struct gk20a_fence **fence_out,
bool force_need_sync_fence,
struct fifo_profile_gk20a *profile);
#endif /* __NVGPU_CHANNEL_H__ */


@@ -36,6 +36,7 @@
#include "gk20a/platform_gk20a.h" #include "gk20a/platform_gk20a.h"
#include "ioctl_channel.h" #include "ioctl_channel.h"
#include "channel.h"
#include "os_linux.h" #include "os_linux.h"
#include "ctxsw_trace.h" #include "ctxsw_trace.h"


@@ -249,18 +249,7 @@ static inline unsigned int gk20a_ce_get_method_size(int request_operation,
return methodsize;
}
static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
{
/* there is no local memory available,
don't allow local memory related CE flags */
if (!g->mm.vidmem.size) {
launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
NVGPU_CE_DST_LOCATION_LOCAL_FB);
}
return launch_flags;
}
static int gk20a_ce_prepare_submit(u64 src_buf,
int gk20a_ce_prepare_submit(u64 src_buf,
u64 dst_buf,
u64 size,
u32 *cmd_buf_cpu_va,
@@ -626,157 +615,6 @@ end:
}
EXPORT_SYMBOL(gk20a_ce_create_context_with_cb);
int gk20a_ce_execute_ops(struct gk20a *g,
u32 ce_ctx_id,
u64 src_buf,
u64 dst_buf,
u64 size,
unsigned int payload,
int launch_flags,
int request_operation,
struct gk20a_fence *gk20a_fence_in,
u32 submit_flags,
struct gk20a_fence **gk20a_fence_out)
{
int ret = -EPERM;
struct gk20a_ce_app *ce_app = &g->ce_app;
struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
bool found = false;
u32 *cmd_buf_cpu_va;
u64 cmd_buf_gpu_va = 0;
u32 methodSize;
u32 cmd_buf_read_offset;
u32 fence_index;
struct nvgpu_gpfifo gpfifo;
struct nvgpu_fence fence = {0,0};
struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
goto end;
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, gk20a_gpu_ctx, list) {
if (ce_ctx->ctx_id == ce_ctx_id) {
found = true;
break;
}
}
nvgpu_mutex_release(&ce_app->app_mutex);
if (!found) {
ret = -EINVAL;
goto end;
}
if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
ret = -ENODEV;
goto end;
}
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
/* at end of command buffer has gk20a_fence for command buffer sync */
fence_index = (cmd_buf_read_offset +
((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
ret = -ENOMEM;
goto noop;
}
cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
/* 0 is treated as invalid pre-sync */
if (cmd_buf_cpu_va[fence_index]) {
struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
memcpy((void *)&ce_cmd_buf_fence_in,
(void *)(cmd_buf_cpu_va + fence_index),
sizeof(struct gk20a_fence *));
ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in,
gk20a_get_gr_idle_timeout(g));
gk20a_fence_put(ce_cmd_buf_fence_in);
/* Reset the stored last pre-sync */
memset((void *)(cmd_buf_cpu_va + fence_index),
0,
NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
if (ret)
goto noop;
}
cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
methodSize = gk20a_ce_prepare_submit(src_buf,
dst_buf,
size,
&cmd_buf_cpu_va[cmd_buf_read_offset],
NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
payload,
gk20a_get_valid_launch_flags(g, launch_flags),
request_operation,
gpu_capability->dma_copy_class,
gk20a_fence_in);
if (methodSize) {
/* TODO: Remove CPU pre-fence wait */
if (gk20a_fence_in) {
ret = gk20a_fence_wait(g, gk20a_fence_in,
gk20a_get_gr_idle_timeout(g));
gk20a_fence_put(gk20a_fence_in);
if (ret)
goto noop;
}
/* store the element into gpfifo */
gpfifo.entry0 =
u64_lo32(cmd_buf_gpu_va);
gpfifo.entry1 =
(u64_hi32(cmd_buf_gpu_va) |
pbdma_gp_entry1_length_f(methodSize));
/* take always the postfence as it is needed for protecting the ce context */
submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
nvgpu_smp_wmb();
ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
1, submit_flags, &fence,
&ce_cmd_buf_fence_out, false, NULL);
if (!ret) {
memcpy((void *)(cmd_buf_cpu_va + fence_index),
(void *)&ce_cmd_buf_fence_out,
sizeof(struct gk20a_fence *));
if (gk20a_fence_out) {
gk20a_fence_get(ce_cmd_buf_fence_out);
*gk20a_fence_out = ce_cmd_buf_fence_out;
}
/* Next available command buffer queue Index */
++ce_ctx->cmd_buf_read_queue_offset;
++ce_ctx->submitted_seq_number;
}
} else
ret = -ENOMEM;
noop:
nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
end:
return ret;
}
EXPORT_SYMBOL(gk20a_ce_execute_ops);
void gk20a_ce_delete_context(struct gk20a *g,
u32 ce_ctx_id)
{


@@ -161,5 +161,15 @@ void gk20a_ce_delete_context_priv(struct gk20a *g,
u32 ce_ctx_id);
void gk20a_ce_delete_context(struct gk20a *g,
u32 ce_ctx_id);
int gk20a_ce_prepare_submit(u64 src_buf,
u64 dst_buf,
u64 size,
u32 *cmd_buf_cpu_va,
u32 max_cmd_buf_size,
unsigned int payload,
int launch_flags,
int request_operation,
u32 dma_copy_class,
struct gk20a_fence *gk20a_fence_in);
#endif /*__CE2_GK20A_H__*/


@@ -44,45 +44,13 @@
#include <nvgpu/barrier.h>
#include <nvgpu/ctxsw_trace.h>
/*
* This is required for nvgpu_vm_find_buf() which is used in the tracing
* code. Once we can get and access userspace buffers without requiring
* direct dma_buf usage this can be removed.
*/
#include <nvgpu/linux/vm.h>
#include "gk20a.h" #include "gk20a.h"
#include "dbg_gpu_gk20a.h" #include "dbg_gpu_gk20a.h"
#include "fence_gk20a.h" #include "fence_gk20a.h"
#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
/*
* Note
* This is added for all the copy_from_user methods in this file which needs to
* be moved lated to reduce depenedency on Linux
*/
#include <linux/uaccess.h>
/*
* Although channels do have pointers back to the gk20a struct that they were
* created under in cases where the driver is killed that pointer can be bad.
* The channel memory can be freed before the release() function for a given
* channel is called. This happens when the driver dies and userspace doesn't
* get a chance to call release() until after the entire gk20a driver data is
* unloaded and freed.
*/
struct channel_priv {
struct gk20a *g;
struct channel_gk20a *c;
};
static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
static void gk20a_channel_dump_ref_actions(struct channel_gk20a *c);
static void free_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *e);
static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
@@ -97,9 +65,6 @@ static struct channel_gk20a_job *channel_gk20a_joblist_peek(
static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
bool clean_all);
/* allocate GPU channel */
static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
{
@@ -1038,7 +1003,7 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
/* Don't call this to free an explict cmd entry.
* It doesn't update priv_cmd_queue get/put */
static void free_priv_cmdbuf(struct channel_gk20a *c,
void free_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *e)
{
if (channel_gk20a_is_prealloc_enabled(c))
@@ -1047,7 +1012,7 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
nvgpu_kfree(c->g, e);
}
static int channel_gk20a_alloc_job(struct channel_gk20a *c,
int channel_gk20a_alloc_job(struct channel_gk20a *c,
struct channel_gk20a_job **job_out)
{
int err = 0;
@@ -1080,7 +1045,7 @@ static int channel_gk20a_alloc_job(struct channel_gk20a *c,
return err;
}
static void channel_gk20a_free_job(struct channel_gk20a *c,
void channel_gk20a_free_job(struct channel_gk20a *c,
struct channel_gk20a_job *job)
{
/*
@@ -1267,11 +1232,12 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
{
struct gk20a *g = c->g;
struct vm_gk20a *ch_vm;
u32 gpfifo_size;
u32 gpfifo_size, gpfifo_entry_size;
int err = 0;
unsigned long acquire_timeout;
gpfifo_size = num_entries;
gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED)
c->vpr = true;
@@ -1315,7 +1281,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
}
err = nvgpu_dma_alloc_map_sys(ch_vm,
gpfifo_size * sizeof(struct nvgpu_gpfifo),
gpfifo_size * gpfifo_entry_size,
&c->gpfifo.mem);
if (err) {
nvgpu_err(g, "%s: memory allocation failed", __func__);
@@ -1324,7 +1290,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
c->gpfifo.pipe = nvgpu_big_malloc(g,
gpfifo_size * sizeof(struct nvgpu_gpfifo));
gpfifo_size * gpfifo_entry_size);
if (!c->gpfifo.pipe) {
err = -ENOMEM;
goto clean_up_unmap;
@@ -1427,7 +1393,7 @@ static inline u32 update_gp_get(struct gk20a *g,
return new_get;
}
static inline u32 gp_free_count(struct channel_gk20a *c)
u32 nvgpu_gp_free_count(struct channel_gk20a *c)
{
return (c->gpfifo.entry_num - (c->gpfifo.put - c->gpfifo.get) - 1) %
c->gpfifo.entry_num;
@@ -1460,91 +1426,10 @@ static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
return ch->g->ch_wdt_timeout_ms;
}
static u32 get_gp_free_count(struct channel_gk20a *c)
u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
{
update_gp_get(c->g, c);
return gp_free_count(c);
return nvgpu_gp_free_count(c);
}
#ifdef CONFIG_DEBUG_FS
static void trace_write_pushbuffer(struct channel_gk20a *c,
struct nvgpu_gpfifo *g)
{
void *mem = NULL;
unsigned int words;
u64 offset;
struct dma_buf *dmabuf = NULL;
if (gk20a_debug_trace_cmdbuf) {
u64 gpu_va = (u64)g->entry0 |
(u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
int err;
words = pbdma_gp_entry1_length_v(g->entry1);
err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset);
if (!err)
mem = dma_buf_vmap(dmabuf);
}
if (mem) {
u32 i;
/*
* Write in batches of 128 as there seems to be a limit
* of how much you can output to ftrace at once.
*/
for (i = 0; i < words; i += 128U) {
trace_gk20a_push_cmdbuf(
c->g->name,
0,
min(words - i, 128U),
offset + i * sizeof(u32),
mem);
}
dma_buf_vunmap(dmabuf, mem);
}
}
#endif
static void trace_write_pushbuffer_range(struct channel_gk20a *c,
struct nvgpu_gpfifo *g,
struct nvgpu_gpfifo __user *user_gpfifo,
int offset,
int count)
{
#ifdef CONFIG_DEBUG_FS
u32 size;
int i;
struct nvgpu_gpfifo *gp;
bool gpfifo_allocated = false;
if (!gk20a_debug_trace_cmdbuf)
return;
if (!g && !user_gpfifo)
return;
if (!g) {
size = count * sizeof(struct nvgpu_gpfifo);
if (size) {
g = nvgpu_big_malloc(c->g, size);
if (!g)
return;
if (copy_from_user(g, user_gpfifo, size)) {
nvgpu_big_free(c->g, g);
return;
}
}
gpfifo_allocated = true;
}
gp = g + offset;
for (i = 0; i < count; i++, gp++)
trace_write_pushbuffer(c, gp);
if (gpfifo_allocated)
nvgpu_big_free(c->g, g);
#endif
}
static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
@@ -2032,7 +1917,7 @@ int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e)
return 0;
}
static int gk20a_channel_add_job(struct channel_gk20a *c,
int gk20a_channel_add_job(struct channel_gk20a *c,
struct channel_gk20a_job *job,
bool skip_buffer_refcounting)
{
@@ -2097,7 +1982,7 @@ err_put_buffers:
* per-job memory for completed jobs; in case of preallocated resources, this
* opens up slots for new jobs to be submitted.
*/
static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
bool clean_all)
{
struct vm_gk20a *vm;
@@ -2257,533 +2142,6 @@ void gk20a_channel_update(struct channel_gk20a *c)
gk20a_channel_worker_enqueue(c);
}
static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *cmd)
{
struct gk20a *g = c->g;
struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
struct nvgpu_gpfifo x = {
.entry0 = u64_lo32(cmd->gva),
.entry1 = u64_hi32(cmd->gva) |
pbdma_gp_entry1_length_f(cmd->size)
};
nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
&x, sizeof(x));
if (cmd->mem->aperture == APERTURE_SYSMEM)
trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
cmd->mem->cpu_va + cmd->off * sizeof(u32));
c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
}
/*
* Copy source gpfifo entries into the gpfifo ring buffer, potentially
* splitting into two memcpys to handle wrap-around.
*/
static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
struct nvgpu_gpfifo *kern_gpfifo,
struct nvgpu_gpfifo __user *user_gpfifo,
u32 num_entries)
{
/* byte offsets */
u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
u32 end = start + len; /* exclusive */
struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
struct nvgpu_gpfifo *cpu_src;
int err;
if (user_gpfifo && !c->gpfifo.pipe) {
/*
* This path (from userspace to sysmem) is special in order to
* avoid two copies unnecessarily (from user to pipe, then from
* pipe to gpu sysmem buffer).
*
* As a special case, the pipe buffer exists if PRAMIN writes
* are forced, although the buffers may not be in vidmem in
* that case.
*/
if (end > gpfifo_size) {
/* wrap-around */
int length0 = gpfifo_size - start;
int length1 = len - length0;
void __user *user2 = (u8 __user *)user_gpfifo + length0;
err = copy_from_user(gpfifo_mem->cpu_va + start,
user_gpfifo, length0);
if (err)
return err;
err = copy_from_user(gpfifo_mem->cpu_va,
user2, length1);
if (err)
return err;
} else {
err = copy_from_user(gpfifo_mem->cpu_va + start,
user_gpfifo, len);
if (err)
return err;
}
trace_write_pushbuffer_range(c, NULL, user_gpfifo,
0, num_entries);
goto out;
} else if (user_gpfifo) {
/* from userspace to vidmem or sysmem when pramin forced, use
* the common copy path below */
err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
if (err)
return err;
cpu_src = c->gpfifo.pipe;
} else {
/* from kernel to either sysmem or vidmem, don't need
* copy_from_user so use the common path below */
cpu_src = kern_gpfifo;
}
if (end > gpfifo_size) {
/* wrap-around */
int length0 = gpfifo_size - start;
int length1 = len - length0;
void *src2 = (u8 *)cpu_src + length0;
nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
} else {
nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
}
trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
out:
c->gpfifo.put = (c->gpfifo.put + num_entries) &
(c->gpfifo.entry_num - 1);
return 0;
}
/*
* Handle the submit synchronization - pre-fences and post-fences.
*/
static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
struct nvgpu_fence *fence,
struct channel_gk20a_job *job,
struct priv_cmd_entry **wait_cmd,
struct priv_cmd_entry **incr_cmd,
struct gk20a_fence **pre_fence,
struct gk20a_fence **post_fence,
bool force_need_sync_fence,
bool register_irq,
u32 flags)
{
struct gk20a *g = c->g;
bool need_sync_fence = false;
bool new_sync_created = false;
int wait_fence_fd = -1;
int err = 0;
bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
/*
* If user wants to always allocate sync_fence_fds then respect that;
* otherwise, allocate sync_fence_fd based on user flags.
*/
if (force_need_sync_fence)
need_sync_fence = true;
if (g->aggressive_sync_destroy_thresh) {
nvgpu_mutex_acquire(&c->sync_lock);
if (!c->sync) {
c->sync = gk20a_channel_sync_create(c);
if (!c->sync) {
err = -ENOMEM;
nvgpu_mutex_release(&c->sync_lock);
goto fail;
}
new_sync_created = true;
}
nvgpu_atomic_inc(&c->sync->refcount);
nvgpu_mutex_release(&c->sync_lock);
}
if (g->ops.fifo.resetup_ramfc && new_sync_created) {
err = g->ops.fifo.resetup_ramfc(c);
if (err)
goto fail;
}
/*
* Optionally insert syncpt wait in the beginning of gpfifo submission
* when user requested and the wait hasn't expired. Validate that the id
* makes sense, elide if not. The only reason this isn't being
* unceremoniously killed is to keep running some tests which trigger
* this condition.
*/
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
job->pre_fence = gk20a_alloc_fence(c);
if (!job->pre_fence) {
err = -ENOMEM;
goto fail;
}
if (!pre_alloc_enabled)
job->wait_cmd = nvgpu_kzalloc(g,
sizeof(struct priv_cmd_entry));
if (!job->wait_cmd) {
err = -ENOMEM;
goto clean_up_pre_fence;
}
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
wait_fence_fd = fence->id;
err = c->sync->wait_fd(c->sync, wait_fence_fd,
job->wait_cmd, job->pre_fence);
} else {
err = c->sync->wait_syncpt(c->sync, fence->id,
fence->value, job->wait_cmd,
job->pre_fence);
}
if (!err) {
if (job->wait_cmd->valid)
*wait_cmd = job->wait_cmd;
*pre_fence = job->pre_fence;
} else
goto clean_up_wait_cmd;
}
if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
need_sync_fence = true;
/*
* Always generate an increment at the end of a GPFIFO submission. This
* is used to keep track of method completion for idle railgating. The
* sync_pt/semaphore PB is added to the GPFIFO later on in submit.
*/
job->post_fence = gk20a_alloc_fence(c);
if (!job->post_fence) {
err = -ENOMEM;
goto clean_up_wait_cmd;
}
if (!pre_alloc_enabled)
job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
if (!job->incr_cmd) {
err = -ENOMEM;
goto clean_up_post_fence;
}
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
job->post_fence, need_wfi, need_sync_fence,
register_irq);
else
err = c->sync->incr(c->sync, job->incr_cmd,
job->post_fence, need_sync_fence,
register_irq);
if (!err) {
*incr_cmd = job->incr_cmd;
*post_fence = job->post_fence;
} else
goto clean_up_incr_cmd;
return 0;
clean_up_incr_cmd:
free_priv_cmdbuf(c, job->incr_cmd);
if (!pre_alloc_enabled)
job->incr_cmd = NULL;
clean_up_post_fence:
gk20a_fence_put(job->post_fence);
job->post_fence = NULL;
clean_up_wait_cmd:
free_priv_cmdbuf(c, job->wait_cmd);
if (!pre_alloc_enabled)
job->wait_cmd = NULL;
clean_up_pre_fence:
gk20a_fence_put(job->pre_fence);
job->pre_fence = NULL;
fail:
*wait_cmd = NULL;
*pre_fence = NULL;
return err;
}
int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_gpfifo *gpfifo,
struct nvgpu_submit_gpfifo_args *args,
u32 num_entries,
u32 flags,
struct nvgpu_fence *fence,
struct gk20a_fence **fence_out,
bool force_need_sync_fence,
struct fifo_profile_gk20a *profile)
{
struct gk20a *g = c->g;
struct priv_cmd_entry *wait_cmd = NULL;
struct priv_cmd_entry *incr_cmd = NULL;
struct gk20a_fence *pre_fence = NULL;
struct gk20a_fence *post_fence = NULL;
struct channel_gk20a_job *job = NULL;
/* we might need two extra gpfifo entries - one for pre fence
* and one for post fence. */
const int extra_entries = 2;
bool skip_buffer_refcounting = (flags &
NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
int err = 0;
bool need_job_tracking;
bool need_deferred_cleanup = false;
struct nvgpu_gpfifo __user *user_gpfifo = args ?
(struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
return -ENODEV;
if (c->has_timedout)
return -ETIMEDOUT;
if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
return -ENOMEM;
/* fifo not large enough for request. Return error immediately.
* Kernel can insert gpfifo entries before and after user gpfifos.
* So, add extra_entries in user request. Also, HW with fifo size N
* can accept only N-1 entreis and so the below condition */
if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
nvgpu_err(g, "not enough gpfifo space allocated");
return -ENOMEM;
}
if (!gpfifo && !args)
return -EINVAL;
if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
!fence)
return -EINVAL;
/* an address space needs to have been bound at this point. */
if (!gk20a_channel_as_bound(c)) {
nvgpu_err(g,
"not bound to an address space at time of gpfifo"
" submission.");
return -EINVAL;
}
if (profile)
profile->timestamp[PROFILE_ENTRY] = sched_clock();
/* update debug settings */
nvgpu_ltc_sync_enabled(g);
gk20a_dbg_info("channel %d", c->chid);
/*
* Job tracking is necessary for any of the following conditions:
* - pre- or post-fence functionality
* - channel wdt
* - GPU rail-gating with non-deterministic channels
* - buffer refcounting
*
* If none of the conditions are met, then job tracking is not
* required and a fast submit can be done (ie. only need to write
* out userspace GPFIFO entries and update GP_PUT).
*/
need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
c->wdt_enabled ||
(g->can_railgate && !c->deterministic) ||
!skip_buffer_refcounting;
if (need_job_tracking) {
bool need_sync_framework = false;
/*
* If the channel is to have deterministic latency and
* job tracking is required, the channel must have
* pre-allocated resources. Otherwise, we fail the submit here
*/
if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
return -EINVAL;
need_sync_framework = force_need_sync_fence ||
gk20a_channel_sync_needs_sync_framework(g) ||
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
/*
* Deferred clean-up is necessary for any of the following
* conditions:
* - channel's deterministic flag is not set
* - dependency on sync framework, which could make the
* behavior of the clean-up operation non-deterministic
* (should not be performed in the submit path)
* - channel wdt
* - GPU rail-gating with non-deterministic channels
* - buffer refcounting
*
* If none of the conditions are met, then deferred clean-up
* is not required, and we clean-up one job-tracking
* resource in the submit path.
*/
need_deferred_cleanup = !c->deterministic ||
need_sync_framework ||
c->wdt_enabled ||
(g->can_railgate &&
!c->deterministic) ||
!skip_buffer_refcounting;
/*
* For deterministic channels, we don't allow deferred clean_up
* processing to occur. In cases we hit this, we fail the submit
*/
if (c->deterministic && need_deferred_cleanup)
return -EINVAL;
if (!c->deterministic) {
/*
* Get a power ref unless this is a deterministic
* channel that holds them during the channel lifetime.
* This one is released by gk20a_channel_clean_up_jobs,
* via syncpt or sema interrupt, whichever is used.
*/
err = gk20a_busy(g);
if (err) {
nvgpu_err(g,
"failed to host gk20a to submit gpfifo, process %s",
current->comm);
return err;
}
}
if (!need_deferred_cleanup) {
/* clean up a single job */
gk20a_channel_clean_up_jobs(c, false);
}
}
/* Grab access to HW to deal with do_idle */
if (c->deterministic)
nvgpu_rwsem_down_read(&g->deterministic_busy);
trace_gk20a_channel_submit_gpfifo(g->name,
c->chid,
num_entries,
flags,
fence ? fence->id : 0,
fence ? fence->value : 0);
gk20a_dbg_info("pre-submit put %d, get %d, size %d",
c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
/*
* Make sure we have enough space for gpfifo entries. Check cached
* values first and then read from HW. If no space, return EAGAIN
* and let userpace decide to re-try request or not.
*/
if (gp_free_count(c) < num_entries + extra_entries) {
if (get_gp_free_count(c) < num_entries + extra_entries) {
err = -EAGAIN;
goto clean_up;
}
}
if (c->has_timedout) {
err = -ETIMEDOUT;
goto clean_up;
}
if (need_job_tracking) {
err = channel_gk20a_alloc_job(c, &job);
if (err)
goto clean_up;
err = gk20a_submit_prepare_syncs(c, fence, job,
&wait_cmd, &incr_cmd,
&pre_fence, &post_fence,
force_need_sync_fence,
need_deferred_cleanup,
flags);
if (err)
goto clean_up_job;
}
if (profile)
profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
if (wait_cmd)
gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
if (gpfifo || user_gpfifo)
err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
num_entries);
if (err)
goto clean_up_job;
/*
* And here's where we add the incr_cmd we generated earlier. It should
* always run!
*/
if (incr_cmd)
gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
if (fence_out)
*fence_out = gk20a_fence_get(post_fence);
if (need_job_tracking)
/* TODO! Check for errors... */
gk20a_channel_add_job(c, job, skip_buffer_refcounting);
if (profile)
profile->timestamp[PROFILE_APPEND] = sched_clock();
g->ops.fifo.userd_gp_put(g, c);
if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) &&
g->ops.fifo.reschedule_runlist)
g->ops.fifo.reschedule_runlist(g, c->runlist_id);
/* No hw access beyond this point */
if (c->deterministic)
nvgpu_rwsem_up_read(&g->deterministic_busy);
trace_gk20a_channel_submitted_gpfifo(g->name,
c->chid,
num_entries,
flags,
post_fence ? post_fence->syncpt_id : 0,
post_fence ? post_fence->syncpt_value : 0);
gk20a_dbg_info("post-submit put %d, get %d, size %d",
c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
if (profile)
profile->timestamp[PROFILE_END] = sched_clock();
gk20a_dbg_fn("done");
return err;
clean_up_job:
channel_gk20a_free_job(c, job);
clean_up:
gk20a_dbg_fn("fail");
gk20a_fence_put(pre_fence);
gk20a_fence_put(post_fence);
if (c->deterministic)
nvgpu_rwsem_up_read(&g->deterministic_busy);
else if (need_deferred_cleanup)
gk20a_idle(g);
return err;
}
/*
* Stop deterministic channel activity for do_idle() when power needs to go off
* momentarily but deterministic channels keep power refs for potentially a


@@ -24,6 +24,9 @@
#ifndef CHANNEL_GK20A_H
#define CHANNEL_GK20A_H
/* TODO: To be removed when work_struct update_fn_work is moved out of common code */
#include <linux/workqueue.h>
#include <linux/stacktrace.h>
#include <nvgpu/list.h>
@@ -374,16 +377,6 @@ struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
int runlist_id,
bool is_privileged_channel);
int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_gpfifo *gpfifo,
struct nvgpu_submit_gpfifo_args *args,
u32 num_entries,
u32 flags,
struct nvgpu_fence *fence,
struct gk20a_fence **fence_out,
bool force_need_sync_fence,
struct fifo_profile_gk20a *profile);
int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
unsigned int num_entries,
unsigned int num_inflight_jobs,
@@ -408,4 +401,20 @@ int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
void gk20a_channel_event_id_post_event(struct channel_gk20a *ch,
u32 event_id);
int channel_gk20a_alloc_job(struct channel_gk20a *c,
struct channel_gk20a_job **job_out);
void channel_gk20a_free_job(struct channel_gk20a *c,
struct channel_gk20a_job *job);
u32 nvgpu_get_gp_free_count(struct channel_gk20a *c);
u32 nvgpu_gp_free_count(struct channel_gk20a *c);
int gk20a_channel_add_job(struct channel_gk20a *c,
struct channel_gk20a_job *job,
bool skip_buffer_refcounting);
void free_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *e);
void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
bool clean_all);
u32 nvgpu_get_gpfifo_entry_size(void);
#endif /* CHANNEL_GK20A_H */