gpu: nvgpu: ModeE perfbuffer feature development

perfbuf mapping fixes:
- Allocate a VM specifically for perfbuf use: reusing the PMU's VM
  results in MMU faults for larger buffers, where 64KB pages are used.
- Make the 4GB boundary check work for large address spaces (see the
  worked example after this list).
- Remove the requirement to have the allow_all flag set.
- Track perfbuf ownership and release the buffer when the owning
  session closes.
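
Worked example of the boundary-check fix (editor's sketch, not part of
the commit; SZ_4G and u64_hi32() mirror the kernel definitions): the old
test rejected any mapping ending above 4GB, even one that crosses no 4GB
boundary, while the new test only rejects mappings whose start and end
differ in their upper 32 bits.

	#include <stdint.h>
	#include <stdio.h>

	#define SZ_4G (1ULL << 32)
	static uint32_t u64_hi32(uint64_t v) { return (uint32_t)(v >> 32); }

	int main(void)
	{
		/* A 4KB buffer mapped at 4.5GB: it crosses no 4GB boundary,
		 * but the old check rejects anything that ends above 4GB. */
		uint64_t offset = 0x120000000ULL;	/* 4.5GB */
		uint64_t size   = 0x1000ULL;		/* 4KB */

		int old_rejects = (offset + size > SZ_4G);		/* 1 */
		int new_rejects = (u64_hi32(offset) !=
				   u64_hi32(offset + size));		/* 0 */

		printf("old rejects: %d, new rejects: %d\n",
		       old_rejects, new_rejects);
		return 0;
	}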

Bug 1880196
JIRA EVLR-1074

Change-Id: Ieee4eb17b64acf9b6ede37bf8e6a91892cda4a7e
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: http://git-master/r/1460809
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:    Peter Daifuku
Date:      2017-04-06 16:39:30 -07:00
Committer: mobile promotions
Parent:    6df49a63ca
Commit:    0d8f5f3fdb
3 changed files with 98 additions and 24 deletions

drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c

@@ -514,6 +514,8 @@ static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s,
 	return err;
 }
 
+static int gk20a_perfbuf_release_locked(struct gk20a *g, u64 offset);
+
 int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 {
 	struct dbg_session_gk20a *dbg_s = filp->private_data;
@@ -534,6 +536,10 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 			NVGPU_DBG_GPU_POWERGATE_MODE_ENABLE);
 	nvgpu_dbg_timeout_enable(dbg_s, NVGPU_DBG_GPU_IOCTL_TIMEOUT_ENABLE);
 
+	/* If this session owned the perf buffer, release it */
+	if (g->perfbuf.owner == dbg_s)
+		gk20a_perfbuf_release_locked(g, g->perfbuf.offset);
+
 	/* Per-context profiler objects were released when we called
 	 * dbg_unbind_all_channels. We could still have global ones.
 	 */
@@ -1821,16 +1827,39 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_perfbuf_map_args *args)
 {
 	struct gk20a *g = dbg_s->g;
+	struct mm_gk20a *mm = &g->mm;
+	struct vm_gk20a *vm = &mm->perfbuf.vm;
 	int err;
 	u32 virt_size;
 	u32 virt_addr_lo;
 	u32 virt_addr_hi;
 	u32 inst_pa_page;
+	u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
 
-	if (!g->allow_all)
-		return -EACCES;
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	err = gk20a_vm_map_buffer(&g->mm.pmu.vm,
+	if (g->perfbuf.owner) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return -EBUSY;
+	}
+
+	err = gk20a_init_vm(mm, vm, big_page_size,
+			big_page_size << 10,
+			NV_MM_DEFAULT_KERNEL_SIZE,
+			NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE,
+			false, false, "perfbuf");
+	if (err) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return err;
+	}
+
+	err = gk20a_alloc_inst_block(g, &mm->perfbuf.inst_block);
+	if (err)
+		goto err_remove_vm;
+
+	g->ops.mm.init_inst_block(&mm->perfbuf.inst_block, vm, 0);
+
+	err = gk20a_vm_map_buffer(vm,
 			args->dmabuf_fd,
 			&args->offset,
 			0,
@@ -1839,23 +1868,21 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 			args->mapping_size,
 			NULL);
 	if (err)
-		return err;
+		goto err_remove_vm;
 
-	/* perf output buffer may not cross a 4GB boundary - with a separate va
-	 * smaller than that, it won't */
+	/* perf output buffer may not cross a 4GB boundary */
 	virt_size = u64_lo32(args->mapping_size);
 	virt_addr_lo = u64_lo32(args->offset);
 	virt_addr_hi = u64_hi32(args->offset);
-	/* but check anyway */
-	if (args->offset + virt_size > SZ_4G) {
+	if (u64_hi32(args->offset) != u64_hi32(args->offset + virt_size)) {
 		err = -EINVAL;
-		goto fail_unmap;
+		goto err_unmap;
 	}
 
 	err = gk20a_busy(g);
 	if (err) {
 		nvgpu_err(g, "failed to poweron");
-		goto fail_unmap;
+		goto err_unmap;
 	}
 
 	/* address and size are aligned to 32 bytes, the lowest bits read back
@@ -1866,7 +1893,8 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 	gk20a_writel(g, perf_pmasys_outsize_r(), virt_size);
 
 	/* this field is aligned to 4K */
-	inst_pa_page = gk20a_mm_inst_block_addr(g, &g->mm.hwpm.inst_block) >> 12;
+	inst_pa_page = gk20a_mm_inst_block_addr(g,
+					&mm->perfbuf.inst_block) >> 12;
 
 	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
 	 * should be written last */
@@ -1877,23 +1905,24 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 
 	gk20a_idle(g);
 
+	g->perfbuf.owner = dbg_s;
+	g->perfbuf.offset = args->offset;
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+
 	return 0;
 
-fail_unmap:
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
+err_unmap:
+	gk20a_vm_unmap_buffer(vm, args->offset, NULL);
+err_remove_vm:
+	gk20a_remove_vm(vm, &mm->perfbuf.inst_block);
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
 	return err;
 }
 
-static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
-		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
+/* must be called with dbg_sessions_lock held */
+static int gk20a_perfbuf_disable_locked(struct gk20a *g)
 {
-	struct gk20a *g = dbg_s->g;
-	int err;
-
-	if (!g->allow_all)
-		return -EACCES;
-
-	err = gk20a_busy(g);
+	int err = gk20a_busy(g);
 	if (err) {
 		nvgpu_err(g, "failed to poweron");
 		return err;
@@ -1911,11 +1940,45 @@ static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 
 	gk20a_idle(g);
 
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
-
 	return 0;
 }
 
+static int gk20a_perfbuf_release_locked(struct gk20a *g, u64 offset)
+{
+	struct mm_gk20a *mm = &g->mm;
+	struct vm_gk20a *vm = &mm->perfbuf.vm;
+	int err;
+
+	err = gk20a_perfbuf_disable_locked(g);
+
+	gk20a_vm_unmap_buffer(vm, offset, NULL);
+	gk20a_remove_vm(vm, &mm->perfbuf.inst_block);
+
+	g->perfbuf.owner = NULL;
+	g->perfbuf.offset = 0;
+
+	return err;
+}
+
+static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
+{
+	struct gk20a *g = dbg_s->g;
+	int err;
+
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+	if ((g->perfbuf.owner != dbg_s) ||
+			(g->perfbuf.offset != args->offset)) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return -EINVAL;
+	}
+
+	err = gk20a_perfbuf_release_locked(g, args->offset);
+
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+	return err;
+}
+
 void gk20a_init_dbg_session_ops(struct gpu_ops *gops)
 {
 	gops->dbg_session_ops.exec_reg_ops = exec_regops_gk20a;
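
For context, a minimal user-space sketch of the map/unmap flow this
commit reworks (an assumption-laden illustration, not from the commit:
the ioctl names, args-struct layouts, and the uapi header location are
assumptions based on the nvgpu uapi of this era, the debugger fd and
dmabuf fd are obtained elsewhere, and error handling is elided):

	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include <linux/nvgpu.h>	/* assumed location of the dbg-gpu uapi */

	/* Map a dmabuf as the Mode-E perf output buffer, use it, unmap it. */
	static int perfbuf_roundtrip(int dbg_fd, int dmabuf_fd, __u64 size)
	{
		struct nvgpu_dbg_gpu_perfbuf_map_args map_args = {
			.dmabuf_fd = dmabuf_fd,
			.mapping_size = size,
		};
		struct nvgpu_dbg_gpu_perfbuf_unmap_args unmap_args;

		/* Fails with errno == EBUSY if another session owns the buffer. */
		if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP, &map_args))
			return -1;

		/* ... stream Mode-E records into the buffer mapped at
		 * map_args.offset (the GPU VA chosen by the driver) ... */

		/* Offset must match the mapping, or the driver returns -EINVAL. */
		unmap_args.offset = map_args.offset;
		return ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP, &unmap_args);
	}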

drivers/gpu/nvgpu/gk20a/gk20a.h

@@ -1027,6 +1027,12 @@ struct gk20a {
 	struct nvgpu_dbg_gpu_reg_op *dbg_regops_tmp_buf;
 	u32 dbg_regops_tmp_buf_ops;
 
+	/* For perfbuf mapping */
+	struct {
+		struct dbg_session_gk20a *owner;
+		u64 offset;
+	} perfbuf;
+
 	/* For profiler reservations */
 	struct nvgpu_list_node profiler_objects;
 	bool global_profiler_reservation_held;

drivers/gpu/nvgpu/gk20a/mm_gk20a.h

@@ -361,6 +361,11 @@ struct mm_gk20a {
 		struct nvgpu_mem inst_block;
 	} hwpm;
 
+	struct {
+		struct vm_gk20a vm;
+		struct nvgpu_mem inst_block;
+	} perfbuf;
+
 	struct {
 		struct vm_gk20a vm;
 	} cde;