gpu: nvgpu: add hw perfmon buffer mapping ioctls

Map/unmap buffers for HWPM and deal with its instance block, the minimum
work required to run the HWPM via regops from userspace.

Bug 1517458
Bug 1573150

Change-Id: If14086a88b54bf434843d7c2fee8a9113023a3b0
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/673689
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
Konsta Holtta
2015-01-14 14:04:08 +02:00
committed by Dan Willemsen
parent f93a8cc36b
commit 3877adcd65
5 changed files with 256 additions and 3 deletions

View File

@@ -1,7 +1,7 @@
/* /*
* Tegra GK20A GPU Debugger/Profiler Driver * Tegra GK20A GPU Debugger/Profiler Driver
* *
* Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2013-2015, NVIDIA CORPORATION. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -29,6 +29,7 @@
#include "regops_gk20a.h" #include "regops_gk20a.h"
#include "hw_therm_gk20a.h" #include "hw_therm_gk20a.h"
#include "hw_gr_gk20a.h" #include "hw_gr_gk20a.h"
#include "hw_perf_gk20a.h"
struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = { struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = {
.exec_reg_ops = exec_regops_gk20a, .exec_reg_ops = exec_regops_gk20a,
@@ -370,6 +371,11 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm(
struct dbg_session_gk20a *dbg_s, struct dbg_session_gk20a *dbg_s,
struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args); struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args);
/*
 * Forward declarations for the HWPM perf buffer ioctl handlers; both are
 * dispatched from gk20a_dbg_gpu_dev_ioctl() and defined later in this file.
 */
static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
struct nvgpu_dbg_gpu_perfbuf_map_args *args);
static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
struct nvgpu_dbg_gpu_perfbuf_unmap_args *args);
long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg) unsigned long arg)
@@ -436,6 +442,16 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
(struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf); (struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf);
break; break;
case NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP:
err = gk20a_perfbuf_map(dbg_s,
(struct nvgpu_dbg_gpu_perfbuf_map_args *)buf);
break;
case NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP:
err = gk20a_perfbuf_unmap(dbg_s,
(struct nvgpu_dbg_gpu_perfbuf_unmap_args *)buf);
break;
default: default:
gk20a_err(dev_from_gk20a(g), gk20a_err(dev_from_gk20a(g),
"unrecognized dbg gpu ioctl cmd: 0x%x", "unrecognized dbg gpu ioctl cmd: 0x%x",
@@ -775,3 +791,80 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm(
mutex_unlock(&g->dbg_sessions_lock); mutex_unlock(&g->dbg_sessions_lock);
return err; return err;
} }
/*
 * Handler for NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP.
 *
 * Maps the dma-buf named by args->dmabuf_fd into the PMU vm, programs the
 * resulting virtual address and size into the perf_pmasys output buffer
 * registers and finally binds the HWPM instance block.
 *
 * @dbg_s: debugger session the ioctl was issued on
 * @args:  in: dmabuf_fd, mapping_size; out: offset (virtual address that
 *         gk20a_vm_map_buffer() stored for the mapping)
 *
 * Returns 0 on success, -EACCES when g->allow_all is not set, -EINVAL when
 * mapping_size does not fit in 32 bits or the mapping would end above 4GB,
 * or the error code returned by gk20a_vm_map_buffer().
 */
static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
		struct nvgpu_dbg_gpu_perfbuf_map_args *args)
{
	struct gk20a *g = dbg_s->g;
	int err;
	u32 virt_size;
	u32 virt_addr_lo;
	u32 virt_addr_hi;
	u32 inst_pa_page;

	if (!g->allow_all)
		return -EACCES;

	/* The size is narrowed to 32 bits before it is range-checked and
	 * written to perf_pmasys_outsize_r() below. Reject anything that
	 * would be truncated up front, so the value handed to the mapper and
	 * the value programmed into the hardware can never disagree. */
	if (u64_hi32(args->mapping_size))
		return -EINVAL;

	/* NOTE(review): the three zero arguments are presumably flags, kind
	 * and buffer offset - confirm against gk20a_vm_map_buffer() */
	err = gk20a_vm_map_buffer(&g->mm.pmu.vm,
			args->dmabuf_fd,
			&args->offset,
			0,
			0,
			0,
			args->mapping_size);
	if (err)
		return err;

	/* perf output buffer may not cross a 4GB boundary - with a separate va
	 * smaller than that, it won't */
	virt_size = u64_lo32(args->mapping_size);
	virt_addr_lo = u64_lo32(args->offset);
	virt_addr_hi = u64_hi32(args->offset);

	/* but check anyway, and undo the mapping if it does */
	if (args->offset + virt_size > SZ_4G) {
		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
		return -EINVAL;
	}

	/* address and size are aligned to 32 bytes, the lowest bits read back
	 * as zeros */
	gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
			perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
	gk20a_writel(g, perf_pmasys_outsize_r(), virt_size);

	/* this field is aligned to 4K */
	inst_pa_page = g->mm.hwpm.inst_block.cpu_pa >> 12;

	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
	 * should be written last */
	gk20a_writel(g, perf_pmasys_mem_block_r(),
			perf_pmasys_mem_block_base_f(inst_pa_page) |
			perf_pmasys_mem_block_valid_true_f() |
			perf_pmasys_mem_block_target_lfb_f());

	return 0;
}
static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
{
struct gk20a *g = dbg_s->g;
if (!g->allow_all)
return -EACCES;
gk20a_writel(g, perf_pmasys_outbase_r(), 0);
gk20a_writel(g, perf_pmasys_outbaseupper_r(),
perf_pmasys_outbaseupper_ptr_f(0));
gk20a_writel(g, perf_pmasys_outsize_r(), 0);
gk20a_writel(g, perf_pmasys_mem_block_r(),
perf_pmasys_mem_block_base_f(0) |
perf_pmasys_mem_block_valid_false_f() |
perf_pmasys_mem_block_target_f(0));
gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
return 0;
}

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Function naming determines intended use:
*
* <x>_r(void) : Returns the offset for register <x>.
*
* <x>_o(void) : Returns the offset for element <x>.
*
* <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
*
* <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
*
* <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
* and masked to place it at field <y> of register <x>. This value
* can be |'d with others to produce a full register value for
* register <x>.
*
* <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
* value can be ~'d and then &'d to clear the value of field <y> for
* register <x>.
*
* <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
* to place it at field <y> of register <x>. This value can be |'d
* with others to produce a full register value for <x>.
*
* <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
* <x> value 'r' after being shifted to place its LSB at bit 0.
* This value is suitable for direct comparison with other unshifted
* values appropriate for use in field <y> of register <x>.
*
* <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
* field <y> of register <x>. This value is suitable for direct
* comparison with unshifted values appropriate for use in field <y>
* of register <x>.
*/
#ifndef _hw_perf_gk20a_h_
#define _hw_perf_gk20a_h_
/* Offset of register perf_pmasys_mem_block. */
static inline u32 perf_pmasys_mem_block_r(void)
{
	return 0x001b4070U;
}

/* Place 'v' into the base field (bits 27:0) of perf_pmasys_mem_block. */
static inline u32 perf_pmasys_mem_block_base_f(u32 v)
{
	return v & 0x0fffffffU;
}

/* Place 'v' into the target field (bits 29:28) of perf_pmasys_mem_block. */
static inline u32 perf_pmasys_mem_block_target_f(u32 v)
{
	return (v << 28) & 0x30000000U;
}

/* Extract the target field from a full perf_pmasys_mem_block value 'r'. */
static inline u32 perf_pmasys_mem_block_target_v(u32 r)
{
	return (r & 0x30000000U) >> 28;
}

/* Unshifted constant for target == lfb. */
static inline u32 perf_pmasys_mem_block_target_lfb_v(void)
{
	return 0x0U;
}

/* Constant for target == lfb, already shifted into field position. */
static inline u32 perf_pmasys_mem_block_target_lfb_f(void)
{
	return 0x00000000U;
}

/* Place 'v' into the valid field (bit 31) of perf_pmasys_mem_block. */
static inline u32 perf_pmasys_mem_block_valid_f(u32 v)
{
	return (v << 31) & 0x80000000U;
}

/* Extract the valid field from a full perf_pmasys_mem_block value 'r'. */
static inline u32 perf_pmasys_mem_block_valid_v(u32 r)
{
	return (r & 0x80000000U) >> 31;
}

/* Unshifted constant for valid == true. */
static inline u32 perf_pmasys_mem_block_valid_true_v(void)
{
	return 0x1U;
}

/* Constant for valid == true, already shifted into field position. */
static inline u32 perf_pmasys_mem_block_valid_true_f(void)
{
	return 0x80000000U;
}

/* Unshifted constant for valid == false. */
static inline u32 perf_pmasys_mem_block_valid_false_v(void)
{
	return 0x0U;
}

/* Constant for valid == false, already shifted into field position. */
static inline u32 perf_pmasys_mem_block_valid_false_f(void)
{
	return 0x00000000U;
}
/* Offset of register perf_pmasys_outbase. */
static inline u32 perf_pmasys_outbase_r(void)
{
	return 0x001b4074U;
}
/* Offset of register perf_pmasys_outbaseupper. */
static inline u32 perf_pmasys_outbaseupper_r(void)
{
	return 0x001b4078U;
}

/* Place 'v' into the ptr field (bits 7:0) of perf_pmasys_outbaseupper. */
static inline u32 perf_pmasys_outbaseupper_ptr_f(u32 v)
{
	return v & 0x000000ffU;
}
/* Offset of register perf_pmasys_outsize. */
static inline u32 perf_pmasys_outsize_r(void)
{
	return 0x001b407cU;
}
#endif

View File

@@ -101,6 +101,7 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
int rw_flag); int rw_flag);
static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm); static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm); static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
struct gk20a_dmabuf_priv { struct gk20a_dmabuf_priv {
@@ -280,6 +281,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
{ {
gk20a_remove_vm(&mm->bar1.vm, &mm->bar1.inst_block); gk20a_remove_vm(&mm->bar1.vm, &mm->bar1.inst_block);
gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block); gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
} }
int gk20a_init_mm_setup_sw(struct gk20a *g) int gk20a_init_mm_setup_sw(struct gk20a *g)
@@ -315,6 +317,10 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
if (err) if (err)
return err; return err;
err = gk20a_init_hwpm(mm);
if (err)
return err;
/* set vm_alloc_share op here as gk20a_as_alloc_share needs it */ /* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share; g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
mm->remove_support = gk20a_remove_mm_support; mm->remove_support = gk20a_remove_mm_support;
@@ -2720,6 +2726,21 @@ clean_up_va:
return err; return err;
} }
/*
 * Allocate the HWPM instance block and initialise it against the PMU vm
 * with a big page size argument of 0.
 *
 * Returns 0 on success or the error from gk20a_alloc_inst_block().
 */
static int gk20a_init_hwpm(struct mm_gk20a *mm)
{
	struct inst_desc *hwpm_inst = &mm->hwpm.inst_block;
	int ret;

	ret = gk20a_alloc_inst_block(gk20a_from_mm(mm), hwpm_inst);
	if (!ret)
		/* hwpm has no vm of its own, it shares the pmu one */
		gk20a_init_inst_block(hwpm_inst, &mm->pmu.vm, 0);

	return ret;
}
void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm, void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm,
u32 big_page_size) u32 big_page_size)
{ {

View File

@@ -342,6 +342,12 @@ struct mm_gk20a {
struct inst_desc inst_block; struct inst_desc inst_block;
} pmu; } pmu;
struct {
/* using pmu vm currently */
struct inst_desc inst_block;
} hwpm;
struct mutex l2_op_lock; struct mutex l2_op_lock;
void (*remove_support)(struct mm_gk20a *mm); void (*remove_support)(struct mm_gk20a *mm);

View File

@@ -432,10 +432,26 @@ struct nvgpu_dbg_gpu_suspend_resume_all_sms_args {
#define NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS \ #define NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS \
_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 6, struct nvgpu_dbg_gpu_suspend_resume_all_sms_args) _IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 6, struct nvgpu_dbg_gpu_suspend_resume_all_sms_args)
/*
 * Argument block for NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP: maps a dma-buf as the
 * HWPM output buffer. The kernel handler rejects mappings that would end
 * above 4GB.
 */
struct nvgpu_dbg_gpu_perfbuf_map_args {
__u32 dmabuf_fd; /* in, fd of the dma-buf to map */
__u32 reserved; /* not read by the handler in this change */
__u64 mapping_size; /* in, size of mapped buffer region */
__u64 offset; /* out, virtual address of the mapping */
};
/* Argument block for NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP. */
struct nvgpu_dbg_gpu_perfbuf_unmap_args {
/* in, virtual address of the mapping to remove; presumably the offset
 * returned by NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP - confirm with callers */
__u64 offset;
};
/* Map a dma-buf as the HWPM output buffer; offset is returned in the args. */
#define NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP \
_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 7, struct nvgpu_dbg_gpu_perfbuf_map_args)
/* Disable the HWPM output buffer and unmap it. Declared _IOWR although the
 * handler only reads the args; the direction is part of the ioctl number. */
#define NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP \
_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 8, struct nvgpu_dbg_gpu_perfbuf_unmap_args)
#define NVGPU_DBG_GPU_IOCTL_LAST \ #define NVGPU_DBG_GPU_IOCTL_LAST \
_IOC_NR(NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS) _IOC_NR(NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP)
#define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \ #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_dbg_gpu_exec_reg_ops_args) sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args)
/* /*
* /dev/nvhost-gpu device * /dev/nvhost-gpu device