gpu: nvgpu: add hw perfmon buffer mapping ioctls

Map/unmap buffers for HWPM and deal with its instance block, the minimum work required to run the HWPM via regops from userspace. Bug 1517458 Bug 1573150 Change-Id: If14086a88b54bf434843d7c2fee8a9113023a3b0 Signed-off-by: Konsta Holtta <kholtta@nvidia.com> Reviewed-on: http://git-master/r/673689 Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
2025-12-23 01:50:07 +03:00 · 2015-01-14 14:04:08 +02:00
parent f93a8cc36b
commit 3877adcd65
5 changed files with 256 additions and 3 deletions
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -1,7 +1,7 @@
 /*
 * Tegra GK20A GPU Debugger/Profiler Driver
 *
- * Copyright (c) 2013-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2013-2015, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -29,6 +29,7 @@
 #include "regops_gk20a.h"
 #include "hw_therm_gk20a.h"
 #include "hw_gr_gk20a.h"
 #include "hw_perf_gk20a.h"
 struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = {
 	.exec_reg_ops = exec_regops_gk20a,
@@ -370,6 +371,11 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm(
 		struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args);
 /*
  * Handlers for the NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP/UNMAP ioctls. They are
  * declared here because the ioctl dispatcher below calls them before their
  * definitions appear in this file.
  */
 static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_perfbuf_map_args *args);
 static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args);
 long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 			     unsigned long arg)
@@ -436,6 +442,16 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 		       (struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf);
 		break;
 	case NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP:
 		err = gk20a_perfbuf_map(dbg_s,
 		       (struct nvgpu_dbg_gpu_perfbuf_map_args *)buf);
 		break;
 	case NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP:
 		err = gk20a_perfbuf_unmap(dbg_s,
 		       (struct nvgpu_dbg_gpu_perfbuf_unmap_args *)buf);
 		break;
 	default:
 		gk20a_err(dev_from_gk20a(g),
 			   "unrecognized dbg gpu ioctl cmd: 0x%x",
@@ -775,3 +791,80 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm(
 	mutex_unlock(&g->dbg_sessions_lock);
 	return  err;
 }
 /*
  * Map a dma-buf into the PMU VM and program it as the HWPM output buffer.
  *
  * @dbg_s: debug session; only its GPU pointer (dbg_s->g) is used.
  * @args:  dmabuf_fd and mapping_size are read; offset is written by
  *         gk20a_vm_map_buffer() and then used as the virtual address of
  *         the buffer.
  *
  * Returns 0 on success, -EACCES when g->allow_all is not set, the error
  * from gk20a_vm_map_buffer() when mapping fails, or -EINVAL (after undoing
  * the mapping) when the size does not fit the 32-bit OUTSIZE register or
  * the buffer would end above 4GB.
  */
 static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_perfbuf_map_args *args)
 {
 	struct gk20a *g = dbg_s->g;
 	int err;
 	u32 virt_size;
 	u32 virt_addr_lo;
 	u32 virt_addr_hi;
 	u32 inst_pa_page;
 	if (!g->allow_all)
 		return -EACCES;
 	err = gk20a_vm_map_buffer(&g->mm.pmu.vm,
 			args->dmabuf_fd,
 			&args->offset,
 			0,
 			0,
 			0,
 			args->mapping_size);
 	if (err)
 		return err;
 	/* perf output buffer may not cross a 4GB boundary - with a separate va
 	 * smaller than that, it won't */
 	virt_size = u64_lo32(args->mapping_size);
 	virt_addr_lo = u64_lo32(args->offset);
 	virt_addr_hi = u64_hi32(args->offset);
 	/*
 	 * but check anyway. The size is validated at its full 64-bit width: a
 	 * mapping_size with any of the upper 32 bits set would otherwise be
 	 * silently truncated into virt_size and slip past the range check.
 	 * Comparing against SZ_4G - virt_size (never below 1, as virt_size is
 	 * a u32) keeps the test free of u64 wraparound.
 	 */
 	if (u64_hi32(args->mapping_size) != 0 ||
 	    args->offset > SZ_4G - virt_size) {
 		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
 		return -EINVAL;
 	}
 	/* address and size are aligned to 32 bytes, the lowest bits read back
 	 * as zeros */
 	gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
 	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
 			perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
 	gk20a_writel(g, perf_pmasys_outsize_r(), virt_size);
 	/* this field is aligned to 4K */
 	inst_pa_page = g->mm.hwpm.inst_block.cpu_pa >> 12;
 	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
 	 * should be written last */
 	gk20a_writel(g, perf_pmasys_mem_block_r(),
 			perf_pmasys_mem_block_base_f(inst_pa_page) |
 			perf_pmasys_mem_block_valid_true_f() |
 			perf_pmasys_mem_block_target_lfb_f());
 	return 0;
 }
 static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
 {
 	struct gk20a *g = dbg_s->g;
 	if (!g->allow_all)
 		return -EACCES;
 	gk20a_writel(g, perf_pmasys_outbase_r(), 0);
 	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
 			perf_pmasys_outbaseupper_ptr_f(0));
 	gk20a_writel(g, perf_pmasys_outsize_r(), 0);
 	gk20a_writel(g, perf_pmasys_mem_block_r(),
 			perf_pmasys_mem_block_base_f(0) |
 			perf_pmasys_mem_block_valid_false_f() |
 			perf_pmasys_mem_block_target_f(0));
 	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
 	return 0;
 }
--- a/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
@@ -0,0 +1,117 @@
 /*
 * Copyright (c) 2015, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 /*
 * Function naming determines intended use:
 *
 *     <x>_r(void) : Returns the offset for register <x>.
 *
 *     <x>_o(void) : Returns the offset for element <x>.
 *
 *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
 *
 *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
 *
 *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
 *         and masked to place it at field <y> of register <x>.  This value
 *         can be |'d with others to produce a full register value for
 *         register <x>.
 *
 *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
 *         value can be ~'d and then &'d to clear the value of field <y> for
 *         register <x>.
 *
 *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
 *         to place it at field <y> of register <x>.  This value can be |'d
 *         with others to produce a full register value for <x>.
 *
 *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
 *         <x> value 'r' after being shifted to place its LSB at bit 0.
 *         This value is suitable for direct comparison with other unshifted
 *         values appropriate for use in field <y> of register <x>.
 *
 *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
 *         field <y> of register <x>.  This value is suitable for direct
 *         comparison with unshifted values appropriate for use in field <y>
 *         of register <x>.
 */
 #ifndef _hw_perf_gk20a_h_
 #define _hw_perf_gk20a_h_
 /* Offset of register perf_pmasys_mem_block. */
 static inline u32 perf_pmasys_mem_block_r(void)
 {
 	return 0x001b4070U;
 }
 /* Place 'v' in the base field (bits 27:0) of perf_pmasys_mem_block. */
 static inline u32 perf_pmasys_mem_block_base_f(u32 v)
 {
 	/* the field starts at bit 0, so masking alone positions it */
 	return v & 0x0fffffffU;
 }
 /* Place 'v' in the target field (bits 29:28) of perf_pmasys_mem_block. */
 static inline u32 perf_pmasys_mem_block_target_f(u32 v)
 {
 	const u32 target = v & 0x3U;
 	return target << 28;
 }
 /* Extract the target field (bits 29:28) from register value 'r'. */
 static inline u32 perf_pmasys_mem_block_target_v(u32 r)
 {
 	return (r & 0x30000000U) >> 28;
 }
 /* Unshifted constant 'lfb' of the perf_pmasys_mem_block target field. */
 static inline u32 perf_pmasys_mem_block_target_lfb_v(void)
 {
 	return 0x00000000U;
 }
 /* Constant 'lfb' shifted into the perf_pmasys_mem_block target field. */
 static inline u32 perf_pmasys_mem_block_target_lfb_f(void)
 {
 	return 0x0U;
 }
 /* Place 'v' in the valid field (bit 31) of perf_pmasys_mem_block. */
 static inline u32 perf_pmasys_mem_block_valid_f(u32 v)
 {
 	const u32 valid = v & 0x1U;
 	return valid << 31;
 }
 /* Extract the valid field (bit 31) from register value 'r'. */
 static inline u32 perf_pmasys_mem_block_valid_v(u32 r)
 {
 	return (r & 0x80000000U) >> 31;
 }
 /* Unshifted constant 'true' of the perf_pmasys_mem_block valid field. */
 static inline u32 perf_pmasys_mem_block_valid_true_v(void)
 {
 	return 0x00000001U;
 }
 /* Constant 'true' shifted into the valid field (bit 31). */
 static inline u32 perf_pmasys_mem_block_valid_true_f(void)
 {
 	return 0x80000000U;
 }
 /* Unshifted constant 'false' of the perf_pmasys_mem_block valid field. */
 static inline u32 perf_pmasys_mem_block_valid_false_v(void)
 {
 	return 0x00000000U;
 }
 /* Constant 'false' shifted into the perf_pmasys_mem_block valid field. */
 static inline u32 perf_pmasys_mem_block_valid_false_f(void)
 {
 	return 0x0U;
 }
 /* Offset of register perf_pmasys_outbase. */
 static inline u32 perf_pmasys_outbase_r(void)
 {
 	return 0x001b4074U;
 }
 /* Offset of register perf_pmasys_outbaseupper. */
 static inline u32 perf_pmasys_outbaseupper_r(void)
 {
 	return 0x001b4078U;
 }
 /* Place 'v' in the ptr field (bits 7:0) of perf_pmasys_outbaseupper. */
 static inline u32 perf_pmasys_outbaseupper_ptr_f(u32 v)
 {
 	/* the field starts at bit 0, so masking alone positions it */
 	return v & 0xffU;
 }
 /* Offset of register perf_pmasys_outsize. */
 static inline u32 perf_pmasys_outsize_r(void)
 {
 	return 0x001b407cU;
 }
 #endif
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -101,6 +101,7 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 				   int rw_flag);
 static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
 /* allocates and initializes the HWPM instance block; called from
  * gk20a_init_mm_setup_sw() */
 static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
 struct gk20a_dmabuf_priv {
@@ -280,6 +281,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 {
 	gk20a_remove_vm(&mm->bar1.vm, &mm->bar1.inst_block);
 	gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
 	gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
 }
 int gk20a_init_mm_setup_sw(struct gk20a *g)
@@ -315,6 +317,10 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 	if (err)
 		return err;
 	err = gk20a_init_hwpm(mm);
 	if (err)
 		return err;
 	/* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
 	g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
 	mm->remove_support = gk20a_remove_mm_support;
@@ -2720,6 +2726,21 @@ clean_up_va:
 	return err;
 }
 /*
  * Allocate the HWPM instance block (mm->hwpm.inst_block) and initialize it
  * against the PMU VM, which HWPM shares.
  *
  * Returns 0 on success or the error from gk20a_alloc_inst_block().
  */
 static int gk20a_init_hwpm(struct mm_gk20a *mm)
 {
 	struct inst_desc *hwpm_inst = &mm->hwpm.inst_block;
 	struct vm_gk20a *pmu_vm = &mm->pmu.vm;
 	struct gk20a *gpu = gk20a_from_mm(mm);
 	int ret = gk20a_alloc_inst_block(gpu, hwpm_inst);
 	if (ret != 0)
 		return ret;
 	/* big_page_size is passed as 0 here */
 	gk20a_init_inst_block(hwpm_inst, pmu_vm, 0);
 	return 0;
 }
 void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm,
 		u32 big_page_size)
 {
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -342,6 +342,12 @@ struct mm_gk20a {
 		struct inst_desc inst_block;
 	} pmu;
 	struct {
 		/* using pmu vm currently */
 		struct inst_desc inst_block;
 	} hwpm;
 	struct mutex l2_op_lock;
 	void (*remove_support)(struct mm_gk20a *mm);
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -432,10 +432,26 @@ struct nvgpu_dbg_gpu_suspend_resume_all_sms_args {
 #define NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS			\
 	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 6, struct nvgpu_dbg_gpu_suspend_resume_all_sms_args)
 /*
  * Arguments of NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP: maps a dma-buf and programs
  * it as the HWPM output buffer.
  */
 struct nvgpu_dbg_gpu_perfbuf_map_args {
 	__u32 dmabuf_fd;	/* in, fd of the dma-buf to map */
 	__u32 reserved;		/* not read by the handler; presumably padding - TODO confirm */
 	__u64 mapping_size;	/* in, size of mapped buffer region */
 	__u64 offset;		/* out, virtual address of the mapping */
 };
 /* Arguments of NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP. */
 struct nvgpu_dbg_gpu_perfbuf_unmap_args {
 	/* in, address of the mapping to remove; expected to be the offset
 	 * returned by NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP */
 	__u64 offset;
 };
 /* map a dma-buf as the HWPM output buffer; the mapping address is returned
  * in the offset field of the argument struct */
 #define NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP \
 	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 7, struct nvgpu_dbg_gpu_perfbuf_map_args)
 /* clear the HWPM output buffer registers and unmap the buffer at offset */
 #define NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP \
 	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 8, struct nvgpu_dbg_gpu_perfbuf_unmap_args)
 /* number of the last dbg gpu ioctl defined above; update it when a new
  * ioctl is appended */
 #define NVGPU_DBG_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS)
+	_IOC_NR(NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP)
 /* presumably the size of the largest dbg gpu ioctl argument struct, which
  * the name suggests - TODO confirm against the other argument structs */
 #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE		\
-	sizeof(struct nvgpu_dbg_gpu_exec_reg_ops_args)
+	sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args)
 /*
 * /dev/nvhost-gpu device