gpu: nvgpu: ModeE perfbuffer feature development

perfbuf mapping fixes:
- Allocate a VM specifically for perfbuf use: reusing the PMU's VM
  results in MMU faults for larger buffers, where 64KB pages are used.
- Make the 4GB boundary check work for large address spaces (see the
  worked example after this list).
- Remove the requirement to have the allow_all flag set.
- Track perfbuf ownership and release the buffer when the owning
  session closes.
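
Worked example of the boundary-check fix (editor's sketch, not part of
the commit; SZ_4G and u64_hi32() mirror the kernel definitions): the old
test rejected any mapping ending above 4GB, even one that crosses no 4GB
boundary, while the new test only rejects mappings whose start and end
differ in their upper 32 bits.

	#include <stdint.h>
	#include <stdio.h>

	#define SZ_4G (1ULL << 32)
	static uint32_t u64_hi32(uint64_t v) { return (uint32_t)(v >> 32); }

	int main(void)
	{
		/* A 4KB buffer mapped at 4.5GB: it crosses no 4GB boundary,
		 * but the old check rejects anything that ends above 4GB. */
		uint64_t offset = 0x120000000ULL;	/* 4.5GB */
		uint64_t size   = 0x1000ULL;		/* 4KB */

		int old_rejects = (offset + size > SZ_4G);		/* 1 */
		int new_rejects = (u64_hi32(offset) !=
				   u64_hi32(offset + size));		/* 0 */

		printf("old rejects: %d, new rejects: %d\n",
		       old_rejects, new_rejects);
		return 0;
	}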

Bug 1880196
JIRA EVLR-1074

Change-Id: Ieee4eb17b64acf9b6ede37bf8e6a91892cda4a7e
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: http://git-master/r/1460809
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:    Peter Daifuku
Date:      2017-04-06 16:39:30 -07:00
Committer: mobile promotions
Parent:    6df49a63ca
Commit:    0d8f5f3fdb
3 changed files with 98 additions and 24 deletions

drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c

@@ -514,6 +514,8 @@ static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s,
 	return err;
 }
 
+static int gk20a_perfbuf_release_locked(struct gk20a *g, u64 offset);
+
 int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 {
 	struct dbg_session_gk20a *dbg_s = filp->private_data;
@@ -534,6 +536,10 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 			NVGPU_DBG_GPU_POWERGATE_MODE_ENABLE);
 	nvgpu_dbg_timeout_enable(dbg_s, NVGPU_DBG_GPU_IOCTL_TIMEOUT_ENABLE);
 
+	/* If this session owned the perf buffer, release it */
+	if (g->perfbuf.owner == dbg_s)
+		gk20a_perfbuf_release_locked(g, g->perfbuf.offset);
+
 	/* Per-context profiler objects were released when we called
 	 * dbg_unbind_all_channels. We could still have global ones.
 	 */
@@ -1821,16 +1827,39 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_perfbuf_map_args *args)
 {
 	struct gk20a *g = dbg_s->g;
+	struct mm_gk20a *mm = &g->mm;
+	struct vm_gk20a *vm = &mm->perfbuf.vm;
 	int err;
 	u32 virt_size;
 	u32 virt_addr_lo;
 	u32 virt_addr_hi;
 	u32 inst_pa_page;
+	u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
 
-	if (!g->allow_all)
-		return -EACCES;
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	err = gk20a_vm_map_buffer(&g->mm.pmu.vm,
+	if (g->perfbuf.owner) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return -EBUSY;
+	}
+
+	err = gk20a_init_vm(mm, vm, big_page_size,
+			big_page_size << 10,
+			NV_MM_DEFAULT_KERNEL_SIZE,
+			NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE,
+			false, false, "perfbuf");
+	if (err) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return err;
+	}
+
+	err = gk20a_alloc_inst_block(g, &mm->perfbuf.inst_block);
+	if (err)
+		goto err_remove_vm;
+
+	g->ops.mm.init_inst_block(&mm->perfbuf.inst_block, vm, 0);
+
+	err = gk20a_vm_map_buffer(vm,
 			args->dmabuf_fd,
 			&args->offset,
 			0,
@@ -1839,23 +1868,21 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 			args->mapping_size,
 			NULL);
 	if (err)
-		return err;
+		goto err_remove_vm;
 
-	/* perf output buffer may not cross a 4GB boundary - with a separate va
-	 * smaller than that, it won't */
+	/* perf output buffer may not cross a 4GB boundary */
 	virt_size = u64_lo32(args->mapping_size);
 	virt_addr_lo = u64_lo32(args->offset);
 	virt_addr_hi = u64_hi32(args->offset);
-	/* but check anyway */
-	if (args->offset + virt_size > SZ_4G) {
+	if (u64_hi32(args->offset) != u64_hi32(args->offset + virt_size)) {
 		err = -EINVAL;
-		goto fail_unmap;
+		goto err_unmap;
 	}
 
 	err = gk20a_busy(g);
 	if (err) {
 		nvgpu_err(g, "failed to poweron");
-		goto fail_unmap;
+		goto err_unmap;
 	}
 
 	/* address and size are aligned to 32 bytes, the lowest bits read back
@@ -1866,7 +1893,8 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 	gk20a_writel(g, perf_pmasys_outsize_r(), virt_size);
 
 	/* this field is aligned to 4K */
-	inst_pa_page = gk20a_mm_inst_block_addr(g, &g->mm.hwpm.inst_block) >> 12;
+	inst_pa_page = gk20a_mm_inst_block_addr(g,
+					&mm->perfbuf.inst_block) >> 12;
 
 	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
 	 * should be written last */
@@ -1877,23 +1905,24 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 
 	gk20a_idle(g);
 
+	g->perfbuf.owner = dbg_s;
+	g->perfbuf.offset = args->offset;
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+
 	return 0;
 
-fail_unmap:
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
+err_unmap:
+	gk20a_vm_unmap_buffer(vm, args->offset, NULL);
+err_remove_vm:
+	gk20a_remove_vm(vm, &mm->perfbuf.inst_block);
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
 	return err;
 }
 
-static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
-		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
+/* must be called with dbg_sessions_lock held */
+static int gk20a_perfbuf_disable_locked(struct gk20a *g)
 {
-	struct gk20a *g = dbg_s->g;
-	int err;
-
-	if (!g->allow_all)
-		return -EACCES;
-
-	err = gk20a_busy(g);
+	int err = gk20a_busy(g);
 	if (err) {
 		nvgpu_err(g, "failed to poweron");
 		return err;
@@ -1911,11 +1940,45 @@ static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 
 	gk20a_idle(g);
 
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
-
 	return 0;
 }
 
+static int gk20a_perfbuf_release_locked(struct gk20a *g, u64 offset)
+{
+	struct mm_gk20a *mm = &g->mm;
+	struct vm_gk20a *vm = &mm->perfbuf.vm;
+	int err;
+
+	err = gk20a_perfbuf_disable_locked(g);
+
+	gk20a_vm_unmap_buffer(vm, offset, NULL);
+	gk20a_remove_vm(vm, &mm->perfbuf.inst_block);
+
+	g->perfbuf.owner = NULL;
+	g->perfbuf.offset = 0;
+
+	return err;
+}
+
+static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
+{
+	struct gk20a *g = dbg_s->g;
+	int err;
+
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+	if ((g->perfbuf.owner != dbg_s) ||
+			(g->perfbuf.offset != args->offset)) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return -EINVAL;
+	}
+
+	err = gk20a_perfbuf_release_locked(g, args->offset);
+
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+	return err;
+}
+
 void gk20a_init_dbg_session_ops(struct gpu_ops *gops)
 {
 	gops->dbg_session_ops.exec_reg_ops = exec_regops_gk20a;
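
For context, a minimal user-space sketch of the map/unmap flow this
commit reworks (an assumption-laden illustration, not from the commit:
the ioctl names, args-struct layouts, and the uapi header location are
assumptions based on the nvgpu uapi of this era, the debugger fd and
dmabuf fd are obtained elsewhere, and error handling is elided):

	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include <linux/nvgpu.h>	/* assumed location of the dbg-gpu uapi */

	/* Map a dmabuf as the Mode-E perf output buffer, use it, unmap it. */
	static int perfbuf_roundtrip(int dbg_fd, int dmabuf_fd, __u64 size)
	{
		struct nvgpu_dbg_gpu_perfbuf_map_args map_args = {
			.dmabuf_fd = dmabuf_fd,
			.mapping_size = size,
		};
		struct nvgpu_dbg_gpu_perfbuf_unmap_args unmap_args;

		/* Fails with errno == EBUSY if another session owns the buffer. */
		if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP, &map_args))
			return -1;

		/* ... stream Mode-E records into the buffer mapped at
		 * map_args.offset (the GPU VA chosen by the driver) ... */

		/* Offset must match the mapping, or the driver returns -EINVAL. */
		unmap_args.offset = map_args.offset;
		return ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP, &unmap_args);
	}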

drivers/gpu/nvgpu/gk20a/gk20a.h

@@ -1027,6 +1027,12 @@ struct gk20a {
 	struct nvgpu_dbg_gpu_reg_op *dbg_regops_tmp_buf;
 	u32 dbg_regops_tmp_buf_ops;
 
+	/* For perfbuf mapping */
+	struct {
+		struct dbg_session_gk20a *owner;
+		u64 offset;
+	} perfbuf;
+
 	/* For profiler reservations */
 	struct nvgpu_list_node profiler_objects;
 	bool global_profiler_reservation_held;

drivers/gpu/nvgpu/gk20a/mm_gk20a.h

@@ -361,6 +361,11 @@ struct mm_gk20a {
 		struct nvgpu_mem inst_block;
 	} hwpm;
 
+	struct {
+		struct vm_gk20a vm;
+		struct nvgpu_mem inst_block;
+	} perfbuf;
+
 	struct {
 		struct vm_gk20a vm;
 	} cde;