gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation

Add batch support for mapping and unmapping. Batching converts some
per-map/unmap overhead into per-batch overhead, namely the
gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB
invalidates. Batching with size 64 has been measured to yield a >20x
speed-up in low-level fixed-address mapping microbenchmarks.

Bug 1614735
Bug 1623949

Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
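
The description above frames batching as deferring the expensive per-operation
steps (the gk20a_busy()/gk20a_idle() bracketing, GPU L2 flushes, and GPU TLB
invalidates) to the boundaries of a batch. The following is a minimal,
self-contained C sketch of that deferral pattern; the type and helper names
(mapping_batch, batch_start, batch_finish, unmap_buffer) are illustrative
stand-ins rather than the driver's actual API, and only the flush/invalidate
bookkeeping is modelled, not the real mapping work.

#include <stdbool.h>
#include <stdio.h>

/* Batch state accumulated across many map/unmap calls. */
struct mapping_batch {
	bool gpu_l2_flushed;
	bool need_tlb_invalidate;
};

static void batch_start(struct mapping_batch *b)
{
	b->gpu_l2_flushed = false;
	b->need_tlb_invalidate = false;
}

/* Stand-ins for the expensive per-operation steps named in the message. */
static void gpu_l2_flush(void)       { puts("L2 flush"); }
static void gpu_tlb_invalidate(void) { puts("TLB invalidate"); }

static void unmap_buffer(struct mapping_batch *b)
{
	/* ... tear down the GPU VA mapping itself ... */

	if (!b) {
		/* Unbatched call: keep the old pay-per-call behaviour. */
		gpu_l2_flush();
		gpu_tlb_invalidate();
		return;
	}

	if (!b->gpu_l2_flushed) {
		gpu_l2_flush();			/* at most once per batch */
		b->gpu_l2_flushed = true;
	}
	b->need_tlb_invalidate = true;		/* deferred to batch_finish() */
}

static void batch_finish(struct mapping_batch *b)
{
	if (b->need_tlb_invalidate)
		gpu_tlb_invalidate();		/* once for the whole batch */
}

int main(void)
{
	struct mapping_batch batch;
	int i;

	batch_start(&batch);
	for (i = 0; i < 64; i++)		/* e.g. the measured batch size of 64 */
		unmap_buffer(&batch);
	batch_finish(&batch);
	return 0;
}

Passing NULL instead of a batch pointer keeps the old per-call behaviour,
which is exactly what the unmodified perfbuf call sites in the diff below do.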

@@ -828,7 +828,8 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 			0,
 			0,
 			0,
-			args->mapping_size);
+			args->mapping_size,
+			NULL);
 	if (err)
 		return err;
@@ -839,7 +840,7 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 	virt_addr_hi = u64_hi32(args->offset);
 	/* but check anyway */
 	if (args->offset + virt_size > SZ_4G) {
-		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
 		return -EINVAL;
 	}
@@ -881,7 +882,7 @@ static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 			perf_pmasys_mem_block_valid_false_f() |
 			perf_pmasys_mem_block_target_f(0));
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
 	return 0;
 }