gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation

Add batch support for mapping and unmapping. Batching essentially turns
some per-map/unmap overhead into per-batch overhead, namely the
gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB
invalidates. With a batch size of 64, this has been measured to yield a
>20x speed-up in low-level fixed-address mapping microbenchmarks.
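
A rough usage sketch of the batch interface, under stated assumptions:
only struct vm_gk20a_mapping_batch below is taken from this change; the
helper names and the exact split of work between the per-buffer calls
and the batch-finish step are illustrative, not the final API.

	/*
	 * Hypothetical caller: unmap a set of buffers while paying for
	 * gk20a_busy()/gk20a_idle(), the GPU L2 flush, and the GPU TLB
	 * invalidate once per batch instead of once per buffer.
	 */
	static void unmap_buffers_batched(struct vm_gk20a *vm,
					  u64 *offsets, int nr_buffers)
	{
		struct vm_gk20a_mapping_batch batch;
		int i;

		/* assumed helper: reset the batch's pending-work state */
		gk20a_vm_mapping_batch_start(&batch);

		for (i = 0; i < nr_buffers; i++)
			/* assumed helper: per-buffer PTE teardown only;
			 * flush/invalidate needs are recorded in the batch */
			gk20a_vm_unmap_buffer(vm, offsets[i], &batch);

		/* assumed helper: one deferred L2 flush and TLB invalidate
		 * for the whole batch */
		gk20a_vm_mapping_batch_finish(vm, &batch);
	}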

Bug 1614735
Bug 1623949

Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>

@@ -66,7 +66,8 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm,
 				u32 flags,
 				int rw_flag,
 				bool clear_ctags,
-				bool sparse)
+				bool sparse,
+				struct vm_gk20a_mapping_batch *batch)
 {
 	int err = 0;
 	struct device *d = dev_from_vm(vm);
@@ -130,7 +131,8 @@ static void vgpu_locked_gmmu_unmap(struct vm_gk20a *vm,
 				int pgsz_idx,
 				bool va_allocated,
 				int rw_flag,
-				bool sparse)
+				bool sparse,
+				struct vm_gk20a_mapping_batch *batch)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
@@ -182,7 +184,7 @@ static void vgpu_vm_remove_support(struct vm_gk20a *vm)
 	while (node) {
 		mapped_buffer =
 			container_of(node, struct mapped_buffer_node, node);
-		gk20a_vm_unmap_locked(mapped_buffer);
+		gk20a_vm_unmap_locked(mapped_buffer, NULL);
 		node = rb_first(&vm->mapped_buffers);
 	}
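
For reference, the teardown hunk above passes NULL for the batch, which
keeps the immediate per-unmap behaviour. A minimal sketch of how an
unmap path could honour the argument (the field name and the invalidate
call below are assumptions; only the struct name comes from this diff):

	/* Sketch: defer the TLB invalidate when a batch is supplied,
	 * otherwise perform it right away as before. */
	if (batch)
		batch->need_tlb_invalidate = true;	/* assumed field */
	else
		g->ops.mm.tlb_invalidate(vm);		/* assumed callback */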