gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation

Add batch support for mapping and unmapping. Batching converts some
per-map/unmap overhead into per-batch overhead, namely the
gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB
invalidates. Batching with size 64 has been measured to yield a >20x
speed-up in low-level fixed-address mapping microbenchmarks.

Bug 1614735
Bug 1623949

Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
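
The description above frames batching as deferring the expensive per-operation
steps (the gk20a_busy()/gk20a_idle() bracketing, GPU L2 flushes, and GPU TLB
invalidates) to the boundaries of a batch. The following is a minimal,
self-contained C sketch of that deferral pattern; the type and helper names
(mapping_batch, batch_start, batch_finish, unmap_buffer) are illustrative
stand-ins rather than the driver's actual API, and only the flush/invalidate
bookkeeping is modelled, not the real mapping work.

#include <stdbool.h>
#include <stdio.h>

/* Batch state accumulated across many map/unmap calls. */
struct mapping_batch {
	bool gpu_l2_flushed;
	bool need_tlb_invalidate;
};

static void batch_start(struct mapping_batch *b)
{
	b->gpu_l2_flushed = false;
	b->need_tlb_invalidate = false;
}

/* Stand-ins for the expensive per-operation steps named in the message. */
static void gpu_l2_flush(void)       { puts("L2 flush"); }
static void gpu_tlb_invalidate(void) { puts("TLB invalidate"); }

static void unmap_buffer(struct mapping_batch *b)
{
	/* ... tear down the GPU VA mapping itself ... */

	if (!b) {
		/* Unbatched call: keep the old pay-per-call behaviour. */
		gpu_l2_flush();
		gpu_tlb_invalidate();
		return;
	}

	if (!b->gpu_l2_flushed) {
		gpu_l2_flush();			/* at most once per batch */
		b->gpu_l2_flushed = true;
	}
	b->need_tlb_invalidate = true;		/* deferred to batch_finish() */
}

static void batch_finish(struct mapping_batch *b)
{
	if (b->need_tlb_invalidate)
		gpu_tlb_invalidate();		/* once for the whole batch */
}

int main(void)
{
	struct mapping_batch batch;
	int i;

	batch_start(&batch);
	for (i = 0; i < 64; i++)		/* e.g. the measured batch size of 64 */
		unmap_buffer(&batch);
	batch_finish(&batch);
	return 0;
}

Passing NULL instead of a batch pointer keeps the old per-call behaviour,
which is exactly what the unmodified perfbuf call sites in the diff below do.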

@@ -828,7 +828,8 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 			0,
 			0,
 			0,
-			args->mapping_size);
+			args->mapping_size,
+			NULL);
 	if (err)
 		return err;
@@ -839,7 +840,7 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 	virt_addr_hi = u64_hi32(args->offset);
 	/* but check anyway */
 	if (args->offset + virt_size > SZ_4G) {
-		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
 		return -EINVAL;
 	}
@@ -881,7 +882,7 @@ static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 			perf_pmasys_mem_block_valid_false_f() |
 			perf_pmasys_mem_block_target_f(0));
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
 	return 0;
 }