gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation

Add batch support for mapping and unmapping. Batching transforms some
per-map/unmap overhead into per-batch overhead, namely the
gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB
invalidates. With a batch size of 64, batching has been measured to
yield a >20x speed-up in low-level fixed-address mapping
microbenchmarks.
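
For illustration only (not part of this change): a userspace caller
could submit a batch of unmaps followed by maps with a single ioctl on
the address-space fd, roughly as sketched below. This assumes the usual
<linux/nvgpu.h> uapi header; dmabuf and VA setup are elided, and the
helper name is hypothetical.

/* Illustrative sketch only; error handling trimmed. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int remap_batch(int as_fd,
		struct nvgpu_as_unmap_buffer_args *unmaps, __u32 num_unmaps,
		struct nvgpu_as_map_buffer_ex_args *maps, __u32 num_maps)
{
	struct nvgpu_as_map_buffer_batch_args batch;

	/* num_unmaps/num_maps must not exceed the map_buffer_batch_limit
	 * reported in the GPU characteristics, or the ioctl fails with
	 * -EINVAL. */
	memset(&batch, 0, sizeof(batch));
	batch.unmaps = (__u64)(uintptr_t)unmaps;
	batch.num_unmaps = num_unmaps;
	batch.maps = (__u64)(uintptr_t)maps;
	batch.num_maps = num_maps;

	/* Unmaps are processed first, then maps; the busy/idle, L2 flush
	 * and TLB invalidate overheads are paid per batch rather than per
	 * map/unmap call. */
	return ioctl(as_fd, NVGPU_AS_IOCTL_MAP_BUFFER_BATCH, &batch);
}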

Bug 1614735
Bug 1623949

Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>

@@ -151,8 +151,8 @@ static int gk20a_as_ioctl_map_buffer_ex(
 			&args->offset, args->flags,
 			args->kind,
 			args->buffer_offset,
-			args->mapping_size
-			);
+			args->mapping_size,
+			NULL);
 }
 
 static int gk20a_as_ioctl_map_buffer(
@@ -163,7 +163,7 @@ static int gk20a_as_ioctl_map_buffer(
 	return gk20a_vm_map_buffer(as_share->vm, args->dmabuf_fd,
 				&args->o_a.offset,
 				args->flags, NV_KIND_DEFAULT,
-				0, 0);
+				0, 0, NULL);
 
 	/* args->o_a.offset will be set if !err */
 }
@@ -172,7 +172,86 @@ static int gk20a_as_ioctl_unmap_buffer(
 		struct nvgpu_as_unmap_buffer_args *args)
 {
 	gk20a_dbg_fn("");
-	return gk20a_vm_unmap_buffer(as_share->vm, args->offset);
+	return gk20a_vm_unmap_buffer(as_share->vm, args->offset, NULL);
 }
 
+static int gk20a_as_ioctl_map_buffer_batch(
+		struct gk20a_as_share *as_share,
+		struct nvgpu_as_map_buffer_batch_args *args)
+{
+	struct gk20a *g = as_share->vm->mm->g;
+	u32 i;
+	int err = 0;
+
+	struct nvgpu_as_unmap_buffer_args __user *user_unmap_args =
+		(struct nvgpu_as_unmap_buffer_args __user *)(uintptr_t)
+		args->unmaps;
+	struct nvgpu_as_map_buffer_ex_args __user *user_map_args =
+		(struct nvgpu_as_map_buffer_ex_args __user *)(uintptr_t)
+		args->maps;
+
+	struct vm_gk20a_mapping_batch batch;
+
+	gk20a_dbg_fn("");
+
+	if (args->num_unmaps > g->gpu_characteristics.map_buffer_batch_limit ||
+	    args->num_maps > g->gpu_characteristics.map_buffer_batch_limit)
+		return -EINVAL;
+
+	gk20a_vm_mapping_batch_start(&batch);
+
+	for (i = 0; i < args->num_unmaps; ++i) {
+		struct nvgpu_as_unmap_buffer_args unmap_args;
+
+		if (copy_from_user(&unmap_args, &user_unmap_args[i],
+				sizeof(unmap_args))) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = gk20a_vm_unmap_buffer(as_share->vm, unmap_args.offset,
+				&batch);
+		if (err)
+			break;
+	}
+
+	if (err) {
+		gk20a_vm_mapping_batch_finish(as_share->vm, &batch);
+
+		args->num_unmaps = i;
+		args->num_maps = 0;
+		return err;
+	}
+
+	for (i = 0; i < args->num_maps; ++i) {
+		struct nvgpu_as_map_buffer_ex_args map_args;
+		memset(&map_args, 0, sizeof(map_args));
+
+		if (copy_from_user(&map_args, &user_map_args[i],
+				sizeof(map_args))) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = gk20a_vm_map_buffer(
+			as_share->vm, map_args.dmabuf_fd,
+			&map_args.offset, map_args.flags,
+			map_args.kind,
+			map_args.buffer_offset,
+			map_args.mapping_size,
+			&batch);
+		if (err)
+			break;
+	}
+
+	gk20a_vm_mapping_batch_finish(as_share->vm, &batch);
+
+	if (err)
+		args->num_maps = i;
+	/* note: args->num_unmaps will be unmodified, which is ok
+	 * since all unmaps are done */
+
+	return err;
+}
+
 static int gk20a_as_ioctl_get_va_regions(
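
The batch object and its start/finish helpers live in the mm code,
which this commit also touches but which is not shown in this hunk. As
a rough, illustrative sketch of the role the commit description implies
for them (field names and bodies here are assumptions, not the actual
definitions; the per-batch gk20a_busy()/gk20a_idle() bracketing in the
map/unmap paths is not shown):

/* Illustrative sketch only -- not the actual mm_gk20a definitions. */
struct vm_gk20a_mapping_batch {
	bool gpu_l2_flushed;       /* avoid re-flushing GPU L2 within a batch */
	bool need_tlb_invalidate;  /* defer the GPU TLB invalidate to _finish() */
};

static inline void gk20a_vm_mapping_batch_start(
		struct vm_gk20a_mapping_batch *batch)
{
	batch->gpu_l2_flushed = false;
	batch->need_tlb_invalidate = false;
}

static inline void gk20a_vm_mapping_batch_finish(
		struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch)
{
	/* One TLB invalidate for the whole batch instead of one per call. */
	if (batch->need_tlb_invalidate)
		gk20a_mm_tlb_invalidate(vm);
}
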
@@ -360,6 +439,10 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = gk20a_as_ioctl_map_buffer_compbits(as_share,
 			(struct nvgpu_as_map_buffer_compbits_args *)buf);
 		break;
+	case NVGPU_AS_IOCTL_MAP_BUFFER_BATCH:
+		err = gk20a_as_ioctl_map_buffer_batch(as_share,
+			(struct nvgpu_as_map_buffer_batch_args *)buf);
+		break;
 	default:
 		dev_dbg(dev_from_gk20a(g), "unrecognized as ioctl: 0x%x", cmd);
 		err = -ENOTTY;