diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 3f9b04327..6c7ff551f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -1002,6 +1002,9 @@ unbind: mutex_unlock(&g->dbg_sessions_lock); + /* Make sure that when the ch is re-opened it will get a new HW sema. */ + ch->hw_sema = NULL; + /* make sure we catch accesses of unopened channels in case * there's non-refcounted channel pointers hanging around */ ch->g = NULL; diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index acd272b47..c5a1bd24f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -108,6 +108,8 @@ struct channel_gk20a { atomic_t ref_count; wait_queue_head_t ref_count_dec_wq; + struct gk20a_semaphore_int *hw_sema; + int hw_chid; bool wdt_enabled; bool bound; diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index d2d8c0947..9c8911e96 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c @@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher( } #endif -static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, - u64 sema, u32 payload, bool acquire, bool wfi) +static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c, + struct gk20a_semaphore *s, struct priv_cmd_entry *cmd, + int cmd_size, bool acquire, bool wfi) { u32 off = cmd->off; + u64 va; + + /* + * RO for acquire (since we just need to read the mem) and RW for + * release since we will need to write back to the semaphore memory. + */ + va = acquire ? gk20a_semaphore_gpu_ro_va(s) : + gk20a_semaphore_gpu_rw_va(s); + + /* + * If the op is not an acquire (so therefor a release) we should + * incr the underlying sema next_value. + */ + if (!acquire) + gk20a_semaphore_incr(s); + /* semaphore_a */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004); /* offset_upper */ - gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff); + gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff); /* semaphore_b */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005); /* offset */ - gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff); - /* semaphore_c */ - gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006); - /* payload */ - gk20a_mem_wr32(g, cmd->mem, off++, payload); + gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff); + if (acquire) { + /* semaphore_c */ + gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006); + /* payload */ + gk20a_mem_wr32(g, cmd->mem, off++, + gk20a_semaphore_get_value(s)); /* semaphore_d */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); /* operation: acq_geq, switch_en */ gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12)); } else { + /* semaphore_c */ + gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006); + /* payload */ + gk20a_mem_wr32(g, cmd->mem, off++, + gk20a_semaphore_get_value(s)); /* semaphore_d */ gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); /* operation: release, wfi */ @@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, /* ignored */ gk20a_mem_wr32(g, cmd->mem, off++, 0); } - return off - cmd->off; } static int gk20a_channel_semaphore_wait_syncpt( @@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt( return -ENODEV; } +/* + * UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18. 
+ * But since there's no API for getting the underlying sync_pts we have to do + * some conditional compilation. + */ +#ifdef CONFIG_SYNC +static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) + struct sync_pt *pt; + + pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list); + return gk20a_sync_pt_inst_get_sema(pt); +#else + return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt); +#endif +} + +/* + * Attempt a fast path for waiting on a sync_fence. Basically if the passed + * sync_fence is backed by a gk20a_semaphore then there's no reason to go + * through the rigmarole of setting up a separate semaphore which waits on an + * interrupt from the GPU and then triggers a worker thread to execute a SW + * based semaphore release. Instead just have the GPU wait on the same semaphore + * that is going to be incremented by the GPU. + * + * This function returns 2 possible values: -ENODEV or 0 on success. In the case + * of -ENODEV the fastpath cannot be taken due to the fence not being backed by + * a GPU semaphore. + */ +static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c, + struct sync_fence *fence, + struct priv_cmd_entry **wait_cmd, + struct gk20a_semaphore **fp_sema) +{ + struct gk20a_semaphore *sema; + int err; + + if (!gk20a_is_sema_backed_sync_fence(fence)) + return -ENODEV; + + sema = sema_from_sync_fence(fence); + + /* + * If there's no underlying sema then that means the underlying sema has + * already signaled. + */ + if (!sema) { + *fp_sema = NULL; + return 0; + } + + err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd); + if (err) + return err; + + gk20a_semaphore_get(sema); + BUG_ON(!atomic_read(&sema->value)); + add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false); + + /* + * Make sure that gk20a_channel_semaphore_wait_fd() can create another + * fence with the underlying semaphore. + */ + *fp_sema = sema; + + return 0; +} +#endif + static int gk20a_channel_semaphore_wait_fd( struct gk20a_channel_sync *s, int fd, struct priv_cmd_entry **entry, @@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd( container_of(s, struct gk20a_channel_semaphore, ops); struct channel_gk20a *c = sema->c; #ifdef CONFIG_SYNC + struct gk20a_semaphore *fp_sema; struct sync_fence *sync_fence; struct priv_cmd_entry *wait_cmd = NULL; - struct wait_fence_work *w; - int written; - int err, ret; - u64 va; + struct wait_fence_work *w = NULL; + int err, ret, status; sync_fence = gk20a_sync_fence_fdget(fd); if (!sync_fence) return -EINVAL; - w = kzalloc(sizeof(*w), GFP_KERNEL); - if (!w) { - err = -ENOMEM; - goto fail; - } - sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); - w->ch = c; - w->sema = gk20a_semaphore_alloc(sema->pool); - if (!w->sema) { - gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); - err = -ENOMEM; - goto fail; + ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema); + if (ret == 0) { + if (fp_sema) + *fence = gk20a_fence_from_semaphore(sema->timeline, + fp_sema, + &c->semaphore_wq, + NULL, false); + else + /* + * Allocate an empty fence. It will instantly return + * from gk20a_fence_wait(). + */ + *fence = gk20a_alloc_fence(NULL, NULL, false); + + sync_fence_put(sync_fence); + goto skip_slow_path; } - /* worker takes one reference */ - gk20a_semaphore_get(w->sema); + /* If the fence has signaled there is no reason to wait on it. 
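+	 * (sync_fence::status became an atomic_t on newer kernels, hence the version-dependent read below.)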
*/ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) + status = sync_fence->status; +#else + status = atomic_read(&sync_fence->status); +#endif + if (status) { + sync_fence_put(sync_fence); + goto skip_slow_path; + } err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd); if (err) { gk20a_err(dev_from_gk20a(c->g), "not enough priv cmd buffer space"); - goto fail; + sync_fence_put(sync_fence); + return -ENOMEM; } - va = gk20a_semaphore_gpu_va(w->sema, c->vm); - /* GPU unblocked when when the semaphore value becomes 1. */ - written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false); + w = kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) { + err = -ENOMEM; + goto fail_free_cmdbuf; + } + + sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); + w->ch = c; + w->sema = gk20a_semaphore_alloc(c); + if (!w->sema) { + gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); + err = -ENOMEM; + goto fail_free_worker; + } + + /* worker takes one reference */ + gk20a_semaphore_get(w->sema); + gk20a_semaphore_incr(w->sema); + + /* GPU unblocked when the semaphore value increments. */ + add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false); - WARN_ON(written != wait_cmd->size); ret = sync_fence_wait_async(sync_fence, &w->waiter); /* * If the sync_fence has already signaled then the above async_wait * will never trigger. This causes the semaphore release op to never * happen which, in turn, hangs the GPU. That's bad. So let's just - * do the semaphore_release right now. + * do the gk20a_semaphore_release() right now. */ - if (ret == 1) + if (ret == 1) { + sync_fence_put(sync_fence); gk20a_semaphore_release(w->sema); + gk20a_semaphore_put(w->sema); + } /* XXX - this fixes an actual bug, we need to hold a ref to this semaphore while the job is in flight. */ *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema, &c->semaphore_wq, NULL, false); + +skip_slow_path: *entry = wait_cmd; return 0; -fail: + +fail_free_worker: if (w && w->sema) gk20a_semaphore_put(w->sema); kfree(w); sync_fence_put(sync_fence); +fail_free_cmdbuf: + if (wait_cmd) + gk20a_free_priv_cmdbuf(c, wait_cmd); return err; #else gk20a_err(dev_from_gk20a(c->g), @@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr( struct gk20a_fence **fence, bool need_sync_fence) { - u64 va; int incr_cmd_size; - int written; struct priv_cmd_entry *incr_cmd = NULL; struct gk20a_channel_semaphore *sp = container_of(s, struct gk20a_channel_semaphore, ops); @@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr( struct gk20a_semaphore *semaphore; int err = 0; - semaphore = gk20a_semaphore_alloc(sp->pool); + semaphore = gk20a_semaphore_alloc(c); if (!semaphore) { gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); @@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr( } /* Release the completion semaphore. */ - va = gk20a_semaphore_gpu_va(semaphore, c->vm); - written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd); - WARN_ON(written != incr_cmd_size); + add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd); *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore, &c->semaphore_wq, @@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr( { /* Don't put wfi cmd to this one since we're not returning * a fence to user space. 
*/ - return __gk20a_channel_semaphore_incr(s, false /* no wfi */, - NULL, entry, fence, need_sync_fence); + return __gk20a_channel_semaphore_incr(s, + false /* no wfi */, + NULL, + entry, fence, need_sync_fence); } static int gk20a_channel_semaphore_incr_user( @@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s) container_of(s, struct gk20a_channel_semaphore, ops); if (sema->timeline) gk20a_sync_timeline_destroy(sema->timeline); - if (sema->pool) { - gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm); - gk20a_semaphore_pool_put(sema->pool); - } + + /* The sema pool is cleaned up by the VM destroy. */ + sema->pool = NULL; + kfree(sema); } static struct gk20a_channel_sync * gk20a_channel_semaphore_create(struct channel_gk20a *c) { - int err; int asid = -1; struct gk20a_channel_semaphore *sema; char pool_name[20]; @@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c) asid = c->vm->as_share->id; sprintf(pool_name, "semaphore_pool-%d", c->hw_chid); - sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024); - if (!sema->pool) - goto clean_up; - - /* Map the semaphore pool to the channel vm. Map as read-write to the - * owner channel (all other channels should map as read only!). */ - err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none); - if (err) - goto clean_up; + sema->pool = c->vm->sema_pool; #ifdef CONFIG_SYNC sema->timeline = gk20a_sync_timeline_create( "gk20a_ch%d_as%d", c->hw_chid, asid); - if (!sema->timeline) - goto clean_up; + if (!sema->timeline) { + gk20a_channel_semaphore_destroy(&sema->ops); + return NULL; + } #endif atomic_set(&sema->ops.refcount, 0); sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt; @@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c) sema->ops.destroy = gk20a_channel_semaphore_destroy; return &sema->ops; -clean_up: - gk20a_channel_semaphore_destroy(&sema->ops); - return NULL; } void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync) diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c index 23522882f..fbbaa2a7a 100644 --- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c @@ -155,8 +155,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore( #ifdef CONFIG_SYNC sync_fence = gk20a_sync_fence_create(timeline, semaphore, - dependency, "f-gk20a-0x%04x", - semaphore->offset & 0xffff); + dependency, "f-gk20a-0x%04x", + gk20a_semaphore_gpu_ro_va(semaphore)); if (!sync_fence) return NULL; #endif diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 5ab09ac38..7bd9775e4 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -738,6 +738,11 @@ struct gk20a { #endif struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; + /* + * A group of semaphore pools. One for each channel. 
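+	 * (More precisely, one pool per address space: channels that share a VM share its pool, each with its own HW semaphore slot in it.)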
+ */ + struct gk20a_semaphore_sea *sema_sea; + /* held while manipulating # of debug/profiler sessions present */ /* also prevents debug sessions from attaching until released */ struct mutex dbg_sessions_lock; diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 3b21e8432..9299266fa 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm) struct rb_node *node; gk20a_dbg_fn(""); + + /* + * Do this outside of the update_gmmu_lock since unmapping the semaphore + * pool involves unmapping a GMMU mapping which means aquiring the + * update_gmmu_lock. + */ + if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) { + gk20a_semaphore_pool_unmap(vm->sema_pool, vm); + gk20a_semaphore_pool_put(vm->sema_pool); + } + mutex_lock(&vm->update_gmmu_lock); /* TBD: add a flag here for the unmap code to recognize teardown @@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = { {.update_entry = NULL} }; +/* + * Initialize a semaphore pool. Just return successfully if we do not need + * semaphores (i.e when sync-pts are active). + */ +int gk20a_init_sema_pool(struct vm_gk20a *vm) +{ + struct gk20a_semaphore_sea *sema_sea; + struct mm_gk20a *mm = vm->mm; + struct gk20a *g = mm->g; + int err; + + /* + * Don't waste the memory on semaphores if we don't need them. + */ + if (gk20a_platform_has_syncpoints(g->dev)) + return 0; + + if (vm->sema_pool) + return 0; + + sema_sea = gk20a_semaphore_sea_create(g); + if (!sema_sea) + return -ENOMEM; + + vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea); + if (!vm->sema_pool) { + gk20a_vm_put(vm); + return -ENOMEM; + } + + /* + * Allocate a chunk of GPU VA space for mapping the semaphores. We will + * do a fixed alloc in the kernel VM so that all channels have the same + * RO address range for the semaphores. + * + * !!! TODO: cleanup. + */ + sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel], + vm->va_limit - + mm->channel.kernel_size, + 512 * PAGE_SIZE); + if (!sema_sea->gpu_va) { + gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va); + gk20a_vm_put(vm); + return -ENOMEM; + } + + err = gk20a_semaphore_pool_map(vm->sema_pool, vm); + if (err) { + gk20a_semaphore_pool_unmap(vm->sema_pool, vm); + gk20a_bfree(&vm->vma[gmmu_page_size_small], + vm->sema_pool->gpu_va); + gk20a_vm_put(vm); + } + + return 0; +} + int gk20a_init_vm(struct mm_gk20a *mm, struct vm_gk20a *vm, u32 big_page_size, @@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm, vm->big_pages = big_pages; vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; - vm->userspace_managed = userspace_managed; - vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g, vm->big_page_size); @@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm, kref_init(&vm->ref); INIT_LIST_HEAD(&vm->reserved_va_list); + /* + * This is only necessary for channel address spaces. The best way to + * distinguish channel address spaces from other address spaces is by + * size - if the address space is 4GB or less, it's not a channel. 
+ */ + if (vm->va_limit > SZ_4G) { + err = gk20a_init_sema_pool(vm); + if (err) + goto clean_up_big_allocator; + } + return 0; clean_up_big_allocator: diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index db74a5ca6..7bb4d011c 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -287,6 +287,11 @@ struct vm_gk20a { /* if non-NULL, kref_put will use this batch when unmapping. Must hold vm->update_gmmu_lock. */ struct vm_gk20a_mapping_batch *kref_put_batch; + + /* + * Each address space needs to have a semaphore pool. + */ + struct gk20a_semaphore_pool *sema_pool; }; struct gk20a; diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c index 3b17bfcb2..aa375b245 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c @@ -15,63 +15,284 @@ * more details. */ -#include "semaphore_gk20a.h" +#define pr_fmt(fmt) "gpu_sema: " fmt + #include +#include #include + +#include + #include "gk20a.h" #include "mm_gk20a.h" +#include "semaphore_gk20a.h" -static const int SEMAPHORE_SIZE = 16; +#define __lock_sema_sea(s) \ + do { \ + mutex_lock(&s->sea_lock); \ + } while (0) -struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g, - const char *unique_name, size_t capacity) +#define __unlock_sema_sea(s) \ + do { \ + mutex_unlock(&s->sea_lock); \ + } while (0) + +/* + * Return the sema_sea pointer. + */ +struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g) { - struct gk20a_semaphore_pool *p; - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) + return g->sema_sea; +} + +static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea) +{ + int ret = 0; + struct gk20a *gk20a = sea->gk20a; + + __lock_sema_sea(sea); + + ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING, + PAGE_SIZE * SEMAPHORE_POOL_COUNT, + &sea->sea_mem); + if (ret) + goto out; + + sea->ro_sg_table = sea->sea_mem.sgt; + sea->size = SEMAPHORE_POOL_COUNT; + sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE; + +out: + __unlock_sema_sea(sea); + return ret; +} + +/* + * Create the semaphore sea. Only create it once - subsequent calls to this will + * return the originally created sea pointer. + */ +struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g) +{ + if (g->sema_sea) + return g->sema_sea; + + g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL); + if (!g->sema_sea) return NULL; - kref_init(&p->ref); - INIT_LIST_HEAD(&p->maps); - mutex_init(&p->maps_mutex); - p->g = g; + g->sema_sea->size = 0; + g->sema_sea->page_count = 0; + g->sema_sea->gk20a = g; + INIT_LIST_HEAD(&g->sema_sea->pool_list); + mutex_init(&g->sema_sea->sea_lock); - /* Alloc one 4k page of semaphore per channel. */ - if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE), - &p->mem)) - goto clean_up; + if (__gk20a_semaphore_sea_grow(g->sema_sea)) + goto cleanup; - /* Sacrifice one semaphore in the name of returning error codes. 
*/ - if (gk20a_allocator_init(&p->alloc, unique_name, - SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE, - SEMAPHORE_SIZE)) - goto clean_up; + return g->sema_sea; - gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va, - (u64)sg_dma_address(p->mem.sgt->sgl), - (u64)sg_phys(p->mem.sgt->sgl)); - return p; - -clean_up: - if (p->mem.size) - gk20a_gmmu_free(p->g, &p->mem); - kfree(p); +cleanup: + kfree(g->sema_sea); + g->sema_sea = NULL; return NULL; } +static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len) +{ + unsigned long idx = find_first_zero_bit(bitmap, len); + + if (idx == len) + return -ENOSPC; + + set_bit(idx, bitmap); + + return (int)idx; +} + +/* + * Allocate a pool from the sea. + */ +struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc( + struct gk20a_semaphore_sea *sea) +{ + struct gk20a_semaphore_pool *p; + unsigned long page_idx; + int err = 0; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + __lock_sema_sea(sea); + + page_idx = __semaphore_bitmap_alloc(sea->pools_alloced, + SEMAPHORE_POOL_COUNT); + if (page_idx < 0) { + err = page_idx; + goto fail; + } + + p->page = sea->sea_mem.pages[page_idx]; + p->ro_sg_table = sea->ro_sg_table; + p->page_idx = page_idx; + p->sema_sea = sea; + INIT_LIST_HEAD(&p->hw_semas); + kref_init(&p->ref); + mutex_init(&p->pool_lock); + + sea->page_count++; + list_add(&p->pool_list_entry, &sea->pool_list); + __unlock_sema_sea(sea); + + return p; + +fail: + __unlock_sema_sea(sea); + kfree(p); + return ERR_PTR(err); +} + +/* + * Map a pool into the passed vm's address space. This handles both the fixed + * global RO mapping and the non-fixed private RW mapping. + */ +int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, + struct vm_gk20a *vm) +{ + int ents, err = 0; + u64 addr; + + p->cpu_va = vmap(&p->page, 1, 0, + pgprot_writecombine(PAGE_KERNEL)); + + /* First do the RW mapping. */ + p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL); + if (!p->rw_sg_table) + return -ENOMEM; + + err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0, + PAGE_SIZE, GFP_KERNEL); + if (err) { + err = -ENOMEM; + goto fail; + } + + /* Add IOMMU mapping... */ + ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1, + DMA_BIDIRECTIONAL); + if (ents != 1) { + err = -ENOMEM; + goto fail_free_sgt; + } + + /* Map into the GPU... Doesn't need to be fixed. */ + p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE, + 0, gk20a_mem_flag_none, false); + if (!p->gpu_va) { + err = -ENOMEM; + goto fail_unmap_sgt; + } + + /* + * And now the global mapping. Take the sea lock so that we don't race + * with a concurrent remap. + */ + __lock_sema_sea(p->sema_sea); + + BUG_ON(p->mapped); + addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table, + p->sema_sea->gpu_va, p->sema_sea->map_size, + 0, + gk20a_mem_flag_read_only, + false); + if (!addr) { + err = -ENOMEM; + BUG(); + goto fail_unlock; + } + p->gpu_va_ro = addr; + p->mapped = 1; + + __unlock_sema_sea(p->sema_sea); + + return 0; + +fail_unlock: + __unlock_sema_sea(p->sema_sea); +fail_unmap_sgt: + dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1, + DMA_BIDIRECTIONAL); +fail_free_sgt: + sg_free_table(p->rw_sg_table); +fail: + kfree(p->rw_sg_table); + p->rw_sg_table = NULL; + return err; +} + +/* + * Unmap a semaphore_pool. + */ +void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, + struct vm_gk20a *vm) +{ + struct gk20a_semaphore_int *hw_sema; + + kunmap(p->cpu_va); + + /* First the global RO mapping... 
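+	 * (done under the sea lock, since the RO sg_table is shared with the sea and could otherwise be remapped concurrently)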
*/ + __lock_sema_sea(p->sema_sea); + gk20a_gmmu_unmap(vm, p->gpu_va_ro, + p->sema_sea->map_size, gk20a_mem_flag_none); + p->ro_sg_table = NULL; + __unlock_sema_sea(p->sema_sea); + + /* And now the private RW mapping. */ + gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none); + p->gpu_va = 0; + + dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1, + DMA_BIDIRECTIONAL); + + sg_free_table(p->rw_sg_table); + kfree(p->rw_sg_table); + p->rw_sg_table = NULL; + + gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx); + list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list) + /* + * Make sure the mem addresses are all NULL so if this gets + * reused we will fault. + */ + hw_sema->value = NULL; +} + +/* + * Completely free a sempahore_pool. You should make sure this pool is not + * mapped otherwise there's going to be a memory leak. + */ static void gk20a_semaphore_pool_free(struct kref *ref) { struct gk20a_semaphore_pool *p = container_of(ref, struct gk20a_semaphore_pool, ref); - mutex_lock(&p->maps_mutex); - WARN_ON(!list_empty(&p->maps)); - mutex_unlock(&p->maps_mutex); - gk20a_gmmu_free(p->g, &p->mem); - gk20a_allocator_destroy(&p->alloc); + struct gk20a_semaphore_sea *s = p->sema_sea; + struct gk20a_semaphore_int *hw_sema, *tmp; + + WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table); + + __lock_sema_sea(s); + list_del(&p->pool_list_entry); + clear_bit(p->page_idx, s->pools_alloced); + s->page_count--; + __unlock_sema_sea(s); + + list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list) + kfree(hw_sema); + kfree(p); } -static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) +void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) { kref_get(&p->ref); } @@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p) kref_put(&p->ref, gk20a_semaphore_pool_free); } -static struct gk20a_semaphore_pool_map * -gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm) +/* + * Get the address for a semaphore_pool - if global is true then return the + * global RO address instead of the RW address owned by the semaphore's VM. + */ +u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global) { - struct gk20a_semaphore_pool_map *map, *found = NULL; - list_for_each_entry(map, &p->maps, list) { - if (map->vm == vm) { - found = map; - break; - } - } - return found; + if (!global) + return p->gpu_va; + + return p->gpu_va_ro + (PAGE_SIZE * p->page_idx); } -int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm, - enum gk20a_mem_rw_flag rw_flag) +static int __gk20a_init_hw_sema(struct channel_gk20a *ch) { - struct gk20a_semaphore_pool_map *map; + int hw_sema_idx; + int ret = 0; + struct gk20a_semaphore_int *hw_sema; + struct gk20a_semaphore_pool *p = ch->vm->sema_pool; - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) - return -ENOMEM; - map->vm = vm; - map->rw_flag = rw_flag; - map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size, - 0/*uncached*/, rw_flag, - false); - if (!map->gpu_va) { - kfree(map); - return -ENOMEM; + BUG_ON(!p); + + mutex_lock(&p->pool_lock); + + /* Find an available HW semaphore. 
*/ + hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced, + PAGE_SIZE / SEMAPHORE_SIZE); + if (hw_sema_idx < 0) { + ret = hw_sema_idx; + goto fail; } - gk20a_vm_get(vm); - mutex_lock(&p->maps_mutex); - WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm)); - list_add(&map->list, &p->maps); - mutex_unlock(&p->maps_mutex); + hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL); + if (!hw_sema) { + ret = -ENOMEM; + goto fail_free_idx; + } + + ch->hw_sema = hw_sema; + hw_sema->ch = ch; + hw_sema->p = p; + hw_sema->idx = hw_sema_idx; + hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx; + atomic_set(&hw_sema->next_value, 0); + hw_sema->value = p->cpu_va + hw_sema->offset; + writel(0, hw_sema->value); + + list_add(&hw_sema->hw_sema_list, &p->hw_semas); + + mutex_unlock(&p->pool_lock); + return 0; + +fail_free_idx: + clear_bit(hw_sema_idx, p->semas_alloced); +fail: + mutex_unlock(&p->pool_lock); + return ret; } -void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm) -{ - struct gk20a_semaphore_pool_map *map; - WARN_ON(!vm); - - mutex_lock(&p->maps_mutex); - map = gk20a_semaphore_pool_find_map_locked(p, vm); - if (map) { - gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag); - gk20a_vm_put(vm); - list_del(&map->list); - kfree(map); - } - mutex_unlock(&p->maps_mutex); -} - -u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, - struct vm_gk20a *vm) -{ - struct gk20a_semaphore_pool_map *map; - u64 gpu_va = 0; - - mutex_lock(&p->maps_mutex); - map = gk20a_semaphore_pool_find_map_locked(p, vm); - if (map) - gpu_va = map->gpu_va; - mutex_unlock(&p->maps_mutex); - - return gpu_va; -} - -struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) +/* + * Allocate a semaphore from the passed pool. + * + * Since semaphores are ref-counted there's no explicit free for external code + * to use. When the ref-count hits 0 the internal free will happen. + */ +struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch) { struct gk20a_semaphore *s; + int ret; + + if (!ch->hw_sema) { + ret = __gk20a_init_hw_sema(ch); + if (ret) + return ERR_PTR(ret); + } s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return NULL; - s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); - if (!s->offset) { - gk20a_err(dev_from_gk20a(pool->g), - "failed to allocate semaphore"); - kfree(s); - return NULL; - } - - gk20a_semaphore_pool_get(pool); - s->pool = pool; - kref_init(&s->ref); - /* Initially acquired. */ - gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0); - gk20a_dbg_info("created semaphore offset=%d, value=%d", - s->offset, - gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset)); + s->hw_sema = ch->hw_sema; + atomic_set(&s->value, 0); + + /* + * Take a ref on the pool so that we can keep this pool alive for + * as long as this semaphore is alive. 
+ */ + gk20a_semaphore_pool_get(s->hw_sema->p); + return s; } @@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref) struct gk20a_semaphore *s = container_of(ref, struct gk20a_semaphore, ref); - gk20a_bfree(&s->pool->alloc, s->offset); - gk20a_semaphore_pool_put(s->pool); + gk20a_semaphore_pool_put(s->hw_sema->p); + kfree(s); } diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h index 1f12e262b..58081b56f 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h @@ -15,17 +15,128 @@ #define SEMAPHORE_GK20A_H #include -#include "gk20a_allocator.h" -#include "mm_gk20a.h" +#include +#include + +#include "gk20a.h" +#include "mm_gk20a.h" +#include "channel_gk20a.h" + +/* + * Max number of channels that can be used is 512. This of course needs to be + * fixed to be dynamic but still fast. + */ +#define SEMAPHORE_POOL_COUNT 512 +#define SEMAPHORE_SIZE 16 +#define SEMAPHORE_SEA_GROWTH_RATE 32 + +struct gk20a_semaphore_sea; + +/* + * Underlying semaphore data structure. This semaphore can be shared amongst + * other semaphore instances. + */ +struct gk20a_semaphore_int { + int idx; /* Semaphore index. */ + u32 offset; /* Offset into the pool. */ + atomic_t next_value; /* Next available value. */ + u32 *value; /* Current value (access w/ readl()). */ + u32 nr_incrs; /* Number of increments programmed. */ + struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */ + struct channel_gk20a *ch; /* Channel that owns this sema. */ + struct list_head hw_sema_list; /* List of HW semaphores. */ +}; + +/* + * A semaphore which the rest of the driver actually uses. This consists of a + * pointer to a real semaphore and a value to wait for. This allows one physical + * semaphore to be shared among an essentially infinite number of submits. + */ +struct gk20a_semaphore { + struct gk20a_semaphore_int *hw_sema; + + atomic_t value; + int incremented; -/* A memory pool for holding semaphores. */ -struct gk20a_semaphore_pool { - struct mem_desc mem; - struct gk20a *g; - struct list_head maps; - struct mutex maps_mutex; struct kref ref; - struct gk20a_allocator alloc; +}; + +/* + * A semaphore pool. Each address space will own exactly one of these. + */ +struct gk20a_semaphore_pool { + struct page *page; /* This pool's page of memory */ + struct list_head pool_list_entry; /* Node for list of pools. */ + void *cpu_va; /* CPU access to the pool. */ + u64 gpu_va; /* GPU access to the pool. */ + u64 gpu_va_ro; /* GPU access to the pool. */ + int page_idx; /* Index into sea bitmap. */ + + struct list_head hw_semas; /* List of HW semas. */ + DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE); + + struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */ + + struct mutex pool_lock; + + /* + * This is the address spaces's personal RW table. Other channels will + * ultimately map this page as RO. + */ + struct sg_table *rw_sg_table; + + /* + * This is to keep track of whether the pool has had its sg_table + * updated during sea resizing. + */ + struct sg_table *ro_sg_table; + + int mapped; + + /* + * Sometimes a channel can be released before other channels are + * done waiting on it. This ref count ensures that the pool doesn't + * go away until all semaphores using this pool are cleaned up first. + */ + struct kref ref; +}; + +/* + * A sea of semaphores pools. Each pool is owned by a single VM. Since multiple + * channels can share a VM each channel gets it's own HW semaphore from the + * pool. 
Channels then allocate regular semaphores - basically just a value that + * signifies when a particular job is done. + */ +struct gk20a_semaphore_sea { + struct list_head pool_list; /* List of pools in this sea. */ + struct gk20a *gk20a; + + size_t size; /* Number of pages available. */ + u64 gpu_va; /* GPU virtual address of sema sea. */ + u64 map_size; /* Size of the mapping. */ + + /* + * TODO: + * List of pages that we use to back the pools. The number of pages + * can grow dynamically since allocating 512 pages for all channels at + * once would be a tremendous waste. + */ + int page_count; /* Pages allocated to pools. */ + + struct sg_table *ro_sg_table; + /* + struct page *pages[SEMAPHORE_POOL_COUNT]; + */ + + struct mem_desc sea_mem; + + /* + * Can't use a regular allocator here since the full range of pools are + * not always allocated. Instead just use a bitmap. + */ + DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT); + + struct mutex sea_lock; /* Lock alloc/free calls. */ }; enum gk20a_mem_rw_flag { @@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag { gk20a_mem_flag_write_only = 2, }; -/* A semaphore pool can be mapped to multiple GPU address spaces. */ -struct gk20a_semaphore_pool_map { - u64 gpu_va; - enum gk20a_mem_rw_flag rw_flag; - struct vm_gk20a *vm; - struct list_head list; -}; +/* + * Semaphore sea functions. + */ +struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a); +int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea, + struct vm_gk20a *vm); +void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea, + struct vm_gk20a *vm); +struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g); -/* A semaphore that lives inside a semaphore pool. */ -struct gk20a_semaphore { - struct gk20a_semaphore_pool *pool; - /* - * value exists within the pool's memory at the specified offset. - * 0=acquired, 1=released. - */ - u32 offset; /* byte offset within pool */ - struct kref ref; -}; +/* + * Semaphore pool functions. + */ +struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc( + struct gk20a_semaphore_sea *sea); +int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool, + struct vm_gk20a *vm); +void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool, + struct vm_gk20a *vm); +u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global); +void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p); +void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p); -/* Create a semaphore pool that can hold at most 'capacity' semaphores. */ -struct gk20a_semaphore_pool * -gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name, - size_t capacity); -void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *); -int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *, - struct vm_gk20a *, - enum gk20a_mem_rw_flag); -void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *, - struct vm_gk20a *); -u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *, - struct vm_gk20a *); +/* + * Semaphore functions. + */ +struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch); +void gk20a_semaphore_put(struct gk20a_semaphore *s); +void gk20a_semaphore_get(struct gk20a_semaphore *s); -/* Allocate a semaphore from the semaphore pool. The newly allocated - * semaphore will be in acquired state (value=0). 
*/ -struct gk20a_semaphore * -gk20a_semaphore_alloc(struct gk20a_semaphore_pool *); -void gk20a_semaphore_put(struct gk20a_semaphore *); -void gk20a_semaphore_get(struct gk20a_semaphore *); - -static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s, - struct vm_gk20a *vm) +/* + * Return the address of a specific semaphore. + * + * Don't call this on a semaphore you don't own - the VA returned will make no + * sense in your specific channel's VM. + */ +static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s) { - return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset; + return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) + + s->hw_sema->offset; +} + +/* + * Get the global RO address for the semaphore. Can be called on any semaphore + * regardless of whether you own it. + */ +static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s) +{ + return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) + + s->hw_sema->offset; +} + +static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema) +{ + return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) + + hw_sema->offset; +} + +/* + * TODO: handle wrap around... Hmm, how to do this? + */ +static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s) +{ + u32 sema_val = readl(s->hw_sema->value); + + /* + * If the underlying semaphore value is greater than or equal to + * the value of the semaphore then the semaphore has been signaled + * (a.k.a. released). + */ + return sema_val >= atomic_read(&s->value); } static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s) { - u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset); - - /* When often block on value reaching a certain threshold. We must make - * sure that if we get unblocked, we haven't read anything too early. */ - smp_rmb(); - return v == 0; + return !gk20a_semaphore_is_released(s); } +/* + * Read the underlying value from a semaphore. + */ +static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s) +{ + return readl(s->hw_sema->value); +} + +static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s) +{ + return atomic_read(&s->value); +} + +static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s) +{ + return atomic_read(&s->hw_sema->next_value); +} + +/* + * Note - if you call this then any prior semaphores will also be released. + */ static inline void gk20a_semaphore_release(struct gk20a_semaphore *s) { - smp_wmb(); - gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1); + u32 current_val; + u32 val = gk20a_semaphore_get_value(s); + int attempts = 0; + + /* + * Wait until the sema value is 1 less than the write value. That + * way this function is essentially an increment. + * + * TODO: tune the wait a little better. + */ + while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) { + msleep(100); + attempts += 1; + if (attempts > 100) { + WARN(1, "Stall on sema release!"); + return; + } + } + + /* + * If the semaphore has already passed the value we would write then + * this is really just a NO-OP. + */ + if (current_val >= val) + return; + + writel(val, s->hw_sema->value); +} + +/* + * Configure a software based increment on this semaphore. This is useful for + * when we want the GPU to wait on a SW event before processing a channel. + * Another way to describe this is when the GPU needs to wait on a SW pre-fence. + * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which + * then allows the GPU to continue. 
+ *
+ * Also used to prep a semaphore for an INCR by the GPU.
+ */
+static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
+{
+	BUG_ON(s->incremented);
+
+	atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
+	s->incremented = 1;
+}
 #endif
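For reference, the intended CPU-side lifecycle of the new semaphores is sketched below. This is an illustrative example only, not part of the patch: the helper name is made up, error handling is abbreviated, and it assumes ch is a bound channel whose VM already has a sema_pool (i.e. gk20a_init_sema_pool() ran for that address space).

/*
 * Illustrative sketch only - not part of the patch.
 */
static int example_sema_lifecycle(struct channel_gk20a *ch)
{
	struct gk20a_semaphore *s;

	/* Lazily creates ch->hw_sema the first time this channel allocates. */
	s = gk20a_semaphore_alloc(ch);
	if (IS_ERR_OR_NULL(s))
		return s ? PTR_ERR(s) : -ENOMEM;

	/* Reserve the next value on the underlying HW semaphore. */
	gk20a_semaphore_incr(s);

	/*
	 * SW release: waits for any earlier releases to land, then writes
	 * the reserved value into the HW semaphore slot.
	 */
	gk20a_semaphore_release(s);

	WARN_ON(!gk20a_semaphore_is_released(s));

	/* Drop our ref; the pool stays pinned while any sema still uses it. */
	gk20a_semaphore_put(s);
	return 0;
}

A GPU-side release follows the same pattern, except that add_sema_cmd() performs the gk20a_semaphore_incr() reservation itself and emits the release method into the pushbuffer instead of calling gk20a_semaphore_release() from the CPU.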