diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index e3c115c61..7e7037069 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -55,6 +55,7 @@ nvgpu-y := \
 	gk20a/gk20a_allocator.o \
 	gk20a/gk20a_allocator_bitmap.o \
 	gk20a/gk20a_allocator_buddy.o \
+	gk20a/gk20a_allocator_page.o \
 	gk20a/cde_gk20a.o \
 	gk20a/platform_gk20a_generic.o \
 	gk20a/tsg_gk20a.o \
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
index f3b6dab38..9becf053c 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
@@ -99,9 +99,32 @@ struct gk20a_allocator {
  * allocations you need to keep track of the meta-data yourself (in this
  * case the base and length of the allocation as opposed to just the base
  * of the allocation).
+ *
+ * GPU_ALLOC_4K_VIDMEM_PAGES
+ *
+ * We manage vidmem pages at a large page granularity for performance
+ * reasons; however, this can lead to wasting memory. For page allocators,
+ * setting this flag tells the allocator to manage pools of 4K pages
+ * inside internally allocated large pages.
+ *
+ * GPU_ALLOC_FORCE_CONTIG
+ *
+ * Force allocations to be contiguous. Currently only relevant for page
+ * allocators, since all other allocators are naturally contiguous.
+ *
+ * GPU_ALLOC_NO_SCATTER_GATHER
+ *
+ * The page allocator normally returns a scatter gather data structure for
+ * allocations (to handle discontiguous pages). However, at times that can
+ * be annoying, so this flag forces the page allocator to return a u64
+ * pointing to the allocation base (requires GPU_ALLOC_FORCE_CONTIG to be
+ * set as well).
  */
 #define GPU_ALLOC_GVA_SPACE		0x1
 #define GPU_ALLOC_NO_ALLOC_PAGE	0x2
+#define GPU_ALLOC_4K_VIDMEM_PAGES	0x4
+#define GPU_ALLOC_FORCE_CONTIG		0x8
+#define GPU_ALLOC_NO_SCATTER_GATHER	0x10
 
 static inline void alloc_lock(struct gk20a_allocator *a)
 {
@@ -131,6 +154,13 @@ int gk20a_bitmap_allocator_init(struct gk20a_allocator *__a,
 			       const char *name, u64 base, u64 length,
 			       u64 blk_size, u64 flags);
 
+/*
+ * Page allocator initializers.
+ */
+int gk20a_page_allocator_init(struct gk20a_allocator *__a,
+			      const char *name, u64 base, u64 length,
+			      u64 blk_size, u64 flags);
+
 #define GPU_BALLOC_MAX_ORDER 31
 
 /*
@@ -199,7 +229,7 @@ void gk20a_alloc_debugfs_init(struct platform_device *pdev);
 	} while (0)
 
 #define __alloc_dbg(a, fmt, arg...)				\
-	pr_info("%-25s %25s() " fmt, (a)->name, __func__, ##arg)
+	pr_warn("%-25s %25s() " fmt, (a)->name, __func__, ##arg)
 
 #if defined(ALLOCATOR_DEBUG)
 /*
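To make the new flag combinations concrete, here is a minimal, illustrative sketch (not part of the patch) of a caller bringing up a page allocator and allocating from it. The function name and carveout sizes are invented for the example; only gk20a_page_allocator_init(), gk20a_alloc() and gk20a_free() come from the allocator API above.

#include <linux/sizes.h>

#include "gk20a_allocator.h"

/*
 * Illustrative only: a page allocator over an example 256MB carveout.
 * With GPU_ALLOC_FORCE_CONTIG | GPU_ALLOC_NO_SCATTER_GATHER the u64
 * returned by gk20a_alloc() is a plain base address rather than a
 * pointer to a struct gk20a_page_alloc.
 */
static int example_vidmem_setup(struct gk20a_allocator *vidmem)
{
	u64 addr;
	int err;

	err = gk20a_page_allocator_init(vidmem, "vidmem-example",
					SZ_4K, SZ_256M - SZ_4K, SZ_4K,
					GPU_ALLOC_FORCE_CONTIG |
					GPU_ALLOC_NO_SCATTER_GATHER);
	if (err)
		return err;

	addr = gk20a_alloc(vidmem, SZ_64K);	/* contiguous base address */
	if (!addr)
		return -ENOMEM;

	gk20a_free(vidmem, addr);
	return 0;
}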
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c
new file mode 100644
index 000000000..534027cc5
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+
+#include "gk20a_allocator.h"
+#include "buddy_allocator_priv.h"
+#include "page_allocator_priv.h"
+
+#define palloc_dbg(a, fmt, arg...)			\
+	alloc_dbg(palloc_owner(a), fmt, ##arg)
+
+static struct kmem_cache *page_alloc_cache;
+static struct kmem_cache *page_alloc_chunk_cache;
+static DEFINE_MUTEX(meta_data_cache_lock);
+
+static u64 gk20a_page_alloc_length(struct gk20a_allocator *a)
+{
+	struct gk20a_page_allocator *va = a->priv;
+
+	return gk20a_alloc_length(&va->source_allocator);
+}
+
+static u64 gk20a_page_alloc_base(struct gk20a_allocator *a)
+{
+	struct gk20a_page_allocator *va = a->priv;
+
+	return gk20a_alloc_base(&va->source_allocator);
+}
+
+static int gk20a_page_alloc_inited(struct gk20a_allocator *a)
+{
+	struct gk20a_page_allocator *va = a->priv;
+
+	return gk20a_alloc_initialized(&va->source_allocator);
+}
+
+static u64 gk20a_page_alloc_end(struct gk20a_allocator *a)
+{
+	struct gk20a_page_allocator *va = a->priv;
+
+	return gk20a_alloc_end(&va->source_allocator);
+}
+
+static int __insert_page_alloc(struct gk20a_page_allocator *a,
+			       struct gk20a_page_alloc *alloc)
+{
+	struct rb_node **new = &a->allocs.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*new) {
+		struct gk20a_page_alloc *tmp =
+			container_of(*new, struct gk20a_page_alloc,
+				     tree_entry);
+
+		parent = *new;
+		if (alloc->base < tmp->base) {
+			new = &((*new)->rb_left);
+		} else if (alloc->base > tmp->base) {
+			new = &((*new)->rb_right);
+		} else {
+			WARN(1, "Duplicate entries in allocated list!\n");
+			return 0;
+		}
+	}
+
+	rb_link_node(&alloc->tree_entry, parent, new);
+	rb_insert_color(&alloc->tree_entry, &a->allocs);
+
+	return 0;
+}
+
+static struct gk20a_page_alloc *__find_page_alloc(
+	struct gk20a_page_allocator *a,
+	u64 addr)
+{
+	struct rb_node *node = a->allocs.rb_node;
+	struct gk20a_page_alloc *alloc;
+
+	while (node) {
+		alloc = container_of(node, struct gk20a_page_alloc, tree_entry);
+
+		if (addr < alloc->base)
+			node = node->rb_left;
+		else if (addr > alloc->base)
+			node = node->rb_right;
+		else
+			break;
+	}
+
+	if (!node)
+		return NULL;
+
+	rb_erase(node, &a->allocs);
+
+	return alloc;
+}
+
+/*
+ * Allocate physical pages. Since the underlying allocator is a buddy
+ * allocator, the returned pages are always contiguous. However, since there
+ * could be fragmentation in the space, this allocator will collate smaller,
+ * non-contiguous allocations together if necessary.
+ */
+static struct gk20a_page_alloc *__gk20a_alloc_pages(
+	struct gk20a_page_allocator *a, u64 pages)
+{
+	struct gk20a_page_alloc *alloc;
+	struct page_alloc_chunk *c;
+	u64 max_chunk_len = pages << a->page_shift;
+	int i = 0;
+
+	alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL);
+	if (!alloc)
+		goto fail;
+
+	INIT_LIST_HEAD(&alloc->alloc_chunks);
+	alloc->length = pages << a->page_shift;
+
+	while (pages) {
+		u64 chunk_addr = 0;
+		u64 chunk_pages = 1 << __fls(pages);
+		u64 chunk_len = chunk_pages << a->page_shift;
+
+		/*
+		 * Take care of the possibility that the allocation must be
+		 * contiguous. If this is not the first iteration then that
+		 * means the first iteration failed to alloc the entire
+		 * requested size. The buddy allocator guarantees any given
+		 * single alloc is contiguous.
+		 */
+		if (a->flags & GPU_ALLOC_FORCE_CONTIG && i != 0)
+			goto fail_cleanup;
+
+		if (chunk_len > max_chunk_len)
+			chunk_len = max_chunk_len;
+
+		/*
+		 * Keep attempting to allocate in smaller chunks until the alloc
+		 * either succeeds or is smaller than the page_size of the
+		 * allocator (i.e. the allocator is OOM).
+		 */
+		do {
+			chunk_addr = gk20a_alloc(&a->source_allocator,
+						 chunk_len);
+
+			/* Divide by 2 and try again */
+			if (!chunk_addr) {
+				palloc_dbg(a, "balloc failed: 0x%llx\n",
+					   chunk_len);
+				chunk_len >>= 1;
+				max_chunk_len = chunk_len;
+			}
+		} while (!chunk_addr && chunk_len >= a->page_size);
+
+		if (!chunk_addr) {
+			palloc_dbg(a, "bailing @ 0x%llx\n", chunk_len);
+			goto fail_cleanup;
+		}
+
+		c = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL);
+		if (!c) {
+			gk20a_free(&a->source_allocator, chunk_addr);
+			goto fail_cleanup;
+		}
+
+		pages -= chunk_pages;
+
+		c->base = chunk_addr;
+		c->length = chunk_len;
+		list_add(&c->list_entry, &alloc->alloc_chunks);
+
+		i++;
+	}
+
+	alloc->nr_chunks = i;
+	c = list_first_entry(&alloc->alloc_chunks,
+			     struct page_alloc_chunk, list_entry);
+	alloc->base = c->base;
+
+	return alloc;
+
+fail_cleanup:
+	while (!list_empty(&alloc->alloc_chunks)) {
+		c = list_first_entry(&alloc->alloc_chunks,
+				     struct page_alloc_chunk, list_entry);
+		list_del(&c->list_entry);
+		kfree(c);
+	}
+	kfree(alloc);
+fail:
+	return ERR_PTR(-ENOMEM);
+}
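As a side note on the routine above, here is a small illustration (not part of the patch) of the power-of-two chunking it performs; the helper is hypothetical and only repeats the arithmetic, without touching the buddy allocator. For pages = 7 and a 4K page size it would report contiguous chunks of 4, 2 and 1 pages (0x4000, 0x2000 and 0x1000 bytes).

#include <linux/bitops.h>
#include <linux/printk.h>

/* Example only: the same power-of-two walk, minus the actual allocations. */
static void example_chunk_walk(u64 pages, u64 page_shift)
{
	while (pages) {
		u64 chunk_pages = 1ULL << __fls(pages); /* largest pow-2 <= pages */

		pr_info("chunk: %llu pages (0x%llx bytes)\n",
			chunk_pages, chunk_pages << page_shift);
		pages -= chunk_pages;
	}
}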
+
+/*
+ * Allocate enough pages to satisfy @len. Page size is determined at
+ * initialization of the allocator.
+ *
+ * The returned u64 is actually a pointer to a struct gk20a_page_alloc. This
+ * is because it doesn't make a lot of sense to return the address of the
+ * first page in the list of pages (since they could be discontiguous). This
+ * has precedent in the dma_alloc APIs, though here it's really just an
+ * artifact of the gk20a_alloc() API requiring a u64 return type.
+ */
+static u64 gk20a_page_alloc(struct gk20a_allocator *__a, u64 len)
+{
+	struct gk20a_page_allocator *a = page_allocator(__a);
+	struct gk20a_page_alloc *alloc = NULL;
+	struct page_alloc_chunk *c;
+	u64 real_len;
+	u64 pages;
+	int i = 0;
+
+	/*
+	 * If we want contig pages, we have to round up to a power of two. It's
+	 * easier to do that here than in the buddy allocator.
+	 */
+	real_len = a->flags & GPU_ALLOC_FORCE_CONTIG ?
+		roundup_pow_of_two(len) : len;
+
+	pages = ALIGN(real_len, a->page_size) >> a->page_shift;
+
+	alloc_lock(__a);
+
+	alloc = __gk20a_alloc_pages(a, pages);
+	if (IS_ERR(alloc)) {
+		alloc_unlock(__a);
+		palloc_dbg(a, "Alloc 0x%llx (%llu) (failed)\n",
+			   pages << a->page_shift, pages);
+		return 0;
+	}
+
+	__insert_page_alloc(a, alloc);
+	alloc_unlock(__a);
+
+	palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n",
+		   pages << a->page_shift, pages, alloc->base);
+	list_for_each_entry(c, &alloc->alloc_chunks, list_entry) {
+		palloc_dbg(a, "  Chunk %2d: 0x%010llx + 0x%llx\n",
+			   i++, c->base, c->length);
+	}
+
+	a->nr_allocs++;
+	a->pages_alloced += pages;
+
+	if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER)
+		return alloc->base;
+	else
+		return (u64) (uintptr_t) alloc;
+}
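Callers that leave GPU_ALLOC_NO_SCATTER_GATHER clear get back a struct gk20a_page_alloc pointer disguised as a u64. A minimal, illustrative sketch of how such a caller might walk the chunk list (not part of the patch; the helper name is invented, and the structures come from page_allocator_priv.h further down):

#include <linux/list.h>
#include <linux/printk.h>

#include "gk20a_allocator.h"
#include "page_allocator_priv.h"

/* Illustrative only: consume a scatter-gather style page allocation. */
static void example_walk_chunks(struct gk20a_allocator *a, u64 len)
{
	u64 handle = gk20a_alloc(a, len);
	struct gk20a_page_alloc *alloc;
	struct page_alloc_chunk *c;

	if (!handle)
		return;

	alloc = (struct gk20a_page_alloc *)(uintptr_t)handle;
	list_for_each_entry(c, &alloc->alloc_chunks, list_entry)
		pr_info("chunk: 0x%010llx + 0x%llx\n", c->base, c->length);

	/* gk20a_page_free() expects the same handle back. */
	gk20a_free(a, handle);
}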
+
+static void __gk20a_free_pages(struct gk20a_page_allocator *a,
+			       struct gk20a_page_alloc *alloc)
+{
+	struct page_alloc_chunk *chunk;
+
+	while (!list_empty(&alloc->alloc_chunks)) {
+		chunk = list_first_entry(&alloc->alloc_chunks,
+					 struct page_alloc_chunk,
+					 list_entry);
+		list_del(&chunk->list_entry);
+
+		gk20a_free(&a->source_allocator, chunk->base);
+		kfree(chunk);
+	}
+
+	kfree(alloc);
+}
+
+/*
+ * Note: this will remove the gk20a_page_alloc struct from the RB tree
+ * if it's found.
+ */
+static void gk20a_page_free(struct gk20a_allocator *__a, u64 base)
+{
+	struct gk20a_page_allocator *a = page_allocator(__a);
+	struct gk20a_page_alloc *alloc;
+
+	alloc_lock(__a);
+
+	if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER)
+		alloc = __find_page_alloc(a, base);
+	else
+		alloc = __find_page_alloc(a,
+			((struct gk20a_page_alloc *)(uintptr_t)base)->base);
+
+	if (!alloc) {
+		palloc_dbg(a, "Hrm, found no alloc?\n");
+		goto done;
+	}
+
+	a->nr_frees++;
+	a->pages_freed += (alloc->length >> a->page_shift);
+
+	palloc_dbg(a, "Free 0x%010llx id=0x%010llx\n",
+		   alloc->length, alloc->base);
+
+	/*
+	 * Frees *alloc, so the debug print above must come first.
+	 */
+	__gk20a_free_pages(a, alloc);
+
+done:
+	alloc_unlock(__a);
+}
+
+static struct gk20a_page_alloc *__gk20a_alloc_pages_fixed(
+	struct gk20a_page_allocator *a, u64 base, u64 length)
+{
+	struct gk20a_page_alloc *alloc;
+	struct page_alloc_chunk *c;
+
+	alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL);
+	c = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL);
+	if (!alloc || !c)
+		goto fail;
+
+	alloc->base = gk20a_alloc_fixed(&a->source_allocator, base, length);
+	if (!alloc->base) {
+		WARN(1, "gk20a: failed to fixed alloc pages @ 0x%010llx", base);
+		goto fail;
+	}
+
+	alloc->nr_chunks = 1;
+	alloc->length = length;
+	INIT_LIST_HEAD(&alloc->alloc_chunks);
+
+	c->base = alloc->base;
+	c->length = length;
+	list_add(&c->list_entry, &alloc->alloc_chunks);
+
+	return alloc;
+
+fail:
+	kfree(c);
+	kfree(alloc);
+	return ERR_PTR(-ENOMEM);
+}
+
+static u64 gk20a_page_alloc_fixed(struct gk20a_allocator *__a,
+				  u64 base, u64 len)
+{
+	struct gk20a_page_allocator *a = page_allocator(__a);
+	struct gk20a_page_alloc *alloc = NULL;
+	struct page_alloc_chunk *c;
+	u64 aligned_len, pages;
+	int i = 0;
+
+	aligned_len = ALIGN(len, a->page_size);
+	pages = aligned_len >> a->page_shift;
+
+	alloc_lock(__a);
+
+	alloc = __gk20a_alloc_pages_fixed(a, base, aligned_len);
+	if (IS_ERR(alloc)) {
+		alloc_unlock(__a);
+		return 0;
+	}
+
+	__insert_page_alloc(a, alloc);
+	alloc_unlock(__a);
+
+	palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n",
+		   alloc->base, aligned_len, pages);
+	list_for_each_entry(c, &alloc->alloc_chunks, list_entry) {
+		palloc_dbg(a, "  Chunk %2d: 0x%010llx + 0x%llx\n",
+			   i++, c->base, c->length);
+	}
+
+	a->nr_fixed_allocs++;
+	a->pages_alloced += pages;
+
+	if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER)
+		return alloc->base;
+	else
+		return (u64) (uintptr_t) alloc;
+}
+
+static void gk20a_page_free_fixed(struct gk20a_allocator *__a,
+				  u64 base, u64 len)
+{
+	struct gk20a_page_allocator *a = page_allocator(__a);
+	struct gk20a_page_alloc *alloc;
+
+	alloc_lock(__a);
+
+	if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) {
+		alloc = __find_page_alloc(a, base);
+		if (!alloc)
+			goto done;
+	} else {
+		alloc = (struct gk20a_page_alloc *) (uintptr_t) base;
+	}
+
+	palloc_dbg(a, "Free [fixed] 0x%010llx + 0x%llx\n",
+		   alloc->base, alloc->length);
+	a->nr_fixed_frees++;
+	a->pages_freed += (alloc->length >> a->page_shift);
+
+	/*
+	 * This works for the time being since the buddy allocator
+	 * uses the same free function for both fixed and regular
+	 * allocs. This would have to be updated if the underlying
+	 * allocator were to change. Note that this frees *alloc.
+	 */
+	__gk20a_free_pages(a, alloc);
+
+done:
+	alloc_unlock(__a);
+}
+
+static void gk20a_page_allocator_destroy(struct gk20a_allocator *__a)
+{
+	struct gk20a_page_allocator *a = page_allocator(__a);
+
+	alloc_lock(__a);
+	kfree(a);
+	__a->priv = NULL;
+	alloc_unlock(__a);
+}
+
+static void gk20a_page_print_stats(struct gk20a_allocator *__a,
+				   struct seq_file *s, int lock)
+{
+	struct gk20a_page_allocator *a = page_allocator(__a);
+
+	if (lock)
+		alloc_lock(__a);
+
+	__alloc_pstat(s, __a, "Page allocator:\n");
+	__alloc_pstat(s, __a, "  allocs         %lld\n", a->nr_allocs);
+	__alloc_pstat(s, __a, "  frees          %lld\n", a->nr_frees);
+	__alloc_pstat(s, __a, "  fixed_allocs   %lld\n", a->nr_fixed_allocs);
+	__alloc_pstat(s, __a, "  fixed_frees    %lld\n", a->nr_fixed_frees);
+	__alloc_pstat(s, __a, "  pages alloced  %lld\n", a->pages_alloced);
+	__alloc_pstat(s, __a, "  pages freed    %lld\n", a->pages_freed);
+	__alloc_pstat(s, __a, "\n");
+	__alloc_pstat(s, __a, "Source alloc: %s\n",
+		      a->source_allocator.name);
+
+	gk20a_alloc_print_stats(&a->source_allocator, s, lock);
+
+	if (lock)
+		alloc_unlock(__a);
+}
+
+static const struct gk20a_allocator_ops page_ops = {
+	.alloc		= gk20a_page_alloc,
+	.free		= gk20a_page_free,
+
+	.alloc_fixed	= gk20a_page_alloc_fixed,
+	.free_fixed	= gk20a_page_free_fixed,
+
+	.base		= gk20a_page_alloc_base,
+	.length		= gk20a_page_alloc_length,
+	.end		= gk20a_page_alloc_end,
+	.inited		= gk20a_page_alloc_inited,
+
+	.fini		= gk20a_page_allocator_destroy,
+
+	.print_stats	= gk20a_page_print_stats,
+};
+
+int gk20a_page_allocator_init(struct gk20a_allocator *__a,
+			      const char *name, u64 base, u64 length,
+			      u64 blk_size, u64 flags)
+{
+	struct gk20a_page_allocator *a;
+	char buddy_name[sizeof(__a->name)];
+	int err;
+
+	mutex_lock(&meta_data_cache_lock);
+	if (!page_alloc_cache)
+		page_alloc_cache = KMEM_CACHE(gk20a_page_alloc, 0);
+	if (!page_alloc_chunk_cache)
+		page_alloc_chunk_cache = KMEM_CACHE(page_alloc_chunk, 0);
+	mutex_unlock(&meta_data_cache_lock);
+
+	if (!page_alloc_cache || !page_alloc_chunk_cache)
+		return -ENOMEM;
+
+	a = kzalloc(sizeof(struct gk20a_page_allocator), GFP_KERNEL);
+	if (!a)
+		return -ENOMEM;
+
+	err = __gk20a_alloc_common_init(__a, name, a, false, &page_ops);
+	if (err)
+		goto fail;
+
+	a->base = base;
+	a->length = length;
+	a->page_size = blk_size;
+	a->page_shift = __ffs(blk_size);
+	a->allocs = RB_ROOT;
+	a->owner = __a;
+	a->flags = flags;
+
+	snprintf(buddy_name, sizeof(buddy_name), "%s-src", name);
+
+	err = gk20a_buddy_allocator_init(&a->source_allocator, buddy_name, base,
+					 length, blk_size, 0);
+	if (err)
+		goto fail;
+
+	gk20a_init_alloc_debug(__a);
+	palloc_dbg(a, "New allocator: type      page\n");
+	palloc_dbg(a, "               base      0x%llx\n", a->base);
+	palloc_dbg(a, "               size      0x%llx\n", a->length);
+	palloc_dbg(a, "               page_size 0x%llx\n", a->page_size);
+	palloc_dbg(a, "               flags     0x%llx\n", a->flags);
+
+	return 0;
+
+fail:
+	kfree(a);
+	return err;
+}
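One assumption worth calling out in the init path above: page_shift is derived with __ffs(blk_size), which only equals the page shift when blk_size is a power of two (for example, __ffs(SZ_4K) == 12). A hypothetical guard a caller could apply before initialization, not something the patch itself adds:

#include <linux/log2.h>

/* Hypothetical sanity check: blk_size must be a non-zero power of two. */
static bool example_valid_page_size(u64 blk_size)
{
	return blk_size != 0 && is_power_of_2(blk_size);
}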
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index b63444d0c..cab109023 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -764,8 +764,10 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm)
 	if (!size)
 		return 0;
 
-	err = gk20a_buddy_allocator_init(&g->mm.vidmem.allocator, "vidmem",
-					 SZ_4K, size - SZ_4K, SZ_4K, 0);
+	err = gk20a_page_allocator_init(&g->mm.vidmem.allocator, "vidmem",
+					SZ_4K, size - SZ_4K, SZ_4K,
+					GPU_ALLOC_FORCE_CONTIG |
+					GPU_ALLOC_NO_SCATTER_GATHER);
 	if (err) {
 		gk20a_err(d, "Failed to register vidmem for size %zu: %d",
 			  size, err);
diff --git a/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h b/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h
new file mode 100644
index 000000000..bce5b75ef
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef PAGE_ALLOCATOR_PRIV_H
+#define PAGE_ALLOCATOR_PRIV_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+#include "gk20a_allocator.h"
+
+struct gk20a_allocator;
+
+struct page_alloc_chunk {
+	struct list_head list_entry;
+
+	u64 base;
+	u64 length;
+};
+
+/*
+ * Struct to handle internal management of page allocation. It holds a list
+ * of the chunks of pages that make up the overall allocation - much like a
+ * scatter gather table.
+ */
+struct gk20a_page_alloc {
+	struct list_head alloc_chunks;
+
+	int nr_chunks;
+	u64 length;
+
+	/*
+	 * Only useful for the RB tree - since the alloc will have discontiguous
+	 * pages, the base is essentially irrelevant except for the fact that it
+	 * is guaranteed to be unique.
+	 */
+	u64 base;
+
+	struct rb_node tree_entry;
+};
+
+struct gk20a_page_allocator {
+	struct gk20a_allocator *owner;	/* Owner of this allocator. */
+
+	/*
+	 * Use a buddy allocator to manage the allocation of the underlying
+	 * pages. This lets us abstract the discontiguous allocation handling
+	 * out of the annoyingly complicated buddy allocator.
+	 */
+	struct gk20a_allocator source_allocator;
+
+	/*
+	 * Page params.
+	 */
+	u64 base;
+	u64 length;
+	u64 page_size;
+	u32 page_shift;
+
+	struct rb_root allocs;		/* Outstanding allocations. */
+
+	u64 flags;
+
+	/*
+	 * Stat tracking.
+	 */
+	u64 nr_allocs;
+	u64 nr_frees;
+	u64 nr_fixed_allocs;
+	u64 nr_fixed_frees;
+	u64 pages_alloced;
+	u64 pages_freed;
+};
+
+static inline struct gk20a_page_allocator *page_allocator(
+	struct gk20a_allocator *a)
+{
+	return (struct gk20a_page_allocator *)(a)->priv;
+}
+
+static inline struct gk20a_allocator *palloc_owner(
+	struct gk20a_page_allocator *a)
+{
+	return a->owner;
+}
+
+#endif