gpu: nvgpu: New allocator for VA space

Implement a new buddy allocation scheme for the GPU's VA space.
The bitmap allocator used too much memory and is not a scalable
solution as the GPU's address space keeps getting bigger. The buddy
allocation scheme is much more memory efficient when the majority
of the address space is not allocated.
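For a sense of scale (illustrative numbers, not from this change): a
bitmap covering a 128 GiB space at 4 KiB granularity pays about 4 MiB
of metadata up front whether or not anything is allocated, while a
buddy tree only instantiates nodes for outstanding allocations. A
standalone sketch of that arithmetic:

	/* Illustrative only: metadata cost of bitmap vs. buddy tracking. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long space = 128ULL << 30; /* 128 GiB VA space */
		unsigned long long blk = 4096;           /* 4 KiB order-0 block */
		unsigned long long blks = space / blk;

		/* A bitmap needs one bit per block, allocated up front. */
		printf("bitmap: %llu KiB up front\n", blks / 8 / 1024);

		/* A buddy tree pays roughly one small node per live
		 * allocation (plus its ancestors), so an idle space
		 * costs almost nothing. */
		return 0;
	}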

The buddy allocator is not constrained by the notion of a split
address space. The bitmap allocator could only manage either small
pages or large pages, but not both at the same time. Thus the bottom
of the address space was reserved for small pages and the top for
large pages. Although that split is not removed quite yet, the new
allocator enables its removal.

The buddy allocator is also very scalable. It can manage everything
from the relatively small comptag space to the enormous GPU VA space.
This is important since the GPU has many differently sized spaces
that need managing.

Currently there are certain limitations. For one, the allocator does
not handle fixed allocations from CUDA very well. It can do so, but
with certain caveats: the PTE page size is always set to small, which
means the buddy allocator may place other small-page allocations in
the buddies around the fixed allocation. It does this to avoid mixing
large- and small-page allocations in the same PDE.

Change-Id: I501cd15af03611536490137331d43761c402c7f9
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/740694
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Author:    Alex Waterman <alexw@nvidia.com>
Date:      2015-03-18 13:33:09 -07:00
Committer: Terje Bergstrom
Parent:    0566aee853
Commit:    a2e8523645

13 changed files with 1406 additions and 364 deletions


@@ -199,21 +199,14 @@ static int gk20a_as_ioctl_get_va_regions(
 	for (i = 0; i < write_entries; ++i) {
 		struct nvgpu_as_va_region region;
-		u32 base, limit;
 
 		memset(&region, 0, sizeof(struct nvgpu_as_va_region));
 
-		if (!vm->vma[i].constraint.enable) {
-			base = vm->vma[i].base;
-			limit = vm->vma[i].limit;
-		} else {
-			base = vm->vma[i].constraint.base;
-			limit = vm->vma[i].constraint.limit;
-		}
-
 		region.page_size = vm->gmmu_page_sizes[i];
-		region.offset = (u64)base * region.page_size;
-		region.pages = limit - base; /* NOTE: limit is exclusive */
+		region.offset = vm->vma[i].base;
+		/* No __aeabi_uldivmod() on some platforms... */
+		region.pages = (vm->vma[i].end - vm->vma[i].start) >>
+			ilog2(region.page_size);
 
 		if (copy_to_user(user_region_ptr + i, &region, sizeof(region)))
 			return -EFAULT;
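The replacement computes the page count with a shift rather than a
64-bit division: on 32-bit ARM a u64 divide pulls in the libgcc helper
__aeabi_uldivmod, which is what the comment above alludes to. A minimal
userspace sketch of the same trick (ilog2_u64() here stands in for the
kernel's ilog2(); values are made up):

	#include <stdio.h>

	static unsigned int ilog2_u64(unsigned long long v)
	{
		unsigned int r = 0;
		while (v >>= 1)
			r++;
		return r;
	}

	int main(void)
	{
		unsigned long long start = 0x100000, end = 0x4100000;
		unsigned long long page_size = 4096; /* power of two */

		/* (end - start) / page_size, expressed as a shift. */
		unsigned long long pages =
			(end - start) >> ilog2_u64(page_size);

		printf("pages = %llu\n", pages); /* 16384 */
		return 0;
	}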


@@ -59,6 +59,7 @@
 #include "hw_fb_gk20a.h"
 #include "gk20a_scale.h"
 #include "dbg_gpu_gk20a.h"
+#include "gk20a_allocator.h"
 #include "hal.h"
 #include "vgpu/vgpu.h"
@@ -1510,6 +1511,7 @@ static int gk20a_probe(struct platform_device *dev)
 				&gk20a->mm.disable_bigpage);
 	gk20a_pmu_debugfs_init(dev);
 	gk20a_cde_debugfs_init(dev);
+	gk20a_alloc_debugfs_init(dev);
 #endif
 	gk20a_init_gr(gk20a);


File diff suppressed because it is too large.


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -17,75 +17,190 @@
 #ifndef GK20A_ALLOCATOR_H
 #define GK20A_ALLOCATOR_H
 
+#include <linux/list.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/platform_device.h>
 
 /* #define ALLOCATOR_DEBUG */
 
-/* main struct */
-struct gk20a_allocator {
+/*
+ * Each buddy is an element in a binary tree.
+ */
+struct gk20a_buddy {
+	struct gk20a_buddy *parent;	/* Parent node. */
+	struct gk20a_buddy *buddy;	/* This node's buddy. */
+	struct gk20a_buddy *left;	/* Lower address sub-node. */
+	struct gk20a_buddy *right;	/* Higher address sub-node. */
 
-	char name[32];			/* name for allocator */
-	struct rb_root rb_root;		/* rb tree root for blocks */
+	struct list_head buddy_entry;	/* List entry for various lists. */
+	struct rb_node alloced_entry;	/* RB tree of allocations. */
 
-	u32 base;			/* min value of this linear space */
-	u32 limit;			/* max value = limit - 1 */
+	u64 start;			/* Start address of this buddy. */
+	u64 end;			/* End address of this buddy. */
+	u64 order;			/* Buddy order. */
 
-	unsigned long *bitmap;		/* bitmap */
-
-	struct gk20a_alloc_block *block_first;	/* first block in list */
-	struct gk20a_alloc_block *block_recent;	/* last visited block */
-
-	u32 first_free_addr;		/* first free addr, non-contigous
-					   allocation preferred start,
-					   in order to pick up small holes */
-	u32 last_free_addr;		/* last free addr, contiguous
-					   allocation preferred start */
-	u32 cached_hole_size;		/* max free hole size up to
-					   last_free_addr */
-	u32 block_count;		/* number of blocks */
-
-	struct rw_semaphore rw_sema;	/* lock */
-	struct kmem_cache *block_cache;	/* slab cache */
-
-	/* if enabled, constrain to [base, limit) */
-	struct {
-		bool enable;
-		u32 base;
-		u32 limit;
-	} constraint;
-
-	int (*alloc)(struct gk20a_allocator *allocator,
-		u32 *addr, u32 len, u32 align);
-	int (*free)(struct gk20a_allocator *allocator,
-		u32 addr, u32 len, u32 align);
-
+#define BALLOC_BUDDY_ALLOCED	0x1
+#define BALLOC_BUDDY_SPLIT	0x2
+#define BALLOC_BUDDY_IN_LIST	0x4
+	int flags;			/* List of associated flags. */
+
+	/*
+	 * Size of the PDE this buddy is using. This allows for grouping like
+	 * sized allocations into the same PDE.
+	 */
+#define BALLOC_PTE_SIZE_ANY	0x0
+#define BALLOC_PTE_SIZE_SMALL	0x1
+#define BALLOC_PTE_SIZE_BIG	0x2
+	int pte_size;
 };
 
-int gk20a_allocator_init(struct gk20a_allocator *allocator,
-			const char *name, u32 base, u32 size);
+#define __buddy_flag_ops(flag, flag_up)					\
+	static inline int buddy_is_ ## flag(struct gk20a_buddy *b)	\
+	{								\
+		return b->flags & BALLOC_BUDDY_ ## flag_up;		\
+	}								\
+	static inline void buddy_set_ ## flag(struct gk20a_buddy *b)	\
+	{								\
+		b->flags |= BALLOC_BUDDY_ ## flag_up;			\
+	}								\
+	static inline void buddy_clr_ ## flag(struct gk20a_buddy *b)	\
+	{								\
+		b->flags &= ~BALLOC_BUDDY_ ## flag_up;			\
+	}
+
+/*
+ * int  buddy_is_alloced(struct gk20a_buddy *b);
+ * void buddy_set_alloced(struct gk20a_buddy *b);
+ * void buddy_clr_alloced(struct gk20a_buddy *b);
+ *
+ * int  buddy_is_split(struct gk20a_buddy *b);
+ * void buddy_set_split(struct gk20a_buddy *b);
+ * void buddy_clr_split(struct gk20a_buddy *b);
+ *
+ * int  buddy_is_in_list(struct gk20a_buddy *b);
+ * void buddy_set_in_list(struct gk20a_buddy *b);
+ * void buddy_clr_in_list(struct gk20a_buddy *b);
+ */
+__buddy_flag_ops(alloced, ALLOCED);
+__buddy_flag_ops(split, SPLIT);
+__buddy_flag_ops(in_list, IN_LIST);
+
+/*
+ * Keeps info for a fixed allocation.
+ */
+struct gk20a_fixed_alloc {
+	struct list_head buddies;	/* List of buddies. */
+	struct rb_node alloced_entry;	/* RB tree of fixed allocations. */
+
+	u64 start;			/* Start of fixed block. */
+	u64 end;			/* End address. */
+};
+
+struct vm_gk20a;
+
+/*
+ * GPU buddy allocator for the various GPU address spaces. Each addressable unit
+ * doesn't have to correspond to a byte. In some cases each unit is a more
+ * complex object such as a comp_tag line or the like.
+ *
+ * The max order is computed based on the size of the minimum order and the size
+ * of the address space.
+ *
+ * order_size is the size of an order 0 buddy.
+ */
+struct gk20a_allocator {
+	struct vm_gk20a *vm;		/* Parent VM - can be NULL. */
+
+	char name[32];			/* Name of allocator. */
+
+	u64 base;			/* Base address of the space. */
+	u64 length;			/* Length of the space. */
+	u64 blk_size;			/* Size of order 0 allocation. */
+	u64 blk_shift;			/* Shift to divide by blk_size. */
+	int init;			/* Non-zero if initialized. */
+
+	/* Internal stuff. */
+	u64 start;			/* Real start (aligned to blk_size). */
+	u64 end;			/* Real end, trimmed if needed. */
+	u64 count;			/* Count of objects in space. */
+	u64 blks;			/* Count of blks in the space. */
+	u64 max_order;			/* Specific maximum order. */
+
+	struct rb_root alloced_buddies;	/* Outstanding allocations. */
+	struct rb_root fixed_allocs;	/* Outstanding fixed allocations. */
+
+	struct mutex lock;		/* Protects buddy access. */
+
+#define GPU_BALLOC_GVA_SPACE	0x1
+	u64 flags;
+
+	/*
+	 * Impose an upper bound on the maximum order.
+	 */
+#define GPU_BALLOC_MAX_ORDER		31
+#define GPU_BALLOC_ORDER_LIST_LEN	(GPU_BALLOC_MAX_ORDER + 1)
+
+	struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN];
+	u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN];
+	u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN];
+	u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN];
+
+	/*
+	 * This is for when the allocator is managing a GVA space (the
+	 * GPU_BALLOC_GVA_SPACE bit is set in @flags). This requires
+	 * that we group like sized allocations into PDE blocks.
+	 */
+	u64 pte_blk_order;
+
+	struct dentry *debugfs_entry;
+
+	u64 bytes_alloced;
+	u64 bytes_alloced_real;
+	u64 bytes_freed;
+};
+
+#define balloc_lock(a)		mutex_lock(&(a)->lock)
+#define balloc_unlock(a)	mutex_unlock(&(a)->lock)
+
+#define balloc_get_order_list(a, order)	(&(a)->buddy_list[(order)])
+#define balloc_order_to_len(a, order)	((1 << order) * (a)->blk_size)
+#define balloc_base_shift(a, base)	((base) - (a)->start)
+#define balloc_base_unshift(a, base)	((base) + (a)->start)
+
+int gk20a_allocator_init(struct gk20a_allocator *allocator,
+			 const char *name, u64 base, u64 size, u64 order0);
+int __gk20a_allocator_init(struct gk20a_allocator *allocator,
+			   struct vm_gk20a *vm, const char *name,
+			   u64 base, u64 size, u64 order0,
+			   u64 max_order, u64 flags);
 void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
 
-int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
-			u32 *addr, u32 len, u32 align);
+/*
+ * Normal alloc/free operations for the buddy allocator.
+ */
+u64  gk20a_balloc(struct gk20a_allocator *allocator, u64 len);
+void gk20a_bfree(struct gk20a_allocator *allocator, u64 addr);
 
-int gk20a_allocator_block_free(struct gk20a_allocator *allocator,
-			u32 addr, u32 len, u32 align);
+/*
+ * Special interface to allocate a memory regions with a specific starting
+ * address. Yikes.
+ */
+u64  gk20a_balloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len);
+
+/*
+ * Debugfs init.
+ */
+void gk20a_alloc_debugfs_init(struct platform_device *pdev);
 
 #if defined(ALLOCATOR_DEBUG)
-#define allocator_dbg(alloctor, format, arg...)			\
-do {								\
-	if (1)							\
-		pr_debug("gk20a_allocator (%s) %s: " format "\n",\
-			alloctor->name, __func__, ##arg);\
-} while (0)
-#else /* ALLOCATOR_DEBUG */
-#define allocator_dbg(format, arg...)
-#endif /* ALLOCATOR_DEBUG */
+#define balloc_dbg(alloctor, format, arg...)			\
+	pr_info("%-25s %25s() " format,				\
+		alloctor->name, __func__, ##arg)
+#else
+#define balloc_dbg(allocator, format, arg...)
+#endif
 
 #endif /* GK20A_ALLOCATOR_H */
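A hedged sketch of how a client might drive this interface, based only
on the declarations above (the range, names, and sizes are invented;
error handling abbreviated; the usual kernel SZ_* constants assumed).
Note that gk20a_balloc() reports failure by returning 0, so a managed
space is best based at a nonzero address:

	static int example_balloc_usage(void)
	{
		struct gk20a_allocator ex;
		u64 addr, fixed;
		int err;

		/* Manage [1 MiB, 1 MiB + 16 MiB) in 4 KiB order-0 blocks. */
		err = gk20a_allocator_init(&ex, "example", SZ_1M, SZ_16M, SZ_4K);
		if (err)
			return err;

		/* A 12 KiB request presumably rounds up to one order-2
		 * (16 KiB) buddy internally, as in any buddy scheme. */
		addr = gk20a_balloc(&ex, 3 * SZ_4K);

		/* Claim a region at a caller-chosen offset. */
		fixed = gk20a_balloc_fixed(&ex, SZ_1M + SZ_64K, SZ_64K);

		if (addr)
			gk20a_bfree(&ex, addr);
		if (fixed)
			gk20a_bfree(&ex, fixed);
		gk20a_allocator_destroy(&ex);
		return (addr && fixed) ? 0 : -ENOMEM;
	}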


@@ -89,9 +89,8 @@ static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	if (err)
 		return err;
 
-	gk20a_allocator_init(&gr->comp_tags, "comptag",
-			1, /* start */
-			max_comptag_lines - 1); /* length*/
+	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
+			       1, max_comptag_lines - 1, 1, 10, 0);
 
 	gr->comptags_per_cacheline = comptags_per_cacheline;
 	gr->slices_per_ltc = slices_per_fbp / g->ltc_count;


@@ -132,10 +132,8 @@ static void gk20a_mm_delete_priv(void *_priv)
 	if (priv->comptags.lines) {
 		BUG_ON(!priv->comptag_allocator);
-		priv->comptag_allocator->free(priv->comptag_allocator,
-					      priv->comptags.offset,
-					      priv->comptags.allocated_lines,
-					      1);
+		gk20a_bfree(priv->comptag_allocator,
+			    priv->comptags.real_offset);
 	}
 
 	/* Free buffer states */
@@ -224,10 +222,9 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 				u32 lines, bool user_mappable)
 {
 	struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
-	u32 offset = 0;
-	int err;
 	u32 ctaglines_to_allocate;
-	u32 ctagline_align;
+	u32 ctagline_align = 1;
+	u32 offset;
 	const u32 aggregate_cacheline_sz =
 		g->gr.cacheline_size * g->gr.slices_per_ltc *
 		g->ltc_count;
@@ -241,7 +238,6 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 	if (!user_mappable) {
 		ctaglines_to_allocate = lines;
-		ctagline_align = 1;
 	} else {
 		/* Unfortunately, we cannot use allocation alignment
 		 * here, since compbits per cacheline is not always a
@@ -273,72 +269,26 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 		if (ctaglines_to_allocate < lines)
 			return -EINVAL; /* integer overflow */
 
 		pr_info("user-mapped CTAGS: %u\n", ctaglines_to_allocate);
 	}
 
 	/* store the allocator so we can use it when we free the ctags */
	priv->comptag_allocator = allocator;
-	err = allocator->alloc(allocator, &offset,
-			       ctaglines_to_allocate, 1);
-	if (!err) {
-		const u32 alignment_lines =
-			DIV_ROUND_UP(offset, ctagline_align) * ctagline_align -
-			offset;
+	offset = gk20a_balloc(allocator, ctaglines_to_allocate);
+	if (!offset)
+		return -ENOMEM;
 
-		/* prune the preceding ctaglines that were allocated
-		   for alignment */
-		if (alignment_lines) {
-			/* free alignment lines */
-			int tmp=
-				allocator->free(allocator, offset,
-					alignment_lines,
-					1);
-			WARN_ON(tmp);
+	priv->comptags.lines = lines;
+	priv->comptags.real_offset = offset;
 
-			offset += alignment_lines;
-			ctaglines_to_allocate -= alignment_lines;
-		}
+	if (user_mappable)
+		offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align;
 
-		/* check if we can prune the trailing, too */
-		if (user_mappable)
-		{
-			u32 needed_cachelines =
-				DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline);
+	priv->comptags.offset = offset;
 
-			u32 first_unneeded_cacheline =
-				DIV_ROUND_UP(round_up(needed_cachelines *
-						      aggregate_cacheline_sz,
-						      small_pgsz),
-					     aggregate_cacheline_sz);
-			u32 needed_ctaglines =
-				first_unneeded_cacheline *
-				g->gr.comptags_per_cacheline;
-
-			if (needed_ctaglines < ctaglines_to_allocate) {
-				/* free alignment lines */
-				int tmp=
-					allocator->free(
-						allocator,
-						offset + needed_ctaglines,
-						(ctaglines_to_allocate -
-						 needed_ctaglines),
-						1);
-				WARN_ON(tmp);
-
-				ctaglines_to_allocate = needed_ctaglines;
-			}
-		}
-
-		priv->comptags.offset = offset;
-		priv->comptags.lines = lines;
-		priv->comptags.allocated_lines = ctaglines_to_allocate;
-		priv->comptags.user_mappable = user_mappable;
-	}
-	return err;
+	return 0;
 }
 
 static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
 {
 	gk20a_dbg_fn("");
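The new user-mappable comptag path above over-allocates, rounds the
offset it hands out up to ctagline_align, and keeps the unrounded value
in real_offset so the free path can return the whole block. The
rounding is plain DIV_ROUND_UP(offset, align) * align; a standalone
illustration with made-up numbers:

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned int real_offset = 1001; /* hypothetical balloc result */
		unsigned int align = 128;        /* hypothetical ctagline_align */
		unsigned int offset = DIV_ROUND_UP(real_offset, align) * align;

		/* real_offset=1001 -> offset=1024; the free must use 1001. */
		printf("offset=%u real_offset=%u\n", offset, real_offset);
		return 0;
	}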
@@ -889,14 +839,12 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
 }
 
 u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
-		u64 size,
-		enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
+		      u64 size,
+		      enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
 {
 	struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
-	int err;
 	u64 offset;
-	u32 start_page_nr = 0, num_pages;
 	u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
 
 	if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) {
@@ -912,28 +860,19 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
 	}
 
-	/* be certain we round up to gmmu_page_size if needed */
-	/* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
+	/* Be certain we round up to gmmu_page_size if needed */
 	size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
 	gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
 			vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10);
 
-	/* The vma allocator represents page accounting. */
-	num_pages = size >> ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
-
-	err = vma->alloc(vma, &start_page_nr, num_pages, 1);
-	if (err) {
+	offset = gk20a_balloc(vma, size);
+	if (!offset) {
 		gk20a_err(dev_from_vm(vm),
-			"%s oom: sz=0x%llx", vma->name, size);
+			  "%s oom: sz=0x%llx", vma->name, size);
 		return 0;
 	}
 
-	offset = (u64)start_page_nr <<
-		 ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
 	gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
 	return offset;
 }
@@ -942,25 +881,12 @@ int gk20a_vm_free_va(struct vm_gk20a *vm,
 		     enum gmmu_pgsz_gk20a pgsz_idx)
 {
 	struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
-	u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
-	u32 page_shift = ilog2(page_size);
-	u32 start_page_nr, num_pages;
-	int err;
 
 	gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
 			vma->name, offset, size);
+	gk20a_bfree(vma, offset);
 
-	start_page_nr = (u32)(offset >> page_shift);
-	num_pages = (u32)((size + page_size - 1) >> page_shift);
-
-	err = vma->free(vma, start_page_nr, num_pages, 1);
-	if (err) {
-		gk20a_err(dev_from_vm(vm),
-			"not found: offset=0x%llx, sz=0x%llx",
-			offset, size);
-	}
-
-	return err;
+	return 0;
 }
 
 static int insert_mapped_buffer(struct rb_root *root,
@@ -1136,7 +1062,7 @@ static int validate_fixed_buffer(struct vm_gk20a *vm,
 	if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) {
 		gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
-			map_offset);
+			  map_offset);
 		return -EINVAL;
 	}
@@ -2433,7 +2359,6 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 		  char *name)
 {
 	int err, i;
-	u32 num_small_pages, num_large_pages, low_hole_pages;
 	char alloc_name[32];
 	u64 small_vma_size, large_vma_size;
 	u32 pde_lo, pde_hi;
@@ -2494,34 +2419,31 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 		large_vma_size = vm->va_limit - small_vma_size;
 	}
 
-	num_small_pages = (u32)(small_vma_size >>
-		    ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
-
-	/* num_pages above is without regard to the low-side hole. */
-	low_hole_pages = (vm->va_start >>
-		    ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
-
 	snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name,
 		 vm->gmmu_page_sizes[gmmu_page_size_small]>>10);
-	err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
-			alloc_name,
-			low_hole_pages, /*start*/
-			num_small_pages - low_hole_pages);/* length*/
+	err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
+				     vm, alloc_name,
+				     vm->va_start,
+				     small_vma_size - vm->va_start,
+				     SZ_4K,
+				     GPU_BALLOC_MAX_ORDER,
+				     GPU_BALLOC_GVA_SPACE);
 	if (err)
 		goto clean_up_ptes;
 
 	if (big_pages) {
-		u32 start = (u32)(small_vma_size >>
-			    ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
-		num_large_pages = (u32)(large_vma_size >>
-			    ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
-
 		snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB",
 			 name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10);
-		err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
-				alloc_name,
-				start, /* start */
-				num_large_pages); /* length */
+		/*
+		 * Big page VMA starts at the end of the small page VMA.
+		 */
+		err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
+					     vm, alloc_name,
+					     small_vma_size,
+					     large_vma_size,
+					     big_page_size,
+					     GPU_BALLOC_MAX_ORDER,
+					     GPU_BALLOC_GVA_SPACE);
 		if (err)
 			goto clean_up_small_allocator;
 	}
@@ -2602,9 +2524,9 @@ int gk20a_vm_release_share(struct gk20a_as_share *as_share)
 int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
 			 struct nvgpu_as_alloc_space_args *args)
-{ int err = -ENOMEM;
+{
+	int err = -ENOMEM;
 	int pgsz_idx = gmmu_page_size_small;
-	u32 start_page_nr;
 	struct gk20a_allocator *vma;
 	struct vm_gk20a *vm = as_share->vm;
 	struct gk20a *g = vm->mm->g;
@@ -2635,21 +2557,19 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
 		goto clean_up;
 	}
 
-	start_page_nr = 0;
-	if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
-		start_page_nr = (u32)(args->o_a.offset >>
-			ilog2(vm->gmmu_page_sizes[pgsz_idx]));
-
 	vma = &vm->vma[pgsz_idx];
-	err = vma->alloc(vma, &start_page_nr, args->pages, 1);
-	if (err) {
+	if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
+		vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset,
+						 (u64)args->pages *
+						 (u64)args->page_size);
+	else
+		vaddr_start = gk20a_balloc(vma, args->pages * args->page_size);
+
+	if (!vaddr_start) {
 		kfree(va_node);
 		goto clean_up;
 	}
 
-	vaddr_start = (u64)start_page_nr <<
-		      ilog2(vm->gmmu_page_sizes[pgsz_idx]);
-
 	va_node->vaddr_start = vaddr_start;
 	va_node->size = (u64)args->page_size * (u64)args->pages;
 	va_node->pgsz_idx = pgsz_idx;
@@ -2673,7 +2593,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
 					 true);
 	if (!map_offset) {
 		mutex_unlock(&vm->update_gmmu_lock);
-		vma->free(vma, start_page_nr, args->pages, 1);
+		gk20a_bfree(vma, vaddr_start);
 		kfree(va_node);
 		goto clean_up;
 	}
@@ -2685,6 +2605,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
 	mutex_unlock(&vm->update_gmmu_lock);
 
 	args->o_a.offset = vaddr_start;
+	err = 0;
 
 clean_up:
 	return err;
@@ -2695,7 +2616,6 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
 {
 	int err = -ENOMEM;
 	int pgsz_idx;
-	u32 start_page_nr;
 	struct gk20a_allocator *vma;
 	struct vm_gk20a *vm = as_share->vm;
 	struct vm_reserved_va_node *va_node;
@@ -2708,14 +2628,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
 	pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ?
 			gmmu_page_size_big : gmmu_page_size_small;
 
-	start_page_nr = (u32)(args->offset >>
-			ilog2(vm->gmmu_page_sizes[pgsz_idx]));
-
 	vma = &vm->vma[pgsz_idx];
-	err = vma->free(vma, start_page_nr, args->pages, 1);
-
-	if (err)
-		goto clean_up;
+	gk20a_bfree(vma, args->offset);
 
 	mutex_lock(&vm->update_gmmu_lock);
 	va_node = addr_to_reservation(vm, args->offset);
@@ -2745,8 +2659,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
 		kfree(va_node);
 	}
 	mutex_unlock(&vm->update_gmmu_lock);
+	err = 0;
 
 clean_up:
 	return err;
 }
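gk20a_vm_alloc_va() still rounds the requested size up to the GMMU page
size with mask arithmetic before handing it to gk20a_balloc(); the
allocator then does its own rounding to a buddy order. The mask form
works for any power-of-two page size (values below are made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long size = 0x12345; /* requested bytes */
		unsigned long long pgsz = 0x10000; /* 64 KiB big page */

		/* Round size up to the next pgsz multiple. */
		size = (size + (pgsz - 1)) & ~(pgsz - 1);

		printf("rounded size = 0x%llx\n", size); /* 0x20000 */
		return 0;
	}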


@@ -131,6 +131,7 @@ enum gmmu_pgsz_gk20a {
 };
 
 struct gk20a_comptags {
+	u32 real_offset;
 	u32 offset;
 	u32 lines;
 	u32 allocated_lines;


@@ -2438,7 +2438,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
 	struct pmu_payload payload;
 	u32 seq;
 	u32 data;
-	int err = 0;
 
 	gk20a_dbg_fn("");
@@ -2489,12 +2488,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
 	gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
 
 	if (!pmu->sample_buffer)
-		err = pmu->dmem.alloc(&pmu->dmem,
-				      &pmu->sample_buffer, 2 * sizeof(u16),
-				      PMU_DMEM_ALLOC_ALIGNMENT);
-	if (err) {
+		pmu->sample_buffer = gk20a_balloc(&pmu->dmem,
+						  2 * sizeof(u16));
+	if (!pmu->sample_buffer) {
 		gk20a_err(dev_from_gk20a(g),
-			"failed to allocate perfmon sample buffer");
+			  "failed to allocate perfmon sample buffer");
 		return -ENOMEM;
 	}
@@ -2592,15 +2590,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu,
 	for (i = 0; i < PMU_QUEUE_COUNT; i++)
 		pmu_queue_init(pmu, i, init);
 
-	if (!pmu->dmem.alloc) {
-		/*Align start and end addresses*/
+	if (!pmu->dmem.init) {
+		/* Align start and end addresses */
 		u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+				  PMU_DMEM_ALLOC_ALIGNMENT);
 		u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) +
-			pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
+			   pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
 			~(PMU_DMEM_ALLOC_ALIGNMENT - 1);
 		u32 size = end - start;
-		gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size);
+		__gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem",
+				       start, size,
+				       PMU_DMEM_ALLOC_ALIGNMENT, 4, 0);
 	}
 
 	pmu->pmu_ready = true;
@@ -2737,20 +2737,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu,
 		seq->callback = NULL;
 	if (pv->pmu_allocation_get_dmem_size(pmu,
 			pv->get_pmu_seq_in_a_ptr(seq)) != 0)
-		pmu->dmem.free(&pmu->dmem,
+		gk20a_bfree(&pmu->dmem,
 			pv->pmu_allocation_get_dmem_offset(pmu,
-				pv->get_pmu_seq_in_a_ptr(seq)),
-			pv->pmu_allocation_get_dmem_size(pmu,
-				pv->get_pmu_seq_in_a_ptr(seq)),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+				pv->get_pmu_seq_in_a_ptr(seq)));
 	if (pv->pmu_allocation_get_dmem_size(pmu,
 			pv->get_pmu_seq_out_a_ptr(seq)) != 0)
-		pmu->dmem.free(&pmu->dmem,
+		gk20a_bfree(&pmu->dmem,
 			pv->pmu_allocation_get_dmem_offset(pmu,
-				pv->get_pmu_seq_out_a_ptr(seq)),
-			pv->pmu_allocation_get_dmem_size(pmu,
-				pv->get_pmu_seq_out_a_ptr(seq)),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+				pv->get_pmu_seq_out_a_ptr(seq)));
 
 	if (seq->callback)
 		seq->callback(g, msg, seq->cb_params, seq->desc, ret);
@@ -3387,11 +3381,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 		pv->pmu_allocation_set_dmem_size(pmu, in,
 		(u16)max(payload->in.size, payload->out.size));
 
-		err = pmu->dmem.alloc(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset_addr(pmu, in),
-			pv->pmu_allocation_get_dmem_size(pmu, in),
-			PMU_DMEM_ALLOC_ALIGNMENT);
-		if (err)
+		*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) =
+			gk20a_balloc(&pmu->dmem,
+				     pv->pmu_allocation_get_dmem_size(pmu, in));
+		if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)))
 			goto clean_up;
 
 		pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
@@ -3412,11 +3405,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 			(u16)payload->out.size);
 
 		if (payload->out.buf != payload->in.buf) {
-			err = pmu->dmem.alloc(&pmu->dmem,
-				pv->pmu_allocation_get_dmem_offset_addr(pmu, out),
-				pv->pmu_allocation_get_dmem_size(pmu, out),
-				PMU_DMEM_ALLOC_ALIGNMENT);
-			if (err)
+			*(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) =
+				gk20a_balloc(&pmu->dmem,
+					     pv->pmu_allocation_get_dmem_size(pmu, out));
+			if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu,
+					out)))
 				goto clean_up;
 		} else {
 			BUG_ON(in == NULL);
@@ -3444,15 +3438,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 clean_up:
 	gk20a_dbg_fn("fail");
 	if (in)
-		pmu->dmem.free(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset(pmu, in),
-			pv->pmu_allocation_get_dmem_size(pmu, in),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+		gk20a_bfree(&pmu->dmem,
+			pv->pmu_allocation_get_dmem_offset(pmu, in));
 	if (out)
-		pmu->dmem.free(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset(pmu, out),
-			pv->pmu_allocation_get_dmem_size(pmu, out),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+		gk20a_bfree(&pmu->dmem,
+			pv->pmu_allocation_get_dmem_offset(pmu, out));
 
 	pmu_seq_release(pmu, seq);
 	return err;
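If I read the new __gk20a_allocator_init() parameters right, the PMU
DMEM space now uses order-0 blocks of PMU_DMEM_ALLOC_ALIGNMENT (4)
bytes with max_order 4, so individual buddies run from 4 to 64 bytes
and alignment falls out of the block size rather than being passed per
call. A quick sketch of that order-to-length mapping (mirroring the
balloc_order_to_len() macro in the header):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long blk_size = 4; /* order0 for the DMEM space */
		unsigned int order, max_order = 4;

		for (order = 0; order <= max_order; order++)
			printf("order %u buddy = %llu bytes\n",
			       order, (1ULL << order) * blk_size);
		return 0;
	}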


@@ -3,7 +3,7 @@
  *
  * GK20A PMU (aka. gPMU outside gk20a context)
  *
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -437,7 +437,7 @@ struct pmu_ucode_desc {
 #define PMU_UNIT_ID_IS_VALID(id) \
 	(((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START))
 
-#define PMU_DMEM_ALLOC_ALIGNMENT	(32)
+#define PMU_DMEM_ALLOC_ALIGNMENT	(4)
 #define PMU_DMEM_ALIGNMENT		(4)
 
 #define PMU_CMD_FLAGS_PMU_MASK		(0xF0)


@@ -3,7 +3,7 @@
  *
  * GK20A Semaphores
  *
- * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -44,8 +44,10 @@ struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct device *d,
 	if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size))
 		goto clean_up;
 
-	if (gk20a_allocator_init(&p->alloc, unique_name, 0,
-				p->size))
+	/* Sacrifice one semaphore in the name of returning error codes. */
+	if (gk20a_allocator_init(&p->alloc, unique_name,
+				 SEMAPHORE_SIZE, p->size - SEMAPHORE_SIZE,
+				 SEMAPHORE_SIZE))
 		goto clean_up;
 
 	gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va,
@@ -163,8 +165,8 @@ struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool)
 	if (!s)
 		return NULL;
 
-	if (pool->alloc.alloc(&pool->alloc, &s->offset, SEMAPHORE_SIZE,
-			      SEMAPHORE_SIZE)) {
+	s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE);
+	if (!s->offset) {
 		gk20a_err(pool->dev, "failed to allocate semaphore");
 		kfree(s);
 		return NULL;
@@ -186,8 +188,7 @@ static void gk20a_semaphore_free(struct kref *ref)
 	struct gk20a_semaphore *s =
 		container_of(ref, struct gk20a_semaphore, ref);
 
-	s->pool->alloc.free(&s->pool->alloc, s->offset, SEMAPHORE_SIZE,
-			    SEMAPHORE_SIZE);
+	gk20a_bfree(&s->pool->alloc, s->offset);
 
 	gk20a_semaphore_pool_put(s->pool);
 	kfree(s);
 }
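The "sacrifice one semaphore" comment follows from the new return
convention: gk20a_balloc() signals failure by returning 0, so a pool
whose space began at offset 0 could not tell its first semaphore apart
from an allocation failure. Basing the space at SEMAPHORE_SIZE keeps
every valid offset nonzero; a toy illustration (the real SEMAPHORE_SIZE
value lives in the driver):

	#include <stdio.h>

	#define SEMAPHORE_SIZE 16 /* hypothetical value */

	int main(void)
	{
		/* Space is [SEMAPHORE_SIZE, pool_size), so the smallest
		 * offset gk20a_balloc() can return is SEMAPHORE_SIZE. */
		unsigned int first_valid = SEMAPHORE_SIZE;

		printf("failure sentinel = 0, first valid offset = %u\n",
		       first_valid);
		return 0;
	}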


@@ -90,9 +90,8 @@ static int gm20b_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	if (err)
 		return err;
 
-	gk20a_allocator_init(&gr->comp_tags, "comptag",
-			1, /* start */
-			max_comptag_lines - 1); /* length*/
+	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
+			       1, max_comptag_lines - 1, 1, 10, 0);
 
 	gr->comptags_per_cacheline = comptags_per_cacheline;
 	gr->slices_per_ltc = slices_per_ltc;


@@ -41,9 +41,8 @@ static int vgpu_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	if (max_comptag_lines < 2)
 		return -ENXIO;
 
-	gk20a_allocator_init(&gr->comp_tags, "comptag",
-			1, /* start */
-			max_comptag_lines - 1); /* length*/
+	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
+			       1, max_comptag_lines - 1, 1, 10, 0);
 
 	return 0;
 }


@@ -243,11 +243,9 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share,
 	struct tegra_vgpu_as_share_params *p = &msg.params.as_share;
 	struct mm_gk20a *mm = &g->mm;
 	struct vm_gk20a *vm;
-	u32 num_small_pages, num_large_pages, low_hole_pages;
 	u64 small_vma_size, large_vma_size;
 	char name[32];
 	int err, i;
-	u32 start;
 
 	/* note: keep the page sizes sorted lowest to highest here */
 	u32 gmmu_page_sizes[gmmu_nr_page_sizes] = {
@@ -294,33 +292,27 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share,
 	small_vma_size = (u64)16 << 30;
 	large_vma_size = vm->va_limit - small_vma_size;
 
-	num_small_pages = (u32)(small_vma_size >>
-		    ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
-
-	/* num_pages above is without regard to the low-side hole. */
-	low_hole_pages = (vm->va_start >>
-		    ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
-
 	snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
 		 gmmu_page_sizes[gmmu_page_size_small]>>10);
-	err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
-			name,
-			low_hole_pages, /*start*/
-			num_small_pages - low_hole_pages);/* length*/
+	err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
+				     vm, name,
+				     vm->va_start,
+				     small_vma_size - vm->va_start,
+				     SZ_4K,
+				     GPU_BALLOC_MAX_ORDER,
+				     GPU_BALLOC_GVA_SPACE);
 	if (err)
 		goto clean_up_share;
 
-	start = (u32)(small_vma_size >>
-		      ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
-	num_large_pages = (u32)(large_vma_size >>
-		    ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
-
 	snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
 		 gmmu_page_sizes[gmmu_page_size_big]>>10);
-	err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
-			name,
-			start, /* start */
-			num_large_pages); /* length */
+	err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
+				     vm, name,
+				     small_vma_size,
+				     large_vma_size,
+				     big_page_size,
+				     GPU_BALLOC_MAX_ORDER,
+				     GPU_BALLOC_GVA_SPACE);
 	if (err)
 		goto clean_up_small_allocator;