Revert "Revert "gpu: nvgpu: New allocator for VA space""

This reverts commit 7eb42bc239dbd207208ff491c3fb65c3d83274d8.

The original commit was actually fine.

Change-Id: I564ce6530ac73fcfad17dcec9c53f0353b4f02d4
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/743300
(cherry picked from commit e99aa2485f8992eabe3556f3ebcb57bdc8ad91ff)
Reviewed-on: http://git-master/r/743301
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Author: Alex Waterman
Date: 2015-05-15 09:13:28 -07:00
Committer: Terje Bergstrom
Parent: 7ddd6e261e
Commit: 01f359f3f1

13 changed files with 1406 additions and 374 deletions


@@ -199,21 +199,14 @@ static int gk20a_as_ioctl_get_va_regions(
for (i = 0; i < write_entries; ++i) {
struct nvgpu_as_va_region region;
u32 base, limit;
memset(&region, 0, sizeof(struct nvgpu_as_va_region));
if (!vm->vma[i].constraint.enable) {
base = vm->vma[i].base;
limit = vm->vma[i].limit;
} else {
base = vm->vma[i].constraint.base;
limit = vm->vma[i].constraint.limit;
}
region.page_size = vm->gmmu_page_sizes[i];
region.offset = (u64)base * region.page_size;
region.pages = limit - base; /* NOTE: limit is exclusive */
region.offset = vm->vma[i].base;
/* No __aeabi_uldivmod() on some platforms... */
region.pages = (vm->vma[i].end - vm->vma[i].start) >>
ilog2(region.page_size);
if (copy_to_user(user_region_ptr + i, &region, sizeof(region)))
return -EFAULT;


@@ -59,6 +59,7 @@
#include "hw_fb_gk20a.h"
#include "gk20a_scale.h"
#include "dbg_gpu_gk20a.h"
#include "gk20a_allocator.h"
#include "hal.h"
#include "vgpu/vgpu.h"
@@ -1532,6 +1533,7 @@ static int gk20a_probe(struct platform_device *dev)
gr_gk20a_debugfs_init(gk20a);
gk20a_pmu_debugfs_init(dev);
gk20a_cde_debugfs_init(dev);
gk20a_alloc_debugfs_init(dev);
#endif
gk20a_init_gr(gk20a);


File diff suppressed because it is too large.


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -17,75 +17,190 @@
#ifndef GK20A_ALLOCATOR_H
#define GK20A_ALLOCATOR_H
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/platform_device.h>
/* #define ALLOCATOR_DEBUG */
/* main struct */
struct gk20a_allocator {
/*
* Each buddy is an element in a binary tree.
*/
struct gk20a_buddy {
struct gk20a_buddy *parent; /* Parent node. */
struct gk20a_buddy *buddy; /* This node's buddy. */
struct gk20a_buddy *left; /* Lower address sub-node. */
struct gk20a_buddy *right; /* Higher address sub-node. */
char name[32]; /* name for allocator */
struct rb_root rb_root; /* rb tree root for blocks */
struct list_head buddy_entry; /* List entry for various lists. */
struct rb_node alloced_entry; /* RB tree of allocations. */
u32 base; /* min value of this linear space */
u32 limit; /* max value = limit - 1 */
u64 start; /* Start address of this buddy. */
u64 end; /* End address of this buddy. */
u64 order; /* Buddy order. */
unsigned long *bitmap; /* bitmap */
struct gk20a_alloc_block *block_first; /* first block in list */
struct gk20a_alloc_block *block_recent; /* last visited block */
u32 first_free_addr; /* first free addr, non-contigous
allocation preferred start,
in order to pick up small holes */
u32 last_free_addr; /* last free addr, contiguous
allocation preferred start */
u32 cached_hole_size; /* max free hole size up to
last_free_addr */
u32 block_count; /* number of blocks */
struct rw_semaphore rw_sema; /* lock */
struct kmem_cache *block_cache; /* slab cache */
/* if enabled, constrain to [base, limit) */
struct {
bool enable;
u32 base;
u32 limit;
} constraint;
int (*alloc)(struct gk20a_allocator *allocator,
u32 *addr, u32 len, u32 align);
int (*free)(struct gk20a_allocator *allocator,
u32 addr, u32 len, u32 align);
#define BALLOC_BUDDY_ALLOCED 0x1
#define BALLOC_BUDDY_SPLIT 0x2
#define BALLOC_BUDDY_IN_LIST 0x4
int flags; /* List of associated flags. */
/*
* Size of the PDE this buddy is using. This allows for grouping like
* sized allocations into the same PDE.
*/
#define BALLOC_PTE_SIZE_ANY 0x0
#define BALLOC_PTE_SIZE_SMALL 0x1
#define BALLOC_PTE_SIZE_BIG 0x2
int pte_size;
};
int gk20a_allocator_init(struct gk20a_allocator *allocator,
const char *name, u32 base, u32 size);
#define __buddy_flag_ops(flag, flag_up) \
static inline int buddy_is_ ## flag(struct gk20a_buddy *b) \
{ \
return b->flags & BALLOC_BUDDY_ ## flag_up; \
} \
static inline void buddy_set_ ## flag(struct gk20a_buddy *b) \
{ \
b->flags |= BALLOC_BUDDY_ ## flag_up; \
} \
static inline void buddy_clr_ ## flag(struct gk20a_buddy *b) \
{ \
b->flags &= ~BALLOC_BUDDY_ ## flag_up; \
}
/*
* int buddy_is_alloced(struct gk20a_buddy *b);
* void buddy_set_alloced(struct gk20a_buddy *b);
* void buddy_clr_alloced(struct gk20a_buddy *b);
*
* int buddy_is_split(struct gk20a_buddy *b);
* void buddy_set_split(struct gk20a_buddy *b);
* void buddy_clr_split(struct gk20a_buddy *b);
*
* int buddy_is_in_list(struct gk20a_buddy *b);
* void buddy_set_in_list(struct gk20a_buddy *b);
* void buddy_clr_in_list(struct gk20a_buddy *b);
*/
__buddy_flag_ops(alloced, ALLOCED);
__buddy_flag_ops(split, SPLIT);
__buddy_flag_ops(in_list, IN_LIST);
/*
* Keeps info for a fixed allocation.
*/
struct gk20a_fixed_alloc {
struct list_head buddies; /* List of buddies. */
struct rb_node alloced_entry; /* RB tree of fixed allocations. */
u64 start; /* Start of fixed block. */
u64 end; /* End address. */
};
struct vm_gk20a;
/*
* GPU buddy allocator for the various GPU address spaces. Each addressable unit
* doesn't have to correspond to a byte. In some cases each unit is a more
* complex object such as a comp_tag line or the like.
*
* The max order is computed based on the size of the minimum order and the size
* of the address space.
*
* order_size is the size of an order 0 buddy.
*/
struct gk20a_allocator {
struct vm_gk20a *vm; /* Parent VM - can be NULL. */
char name[32]; /* Name of allocator. */
u64 base; /* Base address of the space. */
u64 length; /* Length of the space. */
u64 blk_size; /* Size of order 0 allocation. */
u64 blk_shift; /* Shift to divide by blk_size. */
int init; /* Non-zero if initialized. */
/* Internal stuff. */
u64 start; /* Real start (aligned to blk_size). */
u64 end; /* Real end, trimmed if needed. */
u64 count; /* Count of objects in space. */
u64 blks; /* Count of blks in the space. */
u64 max_order; /* Specific maximum order. */
struct rb_root alloced_buddies; /* Outstanding allocations. */
struct rb_root fixed_allocs; /* Outstanding fixed allocations. */
struct mutex lock; /* Protects buddy access. */
#define GPU_BALLOC_GVA_SPACE 0x1
u64 flags;
/*
* Impose an upper bound on the maximum order.
*/
#define GPU_BALLOC_MAX_ORDER 31
#define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1)
struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN];
u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN];
u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN];
u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN];
/*
* This is for when the allocator is managing a GVA space (the
* GPU_BALLOC_GVA_SPACE bit is set in @flags). This requires
* that we group like sized allocations into PDE blocks.
*/
u64 pte_blk_order;
struct dentry *debugfs_entry;
u64 bytes_alloced;
u64 bytes_alloced_real;
u64 bytes_freed;
};
#define balloc_lock(a) mutex_lock(&(a)->lock)
#define balloc_unlock(a) mutex_unlock(&(a)->lock)
#define balloc_get_order_list(a, order) (&(a)->buddy_list[(order)])
#define balloc_order_to_len(a, order) ((1 << order) * (a)->blk_size)
#define balloc_base_shift(a, base) ((base) - (a)->start)
#define balloc_base_unshift(a, base) ((base) + (a)->start)
int gk20a_allocator_init(struct gk20a_allocator *allocator,
const char *name, u64 base, u64 size, u64 order0);
int __gk20a_allocator_init(struct gk20a_allocator *allocator,
struct vm_gk20a *vm, const char *name,
u64 base, u64 size, u64 order0,
u64 max_order, u64 flags);
void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
u32 *addr, u32 len, u32 align);
/*
* Normal alloc/free operations for the buddy allocator.
*/
u64 gk20a_balloc(struct gk20a_allocator *allocator, u64 len);
void gk20a_bfree(struct gk20a_allocator *allocator, u64 addr);
int gk20a_allocator_block_free(struct gk20a_allocator *allocator,
u32 addr, u32 len, u32 align);
/*
* Special interface to allocate a memory regions with a specific starting
* address. Yikes.
*/
u64 gk20a_balloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len);
/*
* Debugfs init.
*/
void gk20a_alloc_debugfs_init(struct platform_device *pdev);
#if defined(ALLOCATOR_DEBUG)
#define allocator_dbg(alloctor, format, arg...) \
do { \
if (1) \
pr_debug("gk20a_allocator (%s) %s: " format "\n",\
alloctor->name, __func__, ##arg);\
} while (0)
#else /* ALLOCATOR_DEBUG */
#define allocator_dbg(format, arg...)
#endif /* ALLOCATOR_DEBUG */
#define balloc_dbg(alloctor, format, arg...) \
pr_info("%-25s %25s() " format, \
alloctor->name, __func__, ##arg)
#else
#define balloc_dbg(allocator, format, arg...)
#endif
#endif /* GK20A_ALLOCATOR_H */
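Taken together, the prototypes above imply roughly the following lifecycle for the new interface. This is a hedged sketch written only against the declarations in this header; the example function itself and the size constants (SZ_4K, SZ_1G, SZ_1K from linux/sizes.h) are illustrative, and real callers live inside the driver build:

#include <linux/types.h>
#include <linux/sizes.h>
#include <linux/errno.h>

#include "gk20a_allocator.h"

static int balloc_example(void)
{
        struct gk20a_allocator ex;
        u64 addr;
        int err;

        /* Manage a made-up 1 GiB space starting at 4 KiB, 4 KiB order-0 blocks. */
        err = gk20a_allocator_init(&ex, "example", SZ_4K, SZ_1G, SZ_4K);
        if (err)
                return err;

        /* gk20a_balloc() returns an address, or 0 on failure. */
        addr = gk20a_balloc(&ex, 64 * SZ_1K);
        if (!addr) {
                gk20a_allocator_destroy(&ex);
                return -ENOMEM;
        }

        gk20a_bfree(&ex, addr);
        gk20a_allocator_destroy(&ex);
        return 0;
}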


@@ -89,9 +89,8 @@ static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
if (err)
return err;
gk20a_allocator_init(&gr->comp_tags, "comptag",
1, /* start */
max_comptag_lines - 1); /* length*/
__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
1, max_comptag_lines - 1, 1, 10, 0);
gr->comptags_per_cacheline = comptags_per_cacheline;
gr->slices_per_ltc = slices_per_fbp / g->ltc_count;
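Read against the __gk20a_allocator_init() prototype in gk20a_allocator.h (allocator, vm, name, base, size, order0, max_order, flags), the comptag call above maps as follows; the per-argument notes are this annotation's reading, not comments from the source:

__gk20a_allocator_init(&gr->comp_tags,        /* allocator                        */
                       NULL,                  /* vm: not backing a GVA space      */
                       "comptag",             /* name                             */
                       1,                     /* base                             */
                       max_comptag_lines - 1, /* size, in comptag lines           */
                       1,                     /* order0: one line per block       */
                       10,                    /* max_order                        */
                       0);                    /* flags: GPU_BALLOC_GVA_SPACE off  */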


@@ -132,10 +132,8 @@ static void gk20a_mm_delete_priv(void *_priv)
if (priv->comptags.lines) {
BUG_ON(!priv->comptag_allocator);
priv->comptag_allocator->free(priv->comptag_allocator,
priv->comptags.offset,
priv->comptags.allocated_lines,
1);
gk20a_bfree(priv->comptag_allocator,
priv->comptags.real_offset);
}
/* Free buffer states */
@@ -226,10 +224,9 @@ static int gk20a_alloc_comptags(struct gk20a *g,
u32 *ctag_map_win_ctagline)
{
struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
u32 offset = 0;
int err;
u32 ctaglines_to_allocate;
u32 ctagline_align;
u32 ctagline_align = 1;
u32 offset;
const u32 aggregate_cacheline_sz =
g->gr.cacheline_size * g->gr.slices_per_ltc *
g->ltc_count;
@@ -243,7 +240,6 @@ static int gk20a_alloc_comptags(struct gk20a *g,
if (!user_mappable) {
ctaglines_to_allocate = lines;
ctagline_align = 1;
} else {
/* Unfortunately, we cannot use allocation alignment
* here, since compbits per cacheline is not always a
@@ -275,82 +271,26 @@ static int gk20a_alloc_comptags(struct gk20a *g,
if (ctaglines_to_allocate < lines)
return -EINVAL; /* integer overflow */
pr_info("user-mapped CTAGS: %u\n", ctaglines_to_allocate);
}
/* store the allocator so we can use it when we free the ctags */
priv->comptag_allocator = allocator;
err = allocator->alloc(allocator, &offset,
ctaglines_to_allocate, 1);
if (!err) {
const u32 alignment_lines =
DIV_ROUND_UP(offset, ctagline_align) * ctagline_align -
offset;
offset = gk20a_balloc(allocator, ctaglines_to_allocate);
if (!offset)
return -ENOMEM;
/* prune the preceding ctaglines that were allocated
for alignment */
if (alignment_lines) {
/* free alignment lines */
int tmp=
allocator->free(allocator, offset,
alignment_lines,
1);
WARN_ON(tmp);
priv->comptags.lines = lines;
priv->comptags.real_offset = offset;
offset += alignment_lines;
ctaglines_to_allocate -= alignment_lines;
}
if (user_mappable)
offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align;
/* check if we can prune the trailing, too */
if (user_mappable)
{
u32 needed_cachelines =
DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline);
priv->comptags.offset = offset;
u32 first_unneeded_cacheline =
DIV_ROUND_UP(round_up(needed_cachelines *
aggregate_cacheline_sz,
small_pgsz),
aggregate_cacheline_sz);
u32 needed_ctaglines =
first_unneeded_cacheline *
g->gr.comptags_per_cacheline;
u64 win_size;
if (needed_ctaglines < ctaglines_to_allocate) {
/* free alignment lines */
int tmp=
allocator->free(
allocator,
offset + needed_ctaglines,
(ctaglines_to_allocate -
needed_ctaglines),
1);
WARN_ON(tmp);
ctaglines_to_allocate = needed_ctaglines;
}
*ctag_map_win_ctagline = offset;
win_size =
DIV_ROUND_UP(lines,
g->gr.comptags_per_cacheline) *
aggregate_cacheline_sz;
*ctag_map_win_size = round_up(win_size, small_pgsz);
}
priv->comptags.offset = offset;
priv->comptags.lines = lines;
priv->comptags.allocated_lines = ctaglines_to_allocate;
priv->comptags.user_mappable = user_mappable;
}
return err;
return 0;
}
static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
{
gk20a_dbg_fn("");
@@ -901,14 +841,12 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
}
u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
u64 size,
enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
u64 size,
enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
{
struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
int err;
u64 offset;
u32 start_page_nr = 0, num_pages;
u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) {
@@ -924,28 +862,19 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
}
/* be certain we round up to gmmu_page_size if needed */
/* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
/* Be certain we round up to gmmu_page_size if needed */
size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10);
/* The vma allocator represents page accounting. */
num_pages = size >> ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
err = vma->alloc(vma, &start_page_nr, num_pages, 1);
if (err) {
offset = gk20a_balloc(vma, size);
if (!offset) {
gk20a_err(dev_from_vm(vm),
"%s oom: sz=0x%llx", vma->name, size);
"%s oom: sz=0x%llx", vma->name, size);
return 0;
}
offset = (u64)start_page_nr <<
ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
return offset;
}
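The rounding step above is the standard power-of-two round-up: add (page_size - 1), then mask off the low bits. A runnable illustration with arbitrary numbers (nothing here comes from the driver):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t size = 0x2345; /* hypothetical mapping request */
        uint64_t page = 0x1000; /* 4 KiB GMMU page, power of two */

        /* Same expression as in gk20a_vm_alloc_va(). */
        uint64_t rounded = (size + (page - 1)) & ~(page - 1);

        printf("0x%llx -> 0x%llx\n", (unsigned long long)size,
               (unsigned long long)rounded); /* 0x2345 -> 0x3000 */
        return 0;
}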
@@ -954,25 +883,12 @@ int gk20a_vm_free_va(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx)
{
struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
u32 page_shift = ilog2(page_size);
u32 start_page_nr, num_pages;
int err;
gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
vma->name, offset, size);
gk20a_bfree(vma, offset);
start_page_nr = (u32)(offset >> page_shift);
num_pages = (u32)((size + page_size - 1) >> page_shift);
err = vma->free(vma, start_page_nr, num_pages, 1);
if (err) {
gk20a_err(dev_from_vm(vm),
"not found: offset=0x%llx, sz=0x%llx",
offset, size);
}
return err;
return 0;
}
static int insert_mapped_buffer(struct rb_root *root,
@@ -1169,7 +1085,7 @@ static int validate_fixed_buffer(struct vm_gk20a *vm,
if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) {
gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
map_offset);
map_offset);
return -EINVAL;
}
@@ -2613,7 +2529,6 @@ int gk20a_init_vm(struct mm_gk20a *mm,
char *name)
{
int err, i;
u32 num_small_pages, num_large_pages, low_hole_pages;
char alloc_name[32];
u64 small_vma_size, large_vma_size;
u32 pde_lo, pde_hi;
@@ -2674,34 +2589,31 @@ int gk20a_init_vm(struct mm_gk20a *mm,
large_vma_size = vm->va_limit - small_vma_size;
}
num_small_pages = (u32)(small_vma_size >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
/* num_pages above is without regard to the low-side hole. */
low_hole_pages = (vm->va_start >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name,
vm->gmmu_page_sizes[gmmu_page_size_small]>>10);
err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
alloc_name,
low_hole_pages, /*start*/
num_small_pages - low_hole_pages);/* length*/
err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
vm, alloc_name,
vm->va_start,
small_vma_size - vm->va_start,
SZ_4K,
GPU_BALLOC_MAX_ORDER,
GPU_BALLOC_GVA_SPACE);
if (err)
goto clean_up_ptes;
if (big_pages) {
u32 start = (u32)(small_vma_size >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
num_large_pages = (u32)(large_vma_size >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB",
name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10);
err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
alloc_name,
start, /* start */
num_large_pages); /* length */
/*
* Big page VMA starts at the end of the small page VMA.
*/
err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
vm, alloc_name,
small_vma_size,
large_vma_size,
big_page_size,
GPU_BALLOC_MAX_ORDER,
GPU_BALLOC_GVA_SPACE);
if (err)
goto clean_up_small_allocator;
}
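As parameterized above, the two allocators split the GVA space at small_vma_size, and the low hole below va_start is simply left out of the small-page allocator's range. A schematic of the resulting layout (this is my reading of the arguments, not text from the source):

/*
 * [0,              va_start)        low hole, not managed by any allocator
 * [va_start,       small_vma_size)  vma[gmmu_page_size_small], SZ_4K order-0 blocks
 * [small_vma_size, va_limit)        vma[gmmu_page_size_big], big_page_size order-0 blocks
 */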
@@ -2782,9 +2694,9 @@ int gk20a_vm_release_share(struct gk20a_as_share *as_share)
int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
struct nvgpu_as_alloc_space_args *args)
{ int err = -ENOMEM;
{
int err = -ENOMEM;
int pgsz_idx = gmmu_page_size_small;
u32 start_page_nr;
struct gk20a_allocator *vma;
struct vm_gk20a *vm = as_share->vm;
struct gk20a *g = vm->mm->g;
@@ -2815,21 +2727,19 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
goto clean_up;
}
start_page_nr = 0;
if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
start_page_nr = (u32)(args->o_a.offset >>
ilog2(vm->gmmu_page_sizes[pgsz_idx]));
vma = &vm->vma[pgsz_idx];
err = vma->alloc(vma, &start_page_nr, args->pages, 1);
if (err) {
if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset,
(u64)args->pages *
(u64)args->page_size);
else
vaddr_start = gk20a_balloc(vma, args->pages * args->page_size);
if (!vaddr_start) {
kfree(va_node);
goto clean_up;
}
vaddr_start = (u64)start_page_nr <<
ilog2(vm->gmmu_page_sizes[pgsz_idx]);
va_node->vaddr_start = vaddr_start;
va_node->size = (u64)args->page_size * (u64)args->pages;
va_node->pgsz_idx = pgsz_idx;
@@ -2853,7 +2763,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
true);
if (!map_offset) {
mutex_unlock(&vm->update_gmmu_lock);
vma->free(vma, start_page_nr, args->pages, 1);
gk20a_bfree(vma, vaddr_start);
kfree(va_node);
goto clean_up;
}
@@ -2865,6 +2775,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
mutex_unlock(&vm->update_gmmu_lock);
args->o_a.offset = vaddr_start;
err = 0;
clean_up:
return err;
@@ -2875,7 +2786,6 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
{
int err = -ENOMEM;
int pgsz_idx;
u32 start_page_nr;
struct gk20a_allocator *vma;
struct vm_gk20a *vm = as_share->vm;
struct vm_reserved_va_node *va_node;
@@ -2888,14 +2798,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ?
gmmu_page_size_big : gmmu_page_size_small;
start_page_nr = (u32)(args->offset >>
ilog2(vm->gmmu_page_sizes[pgsz_idx]));
vma = &vm->vma[pgsz_idx];
err = vma->free(vma, start_page_nr, args->pages, 1);
if (err)
goto clean_up;
gk20a_bfree(vma, args->offset);
mutex_lock(&vm->update_gmmu_lock);
va_node = addr_to_reservation(vm, args->offset);
@@ -2925,8 +2829,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
kfree(va_node);
}
mutex_unlock(&vm->update_gmmu_lock);
err = 0;
clean_up:
return err;
}
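Because the buddy allocator tracks every outstanding allocation in its own rb-tree (alloced_buddies in gk20a_allocator.h), free operations need only the address, which is why the length and alignment arguments disappear throughout this file. Schematically, old call versus new call as they appear unmarked in the hunk above:

/* Old bitmap/block allocator: the caller supplies page count and alignment. */
err = vma->free(vma, start_page_nr, args->pages, 1);

/* New buddy allocator: the size is looked up internally from the address. */
gk20a_bfree(vma, args->offset);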


@@ -131,6 +131,7 @@ enum gmmu_pgsz_gk20a {
};
struct gk20a_comptags {
u32 real_offset;
u32 offset;
u32 lines;
u32 allocated_lines;
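The new real_offset field pairs with the gk20a_alloc_comptags() changes earlier in this patch: real_offset records the raw value returned by gk20a_balloc() and is the only thing that may be handed back to gk20a_bfree(), while offset keeps the alignment-rounded ctagline the rest of the driver consumes. Pieced together from that hunk (annotation only, not new driver code):

offset = gk20a_balloc(allocator, ctaglines_to_allocate);
priv->comptags.real_offset = offset;  /* value to hand back to gk20a_bfree() */

if (user_mappable)
        offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align;

priv->comptags.offset = offset;       /* alignment-rounded line used for mapping */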


@@ -2816,7 +2816,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
struct pmu_payload payload;
u32 seq;
u32 data;
int err = 0;
gk20a_dbg_fn("");
@@ -2867,12 +2866,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
if (!pmu->sample_buffer)
err = pmu->dmem.alloc(&pmu->dmem,
&pmu->sample_buffer, 2 * sizeof(u16),
PMU_DMEM_ALLOC_ALIGNMENT);
if (err) {
pmu->sample_buffer = gk20a_balloc(&pmu->dmem,
2 * sizeof(u16));
if (!pmu->sample_buffer) {
gk20a_err(dev_from_gk20a(g),
"failed to allocate perfmon sample buffer");
"failed to allocate perfmon sample buffer");
return -ENOMEM;
}
@@ -2970,15 +2968,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu,
for (i = 0; i < PMU_QUEUE_COUNT; i++)
pmu_queue_init(pmu, i, init);
if (!pmu->dmem.alloc) {
/*Align start and end addresses*/
if (!pmu->dmem.init) {
/* Align start and end addresses */
u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init),
PMU_DMEM_ALLOC_ALIGNMENT);
PMU_DMEM_ALLOC_ALIGNMENT);
u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) +
pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
~(PMU_DMEM_ALLOC_ALIGNMENT - 1);
u32 size = end - start;
gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size);
__gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem",
start, size,
PMU_DMEM_ALLOC_ALIGNMENT, 4, 0);
}
pmu->pmu_ready = true;
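The DMEM heap bounds are trimmed inward before the allocator is created: the managed-area start is rounded up to PMU_DMEM_ALLOC_ALIGNMENT and the end rounded down. A runnable version of that arithmetic with hypothetical PMU init-message values (the real numbers come from the firmware):

#include <stdint.h>
#include <stdio.h>

#define PMU_DMEM_ALLOC_ALIGNMENT 4 /* as set in pmu_gk20a.h by this change */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint32_t)(a) - 1)) /* kernel ALIGN() */

int main(void)
{
        uint32_t sw_mg_off = 0x1002;  /* hypothetical managed-area offset */
        uint32_t sw_mg_size = 0x0e03; /* hypothetical managed-area size */

        uint32_t start = ALIGN_UP(sw_mg_off, PMU_DMEM_ALLOC_ALIGNMENT);
        uint32_t end = (sw_mg_off + sw_mg_size) &
                       ~(PMU_DMEM_ALLOC_ALIGNMENT - 1);

        printf("start=0x%x end=0x%x size=0x%x\n", start, end, end - start);
        return 0; /* start=0x1004 end=0x1e04 size=0xe00 */
}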
@@ -3115,20 +3115,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu,
seq->callback = NULL;
if (pv->pmu_allocation_get_dmem_size(pmu,
pv->get_pmu_seq_in_a_ptr(seq)) != 0)
pmu->dmem.free(&pmu->dmem,
gk20a_bfree(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset(pmu,
pv->get_pmu_seq_in_a_ptr(seq)),
pv->pmu_allocation_get_dmem_size(pmu,
pv->get_pmu_seq_in_a_ptr(seq)),
PMU_DMEM_ALLOC_ALIGNMENT);
pv->get_pmu_seq_in_a_ptr(seq)));
if (pv->pmu_allocation_get_dmem_size(pmu,
pv->get_pmu_seq_out_a_ptr(seq)) != 0)
pmu->dmem.free(&pmu->dmem,
gk20a_bfree(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset(pmu,
pv->get_pmu_seq_out_a_ptr(seq)),
pv->pmu_allocation_get_dmem_size(pmu,
pv->get_pmu_seq_out_a_ptr(seq)),
PMU_DMEM_ALLOC_ALIGNMENT);
pv->get_pmu_seq_out_a_ptr(seq)));
if (seq->callback)
seq->callback(g, msg, seq->cb_params, seq->desc, ret);
@@ -3769,11 +3763,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
pv->pmu_allocation_set_dmem_size(pmu, in,
(u16)max(payload->in.size, payload->out.size));
err = pmu->dmem.alloc(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset_addr(pmu, in),
pv->pmu_allocation_get_dmem_size(pmu, in),
PMU_DMEM_ALLOC_ALIGNMENT);
if (err)
*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) =
gk20a_balloc(&pmu->dmem,
pv->pmu_allocation_get_dmem_size(pmu, in));
if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)))
goto clean_up;
pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
@@ -3794,11 +3787,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
(u16)payload->out.size);
if (payload->out.buf != payload->in.buf) {
err = pmu->dmem.alloc(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset_addr(pmu, out),
pv->pmu_allocation_get_dmem_size(pmu, out),
PMU_DMEM_ALLOC_ALIGNMENT);
if (err)
*(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) =
gk20a_balloc(&pmu->dmem,
pv->pmu_allocation_get_dmem_size(pmu, out));
if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu,
out)))
goto clean_up;
} else {
BUG_ON(in == NULL);
@@ -3826,15 +3820,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
clean_up:
gk20a_dbg_fn("fail");
if (in)
pmu->dmem.free(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset(pmu, in),
pv->pmu_allocation_get_dmem_size(pmu, in),
PMU_DMEM_ALLOC_ALIGNMENT);
gk20a_bfree(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset(pmu, in));
if (out)
pmu->dmem.free(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset(pmu, out),
pv->pmu_allocation_get_dmem_size(pmu, out),
PMU_DMEM_ALLOC_ALIGNMENT);
gk20a_bfree(&pmu->dmem,
pv->pmu_allocation_get_dmem_offset(pmu, out));
pmu_seq_release(pmu, seq);
return err;


@@ -3,7 +3,7 @@
*
* GK20A PMU (aka. gPMU outside gk20a context)
*
* Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -466,7 +466,7 @@ struct pmu_ucode_desc {
#define PMU_UNIT_ID_IS_VALID(id) \
(((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START))
#define PMU_DMEM_ALLOC_ALIGNMENT (32)
#define PMU_DMEM_ALLOC_ALIGNMENT (4)
#define PMU_DMEM_ALIGNMENT (4)
#define PMU_CMD_FLAGS_PMU_MASK (0xF0)


@@ -3,7 +3,7 @@
*
* GK20A Semaphores
*
* Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -44,8 +44,10 @@ struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct device *d,
if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size))
goto clean_up;
if (gk20a_allocator_init(&p->alloc, unique_name, 0,
p->size))
/* Sacrifice one semaphore in the name of returning error codes. */
if (gk20a_allocator_init(&p->alloc, unique_name,
SEMAPHORE_SIZE, p->size - SEMAPHORE_SIZE,
SEMAPHORE_SIZE))
goto clean_up;
gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va,
@@ -163,8 +165,8 @@ struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool)
if (!s)
return NULL;
if (pool->alloc.alloc(&pool->alloc, &s->offset, SEMAPHORE_SIZE,
SEMAPHORE_SIZE)) {
s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE);
if (!s->offset) {
gk20a_err(pool->dev, "failed to allocate semaphore");
kfree(s);
return NULL;
@@ -186,8 +188,7 @@ static void gk20a_semaphore_free(struct kref *ref)
struct gk20a_semaphore *s =
container_of(ref, struct gk20a_semaphore, ref);
s->pool->alloc.free(&s->pool->alloc, s->offset, SEMAPHORE_SIZE,
SEMAPHORE_SIZE);
gk20a_bfree(&s->pool->alloc, s->offset);
gk20a_semaphore_pool_put(s->pool);
kfree(s);
}
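The "sacrifice one semaphore" comment is a consequence of the new failure convention: gk20a_balloc() signals failure by returning 0, so offset 0 must never be a legitimate semaphore. Starting the managed range at SEMAPHORE_SIZE instead of 0 guarantees that. The two initializer calls, old and new, shown side by side for clarity (both appear unmarked in the hunk above; error checking omitted):

/* Old: the space began at 0; alloc errors came back through the return value. */
gk20a_allocator_init(&p->alloc, unique_name, 0, p->size);

/* New: start at SEMAPHORE_SIZE so a returned offset of 0 can mean failure. */
gk20a_allocator_init(&p->alloc, unique_name,
                     SEMAPHORE_SIZE, p->size - SEMAPHORE_SIZE,
                     SEMAPHORE_SIZE);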


@@ -90,9 +90,8 @@ static int gm20b_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
if (err)
return err;
gk20a_allocator_init(&gr->comp_tags, "comptag",
1, /* start */
max_comptag_lines - 1); /* length*/
__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
1, max_comptag_lines - 1, 1, 10, 0);
gr->comptags_per_cacheline = comptags_per_cacheline;
gr->slices_per_ltc = slices_per_ltc;


@@ -41,9 +41,8 @@ static int vgpu_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
if (max_comptag_lines < 2)
return -ENXIO;
gk20a_allocator_init(&gr->comp_tags, "comptag",
1, /* start */
max_comptag_lines - 1); /* length*/
__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
1, max_comptag_lines - 1, 1, 10, 0); /* length*/
return 0;
}


@@ -243,11 +243,9 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share,
struct tegra_vgpu_as_share_params *p = &msg.params.as_share;
struct mm_gk20a *mm = &g->mm;
struct vm_gk20a *vm;
u32 num_small_pages, num_large_pages, low_hole_pages;
u64 small_vma_size, large_vma_size;
char name[32];
int err, i;
u32 start;
/* note: keep the page sizes sorted lowest to highest here */
u32 gmmu_page_sizes[gmmu_nr_page_sizes] = {
@@ -294,33 +292,27 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share,
small_vma_size = (u64)16 << 30;
large_vma_size = vm->va_limit - small_vma_size;
num_small_pages = (u32)(small_vma_size >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
/* num_pages above is without regard to the low-side hole. */
low_hole_pages = (vm->va_start >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
gmmu_page_sizes[gmmu_page_size_small]>>10);
err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
name,
low_hole_pages, /*start*/
num_small_pages - low_hole_pages);/* length*/
err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
vm, name,
vm->va_start,
small_vma_size - vm->va_start,
SZ_4K,
GPU_BALLOC_MAX_ORDER,
GPU_BALLOC_GVA_SPACE);
if (err)
goto clean_up_share;
start = (u32)(small_vma_size >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
num_large_pages = (u32)(large_vma_size >>
ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
gmmu_page_sizes[gmmu_page_size_big]>>10);
err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
name,
start, /* start */
num_large_pages); /* length */
err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
vm, name,
small_vma_size,
large_vma_size,
big_page_size,
GPU_BALLOC_MAX_ORDER,
GPU_BALLOC_GVA_SPACE);
if (err)
goto clean_up_small_allocator;