diff --git a/drivers/gpu/nvgpu/Kconfig.nvgpu b/drivers/gpu/nvgpu/Kconfig.nvgpu index 3e3607e0d..8baf6897c 100644 --- a/drivers/gpu/nvgpu/Kconfig.nvgpu +++ b/drivers/gpu/nvgpu/Kconfig.nvgpu @@ -47,6 +47,17 @@ config GK20A_DEVFREQ endchoice +config NVGPU_TRACK_MEM_USAGE + bool "Track the usage of system memory in nvgpu" + depends on GK20A + default n + help + Say Y here to allow nvgpu to track and keep statistics on + the system memory used by the driver. This does recreate + some of the kmem_leak tracking but this is also applicable + to other OSes which do not have Linux' kmem_leak. + + config GK20A_CYCLE_STATS bool "Support GK20A GPU CYCLE STATS" depends on GK20A diff --git a/drivers/gpu/nvgpu/common/linux/kmem.c b/drivers/gpu/nvgpu/common/linux/kmem.c index 24e0ca5df..60e793480 100644 --- a/drivers/gpu/nvgpu/common/linux/kmem.c +++ b/drivers/gpu/nvgpu/common/linux/kmem.c @@ -15,11 +15,22 @@ */ #include +#include #include #include +#include +#include +#include +#include +#include +#include #include +#include "gk20a/gk20a.h" + +#include "kmem_priv.h" + /* * Statically declared because this needs to be shared across all nvgpu driver * instances. This makes sure that all kmem caches are _definitely_ uniquely @@ -27,26 +38,793 @@ */ static atomic_t kmem_cache_id; -/* - * Linux specific version of the nvgpu_kmem_cache struct. This type is - * completely opaque to the rest of the driver. 
- */ -struct nvgpu_kmem_cache { - struct gk20a *g; - struct kmem_cache *cache; +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + +static void lock_tracker(struct nvgpu_mem_alloc_tracker *tracker) +{ + mutex_lock(&tracker->lock); +} + +static void unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker) +{ + mutex_unlock(&tracker->lock); +} + +static void kmem_print_mem_alloc(struct gk20a *g, + struct nvgpu_mem_alloc *alloc, + struct seq_file *s) +{ +#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES + int i; + + __pstat(s, "nvgpu-alloc: addr=0x%llx size=%ld\n", + alloc->addr, alloc->size); + for (i = 0; i < alloc->stack_length; i++) + __pstat(s, " %3d [<%p>] %pS\n", i, + (void *)alloc->stack[i], + (void *)alloc->stack[i]); + __pstat(s, "\n"); +#else + __pstat(s, "nvgpu-alloc: addr=0x%llx size=%ld src=%pF\n", + alloc->addr, alloc->size, alloc->ip); +#endif +} + +static int nvgpu_add_alloc(struct nvgpu_mem_alloc_tracker *tracker, + struct nvgpu_mem_alloc *alloc) +{ + struct rb_node **new = &tracker->allocs.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct nvgpu_mem_alloc *tmp = rb_entry(*new, + struct nvgpu_mem_alloc, + allocs_entry); + + parent = *new; + + if (alloc->addr < tmp->addr) + new = &(*new)->rb_left; + else if (alloc->addr > tmp->addr) + new = &(*new)->rb_right; + else + return -EINVAL; + } + + /* Put the new node there */ + rb_link_node(&alloc->allocs_entry, parent, new); + rb_insert_color(&alloc->allocs_entry, &tracker->allocs); + + return 0; +} + +static struct nvgpu_mem_alloc *nvgpu_rem_alloc( + struct nvgpu_mem_alloc_tracker *tracker, u64 alloc_addr) +{ + struct rb_node *node = tracker->allocs.rb_node; + struct nvgpu_mem_alloc *alloc; + + while (node) { + alloc = container_of(node, + struct nvgpu_mem_alloc, allocs_entry); + + if (alloc_addr < alloc->addr) + node = node->rb_left; + else if (alloc_addr > alloc->addr) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &tracker->allocs); + + return alloc; +} + 
+static int __nvgpu_save_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker, + unsigned long size, unsigned long real_size, + u64 addr, unsigned long ip) +{ + int ret; + struct nvgpu_mem_alloc *alloc; +#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES + struct stack_trace stack_trace; +#endif + + alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); + if (!alloc) + return -ENOMEM; + + alloc->owner = tracker; + alloc->size = size; + alloc->real_size = real_size; + alloc->addr = addr; + alloc->ip = (void *)(uintptr_t)ip; + +#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES + stack_trace.max_entries = MAX_STACK_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = alloc->stack; + /* + * This 4 here skips the 2 function calls that happen for all traced + * allocs due to nvgpu: + * + * __nvgpu_save_kmem_alloc+0x7c/0x128 + * __nvgpu_track_kzalloc+0xcc/0xf8 + * + * And the function calls that get made by the stack trace code itself. + * If the trace savings code changes this will likely have to change + * as well. + */ + stack_trace.skip = 4; + save_stack_trace(&stack_trace); + alloc->stack_length = stack_trace.nr_entries; +#endif + + lock_tracker(tracker); + tracker->bytes_alloced += size; + tracker->bytes_alloced_real += real_size; + tracker->nr_allocs++; + + /* Keep track of this for building a histogram later on. */ + if (tracker->max_alloc < size) + tracker->max_alloc = size; + if (tracker->min_alloc > size) + tracker->min_alloc = size; + + ret = nvgpu_add_alloc(tracker, alloc); + if (ret) { + WARN(1, "Duplicate alloc??? 
0x%llx\n", addr); + kfree(alloc); + unlock_tracker(tracker); + return ret; + } + unlock_tracker(tracker); + + return 0; +} + +static int __nvgpu_free_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker, + u64 addr) +{ + struct nvgpu_mem_alloc *alloc; + + lock_tracker(tracker); + alloc = nvgpu_rem_alloc(tracker, addr); + if (WARN(!alloc, "Possible double-free detected: 0x%llx!", addr)) { + unlock_tracker(tracker); + return -EINVAL; + } + + tracker->nr_frees++; + tracker->bytes_freed += alloc->size; + tracker->bytes_freed_real += alloc->real_size; + unlock_tracker(tracker); + + return 0; +} + +static void __nvgpu_check_valloc_size(unsigned long size) +{ + WARN(size < PAGE_SIZE, "Alloc smaller than page size! (%lu)!\n", size); +} + +static void __nvgpu_check_kalloc_size(size_t size) +{ + WARN(size > PAGE_SIZE, "Alloc larger than page size! (%zu)!\n", size); +} + +void *__nvgpu_track_vmalloc(struct gk20a *g, unsigned long size, + unsigned long ip) +{ + void *alloc = vmalloc(size); + + if (!alloc) + return NULL; + + kmem_dbg("vmalloc: size=%-6ld addr=0x%p", size, alloc); + __nvgpu_check_valloc_size(size); /* - * Memory to hold the kmem_cache unique name. Only necessary on our - * k3.10 kernel when not using the SLUB allocator but it's easier to - * just carry this on to newer kernels. + * Ignore the return message. If this fails let's not cause any issues + * for the rest of the driver. */ - char name[128]; + __nvgpu_save_kmem_alloc(g->vmallocs, size, roundup_pow_of_two(size), + (u64)(uintptr_t)alloc, ip); + + return alloc; +} + +void *__nvgpu_track_vzalloc(struct gk20a *g, unsigned long size, + unsigned long ip) +{ + void *alloc = vzalloc(size); + + if (!alloc) + return NULL; + + kmem_dbg("vzalloc: size=%-6ld addr=0x%p", size, alloc); + __nvgpu_check_valloc_size(size); + + /* + * Ignore the return message. If this fails let's not cause any issues + * for the rest of the driver. 
+ */ + __nvgpu_save_kmem_alloc(g->vmallocs, size, roundup_pow_of_two(size), + (u64)(uintptr_t)alloc, ip); + + return alloc; +} + +void *__nvgpu_track_kmalloc(struct gk20a *g, size_t size, unsigned long ip) +{ + void *alloc = kmalloc(size, GFP_KERNEL); + + if (!alloc) + return NULL; + + kmem_dbg("kmalloc: size=%-6ld addr=0x%p gfp=0x%08x", + size, alloc, GFP_KERNEL); + __nvgpu_check_kalloc_size(size); + + __nvgpu_save_kmem_alloc(g->kmallocs, size, roundup_pow_of_two(size), + (u64)(uintptr_t)alloc, ip); + + return alloc; +} + +void *__nvgpu_track_kzalloc(struct gk20a *g, size_t size, unsigned long ip) +{ + void *alloc = kzalloc(size, GFP_KERNEL); + + if (!alloc) + return NULL; + + kmem_dbg("kzalloc: size=%-6ld addr=0x%p gfp=0x%08x", + size, alloc, GFP_KERNEL); + __nvgpu_check_kalloc_size(size); + + __nvgpu_save_kmem_alloc(g->kmallocs, size, roundup_pow_of_two(size), + (u64)(uintptr_t)alloc, ip); + + return alloc; +} + +void *__nvgpu_track_kcalloc(struct gk20a *g, size_t n, size_t size, + unsigned long ip) +{ + void *alloc = kcalloc(n, size, GFP_KERNEL); + + if (!alloc) + return NULL; + + kmem_dbg("kcalloc: size=%-6ld addr=0x%p gfp=0x%08x", + n * size, alloc, GFP_KERNEL); + __nvgpu_check_kalloc_size(n * size); + + __nvgpu_save_kmem_alloc(g->kmallocs, n * size, + roundup_pow_of_two(n * size), + (u64)(uintptr_t)alloc, ip); + + return alloc; +} + +void __nvgpu_track_vfree(struct gk20a *g, void *addr) +{ + /* + * Often it is accepted practice to pass NULL pointers into free + * functions to save code. + */ + if (!addr) + return; + + vfree(addr); + + kmem_dbg("vfree: addr=0x%p", addr); + + __nvgpu_free_kmem_alloc(g->vmallocs, (u64)(uintptr_t)addr); +} + +void __nvgpu_track_kfree(struct gk20a *g, void *addr) +{ + if (!addr) + return; + + kfree(addr); + + kmem_dbg("kfree: addr=0x%p", addr); + + __nvgpu_free_kmem_alloc(g->kmallocs, (u64)(uintptr_t)addr); +} + +/** + * to_human_readable_bytes - Determine suffix for passed size. 
+ * + * @bytes - Number of bytes to generate a suffix for. + * @hr_bytes [out] - The human readable number of bytes. + * @hr_suffix [out] - The suffix for the HR number of bytes. + * + * Computes a human readable decomposition of the passed number of bytes. The + * suffix for the bytes is passed back through the @hr_suffix pointer. The right + * number of bytes is then passed back in @hr_bytes. This returns the following + * ranges: + * + * 0 - 1023 B + * 1 - 1023 KB + * 1 - 1023 MB + * 1 - 1023 GB + * 1 - 1023 TB + * 1 - ... PB + */ +static void __to_human_readable_bytes(u64 bytes, u64 *hr_bytes, + const char **hr_suffix) +{ + static const char *suffixes[] = + { "B", "KB", "MB", "GB", "TB", "PB" }; + + u64 suffix_ind = 0; + + while (suffix_ind < ARRAY_SIZE(suffixes) && bytes >= 1024) { + bytes >>= 10; + suffix_ind++; + } + + /* + * Handle case where bytes > 1023PB. + */ + suffix_ind = suffix_ind < ARRAY_SIZE(suffixes) ? + suffix_ind : ARRAY_SIZE(suffixes) - 1; + + *hr_bytes = bytes; + *hr_suffix = suffixes[suffix_ind]; +} + +/** + * print_hr_bytes - Print human readable bytes + * + * @s - A seq_file to print to. May be NULL. + * @msg - A message to print before the bytes. + * @bytes - Number of bytes. + * + * Print @msg followed by the human readable decomposition of the passed number + * of bytes. + * + * If @s is NULL then this prints will be made to the kernel log. + */ +static void print_hr_bytes(struct seq_file *s, const char *msg, u64 bytes) +{ + u64 hr_bytes; + const char *hr_suffix; + + __to_human_readable_bytes(bytes, &hr_bytes, &hr_suffix); + __pstat(s, "%s%lld %s\n", msg, hr_bytes, hr_suffix); +} + +/** + * print_histogram - Build a histogram of the memory usage. + * + * @tracker The tracking to pull data from. + * @s A seq_file to dump info into. 
+ */ +static void print_histogram(struct nvgpu_mem_alloc_tracker *tracker, + struct seq_file *s) +{ + int i; + u64 pot_min, pot_max; + u64 nr_buckets; + unsigned int *buckets; + unsigned int total_allocs; + struct rb_node *node; + static const char histogram_line[] = + "++++++++++++++++++++++++++++++++++++++++"; + + /* + * pot_min is essentially a round down to the nearest power of 2. This + * is the start of the histogram. pot_max is just a round up to the + * nearest power of two. Each histogram bucket is one power of two so + * the histogram buckets are exponential. + */ + pot_min = (u64)rounddown_pow_of_two(tracker->min_alloc); + pot_max = (u64)roundup_pow_of_two(tracker->max_alloc); + + nr_buckets = __ffs(pot_max) - __ffs(pot_min); + + buckets = kzalloc(sizeof(*buckets) * nr_buckets, GFP_KERNEL); + if (!buckets) { + __pstat(s, "OOM: could not allocate bucket storage!?\n"); + return; + } + + /* + * Iterate across all of the allocs and determine what bucket they + * should go in. Round the size down to the nearest power of two to + * find the right bucket. + */ + for (node = rb_first(&tracker->allocs); + node != NULL; + node = rb_next(node)) { + int b; + u64 bucket_min; + struct nvgpu_mem_alloc *alloc; + + alloc = container_of(node, struct nvgpu_mem_alloc, + allocs_entry); + bucket_min = (u64)rounddown_pow_of_two(alloc->size); + if (bucket_min < tracker->min_alloc) + bucket_min = tracker->min_alloc; + + b = __ffs(bucket_min) - __ffs(pot_min); + + /* + * Handle the one case were there's an alloc exactly as big as + * the maximum bucket size of the largest bucket. Most of the + * buckets have an inclusive minimum and exclusive maximum. But + * the largest bucket needs to have an _inclusive_ maximum as + * well. + */ + if (b == (int)nr_buckets) + b--; + + buckets[b]++; + } + + total_allocs = 0; + for (i = 0; i < (int)nr_buckets; i++) + total_allocs += buckets[i]; + + __pstat(s, "Alloc histogram:\n"); + + /* + * Actually compute the histogram lines. 
+ */ + for (i = 0; i < (int)nr_buckets; i++) { + char this_line[sizeof(histogram_line) + 1]; + u64 line_length; + u64 hr_bytes; + const char *hr_suffix; + + memset(this_line, 0, sizeof(this_line)); + + /* + * Compute the normalized line length. Cant use floating point + * so we will just multiply everything by 1000 and use fixed + * point. + */ + line_length = (1000 * buckets[i]) / total_allocs; + line_length *= sizeof(histogram_line); + line_length /= 1000; + + memset(this_line, '+', line_length); + + __to_human_readable_bytes(1 << (__ffs(pot_min) + i), + &hr_bytes, &hr_suffix); + __pstat(s, " [%-4lld %-4lld] %-2s %5u | %s\n", + hr_bytes, hr_bytes << 1, + hr_suffix, buckets[i], this_line); + } +} + +/** + * nvgpu_kmem_print_stats - Print kmem tracking stats. + * + * @tracker The tracking to pull data from. + * @s A seq_file to dump info into. + * + * Print stats from a tracker. If @s is non-null then seq_printf() will be + * used with @s. Otherwise the stats are pr_info()ed. + */ +void nvgpu_kmem_print_stats(struct nvgpu_mem_alloc_tracker *tracker, + struct seq_file *s) +{ + lock_tracker(tracker); + + __pstat(s, "Mem tracker: %s\n\n", tracker->name); + + __pstat(s, "Basic Stats:\n"); + __pstat(s, " Number of allocs %lld\n", + tracker->nr_allocs); + __pstat(s, " Number of frees %lld\n", + tracker->nr_frees); + print_hr_bytes(s, " Smallest alloc ", tracker->min_alloc); + print_hr_bytes(s, " Largest alloc ", tracker->max_alloc); + print_hr_bytes(s, " Bytes allocated ", tracker->bytes_alloced); + print_hr_bytes(s, " Bytes freed ", tracker->bytes_freed); + print_hr_bytes(s, " Bytes allocated (real) ", + tracker->bytes_alloced_real); + print_hr_bytes(s, " Bytes freed (real) ", + tracker->bytes_freed_real); + __pstat(s, "\n"); + + print_histogram(tracker, s); + + unlock_tracker(tracker); +} + +#if defined(CONFIG_DEBUG_FS) +static int __kmem_tracking_show(struct seq_file *s, void *unused) +{ + struct nvgpu_mem_alloc_tracker *tracker = s->private; + + 
nvgpu_kmem_print_stats(tracker, s); + + return 0; +} + +static int __kmem_tracking_open(struct inode *inode, struct file *file) +{ + return single_open(file, __kmem_tracking_show, inode->i_private); +} + +static const struct file_operations __kmem_tracking_fops = { + .open = __kmem_tracking_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; +static int __kmem_traces_dump_tracker(struct gk20a *g, + struct nvgpu_mem_alloc_tracker *tracker, + struct seq_file *s) +{ + struct rb_node *node; + + for (node = rb_first(&tracker->allocs); + node != NULL; + node = rb_next(node)) { + struct nvgpu_mem_alloc *alloc; + + alloc = container_of(node, struct nvgpu_mem_alloc, + allocs_entry); + + kmem_print_mem_alloc(g, alloc, s); + } + + return 0; +} + +static int __kmem_traces_show(struct seq_file *s, void *unused) +{ + struct gk20a *g = s->private; + + lock_tracker(g->vmallocs); + seq_puts(s, "Oustanding vmallocs:\n"); + __kmem_traces_dump_tracker(g, g->vmallocs, s); + seq_puts(s, "\n"); + unlock_tracker(g->vmallocs); + + lock_tracker(g->kmallocs); + seq_puts(s, "Oustanding kmallocs:\n"); + __kmem_traces_dump_tracker(g, g->kmallocs, s); + unlock_tracker(g->kmallocs); + + return 0; +} + +static int __kmem_traces_open(struct inode *inode, struct file *file) +{ + return single_open(file, __kmem_traces_show, inode->i_private); +} + +static const struct file_operations __kmem_traces_fops = { + .open = __kmem_traces_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void nvgpu_kmem_debugfs_init(struct device *dev) +{ + struct gk20a_platform *plat = dev_get_drvdata(dev); + struct gk20a *g = get_gk20a(dev); + struct dentry *gpu_root = plat->debugfs; + struct dentry *node; + + g->debugfs_kmem = debugfs_create_dir("kmem_tracking", gpu_root); + if (IS_ERR_OR_NULL(g->debugfs_kmem)) + return; + + node = debugfs_create_file(g->vmallocs->name, S_IRUGO, + g->debugfs_kmem, + g->vmallocs, &__kmem_tracking_fops); + node = 
debugfs_create_file(g->kmallocs->name, S_IRUGO, + g->debugfs_kmem, + g->kmallocs, &__kmem_tracking_fops); + node = debugfs_create_file("traces", S_IRUGO, + g->debugfs_kmem, + g, &__kmem_traces_fops); +} +#else +void nvgpu_kmem_debugfs_init(struct device *dev) +{ +} +#endif + +static int __do_check_for_outstanding_allocs( + struct gk20a *g, + struct nvgpu_mem_alloc_tracker *tracker, + const char *type, bool silent) +{ + struct rb_node *node; + int count = 0; + + for (node = rb_first(&tracker->allocs); + node != NULL; + node = rb_next(node)) { + struct nvgpu_mem_alloc *alloc; + + alloc = container_of(node, struct nvgpu_mem_alloc, + allocs_entry); + + if (!silent) + kmem_print_mem_alloc(g, alloc, NULL); + + count++; + } + + return count; +} + +/** + * check_for_outstanding_allocs - Count and display outstanding allocs + * + * @g - The GPU. + * @silent - If set don't print anything about the allocs. + * + * Dump (or just count) the number of allocations left outstanding. + */ +static int check_for_outstanding_allocs(struct gk20a *g, bool silent) +{ + int count = 0; + + count += __do_check_for_outstanding_allocs(g, g->kmallocs, "kmalloc", + silent); + count += __do_check_for_outstanding_allocs(g, g->vmallocs, "vmalloc", + silent); + + return count; +} + +static void do_nvgpu_kmem_cleanup(struct nvgpu_mem_alloc_tracker *tracker, + void (*force_free_func)(const void *)) +{ + struct rb_node *node; + + while ((node = rb_first(&tracker->allocs)) != NULL) { + struct nvgpu_mem_alloc *alloc; + + alloc = container_of(node, struct nvgpu_mem_alloc, + allocs_entry); + if (force_free_func) + force_free_func((void *)alloc->addr); + + kfree(alloc); + } +} + +/** + * nvgpu_kmem_cleanup - Cleanup the kmem tracking + * + * @g - The GPU. + * @force_free - If set will also free leaked objects if possible. + * + * Cleanup all of the allocs made by nvgpu_kmem tracking code. If @force_free + * is non-zero then the allocation made by nvgpu is also freed. 
This is risky, + * though, as it is possible that the memory is still in use by other parts of + * the GPU driver not aware that this has happened. + * + * In theory it should be fine if the GPU driver has been deinitialized and + * there are no bugs in that code. However, if there are any bugs in that code + * then they could likely manifest as odd crashes indeterminate amounts of time + * in the future. So use @force_free at your own risk. + */ +static void nvgpu_kmem_cleanup(struct gk20a *g, bool force_free) +{ + do_nvgpu_kmem_cleanup(g->kmallocs, force_free ? kfree : NULL); + do_nvgpu_kmem_cleanup(g->vmallocs, force_free ? vfree : NULL); +} + +void nvgpu_kmem_fini(struct gk20a *g, int flags) +{ + int count; + bool silent, force_free; + + if (!flags) + return; + + silent = !(flags & NVGPU_KMEM_FINI_DUMP_ALLOCS); + force_free = !!(flags & NVGPU_KMEM_FINI_FORCE_CLEANUP); + + count = check_for_outstanding_allocs(g, silent); + nvgpu_kmem_cleanup(g, force_free); + + /* + * If we leak objects we can either BUG() out or just WARN(). In general + * it doesn't make sense to BUG() on here since leaking a few objects + * won't crash the kernel but it can be helpful for development. + * + * If neither flag is set then we just silently do nothing. 
+ */ + if (count > 0) { + if (flags & NVGPU_KMEM_FINI_WARN) { + WARN(1, "Letting %d allocs leak!!\n", count); + } else if (flags & NVGPU_KMEM_FINI_BUG) { + gk20a_err(g->dev, "Letting %d allocs leak!!\n", count); + BUG(); + } + } +} + +int nvgpu_kmem_init(struct gk20a *g) +{ + int err; + + g->vmallocs = kzalloc(sizeof(*g->vmallocs), GFP_KERNEL); + g->kmallocs = kzalloc(sizeof(*g->kmallocs), GFP_KERNEL); + + if (!g->vmallocs || !g->kmallocs) { + err = -ENOMEM; + goto fail; + } + + g->vmallocs->name = "vmalloc"; + g->kmallocs->name = "kmalloc"; + + g->vmallocs->allocs = RB_ROOT; + g->kmallocs->allocs = RB_ROOT; + + mutex_init(&g->vmallocs->lock); + mutex_init(&g->kmallocs->lock); + + g->vmallocs->min_alloc = PAGE_SIZE; + g->kmallocs->min_alloc = KMALLOC_MIN_SIZE; + + /* + * This needs to go after all the other initialization since they use + * the nvgpu_kzalloc() API. + */ + g->vmallocs->allocs_cache = nvgpu_kmem_cache_create(g, + sizeof(struct nvgpu_mem_alloc)); + g->kmallocs->allocs_cache = nvgpu_kmem_cache_create(g, + sizeof(struct nvgpu_mem_alloc)); + + if (!g->vmallocs->allocs_cache || !g->kmallocs->allocs_cache) { + err = -ENOMEM; + if (g->vmallocs->allocs_cache) + nvgpu_kmem_cache_destroy(g->vmallocs->allocs_cache); + if (g->kmallocs->allocs_cache) + nvgpu_kmem_cache_destroy(g->kmallocs->allocs_cache); + goto fail; + } + + return 0; + +fail: + if (g->vmallocs) + kfree(g->vmallocs); + if (g->kmallocs) + kfree(g->kmallocs); + return err; +} + +#else /* !CONFIG_NVGPU_TRACK_MEM_USAGE */ + +int nvgpu_kmem_init(struct gk20a *g) +{ + return 0; +} + +void nvgpu_kmem_fini(struct gk20a *g, int flags) +{ +} +#endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */ + struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size) { struct nvgpu_kmem_cache *cache = - kzalloc(sizeof(struct nvgpu_kmem_cache), GFP_KERNEL); + nvgpu_kzalloc(g, sizeof(struct nvgpu_kmem_cache)); if (!cache) return NULL; @@ -59,7 +837,7 @@ struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct 
gk20a *g, size_t size) cache->cache = kmem_cache_create(cache->name, size, size, 0, NULL); if (!cache->cache) { - kfree(cache); + nvgpu_kfree(g, cache); return NULL; } @@ -68,8 +846,10 @@ struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size) void nvgpu_kmem_cache_destroy(struct nvgpu_kmem_cache *cache) { + struct gk20a *g = cache->g; + kmem_cache_destroy(cache->cache); - kfree(cache); + nvgpu_kfree(g, cache); } void *nvgpu_kmem_cache_alloc(struct nvgpu_kmem_cache *cache) diff --git a/drivers/gpu/nvgpu/common/linux/kmem_priv.h b/drivers/gpu/nvgpu/common/linux/kmem_priv.h new file mode 100644 index 000000000..5e38ad5d1 --- /dev/null +++ b/drivers/gpu/nvgpu/common/linux/kmem_priv.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __KMEM_PRIV_H__ +#define __KMEM_PRIV_H__ + +#include + +#define __pstat(s, fmt, msg...) \ + do { \ + if (s) \ + seq_printf(s, fmt, ##msg); \ + else \ + pr_info(fmt, ##msg); \ + } while (0) + +#define MAX_STACK_TRACE 20 + +/* + * Linux specific version of the nvgpu_kmem_cache struct. This type is + * completely opaque to the rest of the driver. + */ +struct nvgpu_kmem_cache { + struct gk20a *g; + struct kmem_cache *cache; + + /* + * Memory to hold the kmem_cache unique name. 
Only necessary on our + * k3.10 kernel when not using the SLUB allocator but it's easier to + * just carry this on to newer kernels. + */ + char name[128]; +}; + +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + +struct nvgpu_mem_alloc { + struct nvgpu_mem_alloc_tracker *owner; + + void *ip; +#ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES + unsigned long stack[MAX_STACK_TRACE]; + int stack_length; +#endif + + u64 addr; + + unsigned long size; + unsigned long real_size; + + /* Ugh - linux specific. Will need to be abstracted. */ + struct rb_node allocs_entry; +}; + +/* + * Linux specific tracking of vmalloc, kmalloc, etc. + */ +struct nvgpu_mem_alloc_tracker { + const char *name; + struct nvgpu_kmem_cache *allocs_cache; + struct rb_root allocs; + struct mutex lock; + + u64 bytes_alloced; + u64 bytes_freed; + u64 bytes_alloced_real; + u64 bytes_freed_real; + u64 nr_allocs; + u64 nr_frees; + + unsigned long min_alloc; + unsigned long max_alloc; +}; + +#endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */ + +#endif /* __KMEM_PRIV_H__ */ diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index f228110eb..68e432599 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -986,7 +986,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force) memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub)); gk20a_gmmu_unmap_free(ch_vm, &ch->gpfifo.mem); - nvgpu_big_free(ch->gpfifo.pipe); + nvgpu_big_free(g, ch->gpfifo.pipe); memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc)); #if defined(CONFIG_GK20A_CYCLE_STATS) @@ -1856,7 +1856,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, } if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) { - c->gpfifo.pipe = nvgpu_big_malloc( + c->gpfifo.pipe = nvgpu_big_malloc(g, gpfifo_size * sizeof(struct nvgpu_gpfifo)); if (!c->gpfifo.pipe) { err = -ENOMEM; @@ -1927,7 +1927,7 @@ clean_up_sync: c->sync = NULL; } clean_up_unmap: - 
nvgpu_big_free(c->gpfifo.pipe); + nvgpu_big_free(g, c->gpfifo.pipe); gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem); clean_up: memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc)); @@ -2057,12 +2057,12 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, if (!g) { size = count * sizeof(struct nvgpu_gpfifo); if (size) { - g = nvgpu_big_malloc(size); + g = nvgpu_big_malloc(c->g, size); if (!g) return; if (copy_from_user(g, user_gpfifo, size)) { - nvgpu_big_free(g); + nvgpu_big_free(c->g, g); return; } } @@ -2074,7 +2074,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, trace_write_pushbuffer(c, gp); if (gpfifo_allocated) - nvgpu_big_free(g); + nvgpu_big_free(c->g, g); } static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c index 4a42e03fb..0a0aada7a 100644 --- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c @@ -819,7 +819,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s, goto fail_dmabuf_put; } - buffer = nvgpu_big_zalloc(access_limit_size); + buffer = nvgpu_big_zalloc(g, access_limit_size); if (!buffer) { err = -ENOMEM; goto fail_dmabuf_put; @@ -865,7 +865,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s, fail_idle: gk20a_idle(g->dev); fail_free_buffer: - nvgpu_big_free(buffer); + nvgpu_big_free(g, buffer); fail_dmabuf_put: dma_buf_put(dmabuf); diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index 67f9b5320..6341a962b 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -21,6 +21,7 @@ #include #include +#include #include "gk20a.h" #include "debug_gk20a.h" @@ -485,6 +486,9 @@ void gk20a_debug_init(struct device *dev, const char *debugfs_symlink) gk20a_mm_debugfs_init(g->dev); gk20a_fifo_debugfs_init(g->dev); 
gk20a_sched_debugfs_init(g->dev); +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + nvgpu_kmem_debugfs_init(g->dev); +#endif #endif } diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 3504a32f5..6b026ee23 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -1598,6 +1599,8 @@ static int gk20a_probe(struct platform_device *dev) set_gk20a(dev, gk20a); gk20a->dev = &dev->dev; + nvgpu_kmem_init(gk20a); + gk20a->irq_stall = platform_get_irq(dev, 0); gk20a->irq_nonstall = platform_get_irq(dev, 1); if (gk20a->irq_stall < 0 || gk20a->irq_nonstall < 0) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 8006a4fe3..69528c1f6 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -27,6 +27,7 @@ struct gk20a_ctxsw_ucode_segments; struct gk20a_fecs_trace; struct gk20a_ctxsw_trace; struct acr_desc; +struct nvgpu_mem_alloc_tracker; #include #include @@ -915,6 +916,7 @@ struct gk20a { struct dentry *debugfs_runlist_interleave; struct dentry *debugfs_allocators; struct dentry *debugfs_xve; + struct dentry *debugfs_kmem; #endif struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; @@ -1055,6 +1057,10 @@ struct gk20a { /* Check if msi is enabled */ bool msi_enabled; #endif +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + struct nvgpu_mem_alloc_tracker *vmallocs; + struct nvgpu_mem_alloc_tracker *kmallocs; +#endif }; static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g) @@ -1131,6 +1137,7 @@ enum gk20a_dbg_categories { gpu_dbg_pmu_pstate = BIT(17), /* p state controlled by pmu */ gpu_dbg_xv = BIT(18), /* XVE debugging */ gpu_dbg_shutdown = BIT(19), /* GPU shutdown tracing */ + gpu_dbg_kmem = BIT(20), /* Kmem tracking debugging */ gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ }; diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 36b85f3b3..e695f02ed 
100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -3424,7 +3424,7 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr) gr->ctx_vars.local_golden_image = NULL; if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) - nvgpu_big_free(gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); + nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL; gk20a_comptag_allocator_destroy(&gr->comp_tags); @@ -8055,7 +8055,7 @@ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2; map_size = hwpm_ctxsw_reg_count_max * sizeof(*map); - map = nvgpu_big_zalloc(map_size); + map = nvgpu_big_zalloc(g, map_size); if (!map) return -ENOMEM; @@ -8145,7 +8145,7 @@ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) return 0; cleanup: gk20a_err(dev_from_gk20a(g), "Failed to create HWPM buffer offset map"); - nvgpu_big_free(map); + nvgpu_big_free(g, map); return -EINVAL; } diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 7a64f79b5..2ff546536 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -1487,8 +1487,8 @@ int gk20a_vm_get_buffers(struct vm_gk20a *vm, nvgpu_mutex_acquire(&vm->update_gmmu_lock); - buffer_list = nvgpu_big_zalloc(sizeof(*buffer_list) * - vm->num_user_mapped_buffers); + buffer_list = nvgpu_big_zalloc(vm->mm->g, sizeof(*buffer_list) * + vm->num_user_mapped_buffers); if (!buffer_list) { nvgpu_mutex_release(&vm->update_gmmu_lock); return -ENOMEM; @@ -1572,7 +1572,7 @@ void gk20a_vm_put_buffers(struct vm_gk20a *vm, gk20a_vm_mapping_batch_finish_locked(vm, &batch); nvgpu_mutex_release(&vm->update_gmmu_lock); - nvgpu_big_free(mapped_buffers); + nvgpu_big_free(vm->mm->g, mapped_buffers); } static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset, diff --git a/drivers/gpu/nvgpu/include/nvgpu/kmem.h 
b/drivers/gpu/nvgpu/include/nvgpu/kmem.h index c08e40a60..591925252 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/kmem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/kmem.h @@ -14,18 +14,21 @@ * along with this program. If not, see . */ -#ifndef NVGPU_KMEM_H -#define NVGPU_KMEM_H +#ifndef __NVGPU_KMEM_H__ +#define __NVGPU_KMEM_H__ -#include -#include -#include - -#include +/* + * Incase this isn't defined already. + */ +#ifndef _THIS_IP_ +#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) +#endif struct gk20a; -/* +/** + * DOC: Kmem cache support + * * In Linux there is support for the notion of a kmem_cache. It gives better * memory usage characteristics for lots of allocations of the same size. Think * structs that get allocated over and over. Normal kmalloc() type routines @@ -37,26 +40,200 @@ struct gk20a; */ struct nvgpu_kmem_cache; +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE +/* + * Uncomment this if you want to enable stack traces in the memory profiling. + * Since this is a fairly high overhead operation and is only necessary for + * debugging actual bugs it's left here for developers to enable. + */ +/* #define __NVGPU_SAVE_KALLOC_STACK_TRACES */ + +/* + * Defined per-OS. + */ +struct nvgpu_mem_alloc_tracker; +#endif + + +/** + * nvgpu_kmem_cache_create - create an nvgpu kernel memory cache. + * + * @g The GPU driver struct using this cache. + * @size Size of the object allocated by the cache. + * + * This cache can be used to allocate objects of size @size. Common usage would + * be for a struct that gets allocated a lot. In that case @size should be + * sizeof(struct my_struct). + * + * A given implementation of this need not do anything special. The allocation + * routines can simply be passed on to nvgpu_kzalloc() if desired so packing + * and alignment of the structs cannot be assumed. 
+ */ struct nvgpu_kmem_cache *nvgpu_kmem_cache_create(struct gk20a *g, size_t size); + +/** + * nvgpu_kmem_cache_destroy - destroy a cache created by + * nvgpu_kmem_cache_create(). + * + * @cache The cache to destroy. + */ void nvgpu_kmem_cache_destroy(struct nvgpu_kmem_cache *cache); +/** + * nvgpu_kmem_cache_alloc - Allocate an object from the cache + * + * @cache The cache to alloc from. + */ void *nvgpu_kmem_cache_alloc(struct nvgpu_kmem_cache *cache); + +/** + * nvgpu_kmem_cache_free - Free an object back to a cache + * + * @cache The cache to return the object to. + * @ptr Pointer to the object to free. + */ void nvgpu_kmem_cache_free(struct nvgpu_kmem_cache *cache, void *ptr); -static inline void *__nvgpu_big_alloc(size_t size, bool clear) +/** + * nvgpu_kmalloc - Allocate from the kernel's allocator. + * + * @g: Current GPU. + * @size: Size of the allocation. + * + * Allocate a chunk of system memory from the kernel. Allocations larger than 1 + * page may fail even when there may appear to be enough memory. + * + * This function may sleep so cannot be used in IRQs. + */ +#define nvgpu_kmalloc(g, size) __nvgpu_kmalloc(g, size, _THIS_IP_) + +/** + * nvgpu_kzalloc - Allocate from the kernel's allocator. + * + * @g: Current GPU. + * @size: Size of the allocation. + * + * Identical to nvgpu_kmalloc() except the memory will be zeroed before being + * returned. + */ +#define nvgpu_kzalloc(g, size) __nvgpu_kzalloc(g, size, _THIS_IP_) + +/** + * nvgpu_kcalloc - Allocate from the kernel's allocator. + * + * @g: Current GPU. + * @n: Number of objects. + * @size: Size of each object. + * + * Identical to nvgpu_kmalloc() except the size of the memory chunk returned is + * @n * @size. + */ +#define nvgpu_kcalloc(g, n, size) __nvgpu_kcalloc(g, n, size, _THIS_IP_) + +/** + * nvgpu_vmalloc - Allocate memory and return a map to it. + * + * @g: Current GPU. + * @size: Size of the allocation. 
+ * + * Allocate some memory and return a pointer to a virtual memory mapping of + * that memory in the kernel's virtual address space. The underlying physical + * memory is not guaranteed to be contiguous (and indeed likely isn't). This + * allows for much larger allocations to be done without worrying as much + * about physical memory fragmentation. + * + * This function may sleep. + */ +#define nvgpu_vmalloc(g, size) __nvgpu_vmalloc(g, size, _THIS_IP_) + +/** + * nvgpu_vzalloc - Allocate memory and return a map to it. + * + * @g: Current GPU. + * @size: Size of the allocation. + * + * Identical to nvgpu_vmalloc() except this will return zero'ed memory. + */ +#define nvgpu_vzalloc(g, size) __nvgpu_vzalloc(g, size, _THIS_IP_) + +/** + * nvgpu_kfree - Frees an alloc from nvgpu_kmalloc, nvgpu_kzalloc, + * nvgpu_kcalloc. + * + * @g: Current GPU. + * @addr: Address of object to free. + */ +#define nvgpu_kfree(g, addr) __nvgpu_kfree(g, addr) + +/** + * nvgpu_vfree - Frees an alloc from nvgpu_vmalloc, nvgpu_vzalloc. + * + * @g: Current GPU. + * @addr: Address of object to free. + */ +#define nvgpu_vfree(g, addr) __nvgpu_vfree(g, addr) + +#define kmem_dbg(fmt, args...) \ + gk20a_dbg(gpu_dbg_kmem, fmt, ##args) + +/** + * nvgpu_kmem_init - Initialize the kmem tracking stuff. + * + * @g: The driver to init. + * + * Returns non-zero on failure. + */ +int nvgpu_kmem_init(struct gk20a *g); + +/** + * nvgpu_kmem_fini - Finalize the kmem tracking code + * + * @g - The GPU. + * @flags - Flags that control operation of this finalization. + * + * Cleanup resources used by nvgpu_kmem. Available flags for cleanup are: + * + * %NVGPU_KMEM_FINI_DO_NOTHING + * %NVGPU_KMEM_FINI_FORCE_CLEANUP + * %NVGPU_KMEM_FINI_DUMP_ALLOCS + * %NVGPU_KMEM_FINI_WARN + * %NVGPU_KMEM_FINI_BUG + * + * %NVGPU_KMEM_FINI_DO_NOTHING will be overridden by anything else specified. + * Put another way don't just add %NVGPU_KMEM_FINI_DO_NOTHING and expect that + * to suppress other flags from doing anything. 
+ */ +void nvgpu_kmem_fini(struct gk20a *g, int flags); + +/* + * These will simply be ignored if CONFIG_NVGPU_TRACK_MEM_USAGE is not defined. + */ +#define NVGPU_KMEM_FINI_DO_NOTHING 0 +#define NVGPU_KMEM_FINI_FORCE_CLEANUP (1 << 0) +#define NVGPU_KMEM_FINI_DUMP_ALLOCS (1 << 1) +#define NVGPU_KMEM_FINI_WARN (1 << 2) +#define NVGPU_KMEM_FINI_BUG (1 << 3) + +/* + * When there's other implementations make sure they are included instead of + * Linux when not compiling on Linux! + */ +#include + +static inline void *__nvgpu_big_alloc(struct gk20a *g, size_t size, bool clear) { void *p; if (size > PAGE_SIZE) { if (clear) - p = vzalloc(size); + p = nvgpu_vzalloc(g, size); else - p = vmalloc(size); + p = nvgpu_vmalloc(g, size); } else { if (clear) - p = kzalloc(size, GFP_KERNEL); + p = nvgpu_kzalloc(g, size); else - p = kmalloc(size, GFP_KERNEL); + p = nvgpu_kmalloc(g, size); } return p; @@ -65,6 +242,7 @@ static inline void *__nvgpu_big_alloc(size_t size, bool clear) /** * nvgpu_big_malloc - Pick virtual or physical alloc based on @size * + * @g - The GPU. * @size - Size of the allocation. * * On some platforms (i.e Linux) it is possible to allocate memory directly @@ -83,30 +261,31 @@ static inline void *__nvgpu_big_alloc(size_t size, bool clear) * Returns a pointer to a virtual address range that the kernel can access or * %NULL on failure. */ -static inline void *nvgpu_big_malloc(size_t size) +static inline void *nvgpu_big_malloc(struct gk20a *g, size_t size) { - return __nvgpu_big_alloc(size, false); + return __nvgpu_big_alloc(g, size, false); } /** * nvgpu_big_malloc - Pick virtual or physical alloc based on @size * + * @g - The GPU. * @size - Size of the allocation. * * Zeroed memory version of nvgpu_big_malloc(). 
*/ -static inline void *nvgpu_big_zalloc(size_t size) +static inline void *nvgpu_big_zalloc(struct gk20a *g, size_t size) { - return __nvgpu_big_alloc(size, true); + return __nvgpu_big_alloc(g, size, true); } /** * nvgpu_big_free - Free and alloc from nvgpu_big_zalloc() or * nvgpu_big_malloc(). - * + * @g - The GPU. * @p - A pointer allocated by nvgpu_big_zalloc() or nvgpu_big_malloc(). */ -static inline void nvgpu_big_free(void *p) +static inline void nvgpu_big_free(struct gk20a *g, void *p) { /* * This will have to be fixed eventually. Allocs that use @@ -114,9 +293,9 @@ static inline void nvgpu_big_free(void *p) * when freeing. */ if (virt_addr_valid(p)) - kfree(p); + nvgpu_kfree(g, p); else - vfree(p); + nvgpu_vfree(g, p); } -#endif +#endif /* __NVGPU_KMEM_H__ */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h b/drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h new file mode 100644 index 000000000..d1cd27f31 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/kmem_linux.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#ifndef __NVGPU_KMEM_LINUX_H__ +#define __NVGPU_KMEM_LINUX_H__ + +#include +#include +#include + +#include + +struct gk20a; +struct device; + +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE +void *__nvgpu_track_vmalloc(struct gk20a *g, unsigned long size, + unsigned long ip); +void *__nvgpu_track_vzalloc(struct gk20a *g, unsigned long size, + unsigned long ip); +void *__nvgpu_track_kmalloc(struct gk20a *g, size_t size, unsigned long ip); +void *__nvgpu_track_kzalloc(struct gk20a *g, size_t size, unsigned long ip); +void *__nvgpu_track_kcalloc(struct gk20a *g, size_t n, size_t size, + unsigned long ip); +void __nvgpu_track_vfree(struct gk20a *g, void *addr); +void __nvgpu_track_kfree(struct gk20a *g, void *addr); + +void nvgpu_kmem_debugfs_init(struct device *dev); +#else +static inline void nvgpu_kmem_debugfs_init(struct device *dev) +{ +} +#endif + +/** + * DOC: Linux pass through kmem implementation. + * + * These are the Linux implementations of the various kmem functions defined by + * nvgpu. This should not be included directly - instead include . 
+ */ + +static inline void *__nvgpu_kmalloc(struct gk20a *g, unsigned long size, + unsigned long ip) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + return __nvgpu_track_kmalloc(g, size, ip); +#else + return kmalloc(size, GFP_KERNEL); +#endif +} + +static inline void *__nvgpu_kzalloc(struct gk20a *g, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + return __nvgpu_track_kzalloc(g, size, ip); +#else + return kzalloc(size, GFP_KERNEL); +#endif +} + +static inline void *__nvgpu_kcalloc(struct gk20a *g, size_t n, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + return __nvgpu_track_kcalloc(g, n, size, ip); +#else + return kcalloc(n, size, GFP_KERNEL); +#endif +} + +static inline void *__nvgpu_vmalloc(struct gk20a *g, unsigned long size, + unsigned long ip) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + return __nvgpu_track_vmalloc(g, size, ip); +#else + return vmalloc(size); +#endif +} + +static inline void *__nvgpu_vzalloc(struct gk20a *g, unsigned long size, + unsigned long ip) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + return __nvgpu_track_vzalloc(g, size, ip); +#else + return vzalloc(size); +#endif +} + +static inline void __nvgpu_kfree(struct gk20a *g, void *addr) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + __nvgpu_track_kfree(g, addr); +#else + kfree(addr); +#endif +} + +static inline void __nvgpu_vfree(struct gk20a *g, void *addr) +{ +#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE + __nvgpu_track_vfree(g, addr); +#else + vfree(addr); +#endif +} + +#endif diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c index 3677b02d6..39559dacc 100644 --- a/drivers/gpu/nvgpu/pci.c +++ b/drivers/gpu/nvgpu/pci.c @@ -19,6 +19,7 @@ #include #include +#include #include "gk20a/gk20a.h" #include "gk20a/platform_gk20a.h" @@ -358,6 +359,8 @@ static int nvgpu_pci_probe(struct pci_dev *pdev, platform->g = g; g->dev = &pdev->dev; + nvgpu_kmem_init(g); + err = pci_enable_device(pdev); if (err) return err; diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c 
b/drivers/gpu/nvgpu/vgpu/vgpu.c index d8e0dfa1f..37b4633bf 100644 --- a/drivers/gpu/nvgpu/vgpu/vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/vgpu.c @@ -19,6 +19,8 @@ #include #include +#include + #include "vgpu/vgpu.h" #include "vgpu/fecs_trace_vgpu.h" #include "gk20a/debug_gk20a.h" @@ -562,6 +564,8 @@ int vgpu_probe(struct platform_device *pdev) platform->vgpu_priv = priv; gk20a->dev = dev; + nvgpu_kmem_init(gk20a); + err = gk20a_user_init(dev, INTERFACE_NAME, &nvgpu_class); if (err) return err;