diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index e7ea3c5df..4b6a8e879 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -66,7 +66,6 @@ nvgpu-y := \
 	gk20a/fifo_gk20a.o \
 	gk20a/channel_gk20a.o \
 	gk20a/channel_sync_gk20a.o \
-	gk20a/debug_gk20a.o \
 	gk20a/dbg_gpu_gk20a.o \
 	gk20a/regops_gk20a.o \
 	gk20a/gr_gk20a.o \
@@ -107,7 +106,6 @@ nvgpu-y := \
 	gm20b/mm_gm20b.o \
 	gm20b/regops_gm20b.o \
 	gm20b/mc_gm20b.o \
-	gm20b/debug_gm20b.o \
 	gm20b/cde_gm20b.o \
 	gm20b/therm_gm20b.o \
 	gm206/bios_gm206.o \
@@ -117,6 +115,18 @@ nvgpu-y := \
 	boardobj/boardobjgrp_e255.o \
 	boardobj/boardobjgrp_e32.o
 
+nvgpu-$(CONFIG_DEBUG_FS) += \
+	common/linux/debug.o \
+	common/linux/debug_gr.o \
+	common/linux/debug_fifo.o \
+	common/linux/debug_cde.o \
+	common/linux/debug_ce.o \
+	common/linux/debug_pmu.o \
+	common/linux/debug_sched.o \
+	common/linux/debug_mm.o \
+	common/linux/debug_allocator.o \
+	common/linux/debug_kmem.o
+
 nvgpu-$(CONFIG_TEGRA_GK20A) += tegra/linux/platform_gk20a_tegra.o
 nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o
 nvgpu-$(CONFIG_GK20A_PCI) += common/linux/pci.o
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/common/linux/debug.c
similarity index 73%
rename from drivers/gpu/nvgpu/gk20a/debug_gk20a.c
rename to drivers/gpu/nvgpu/common/linux/debug.c
index ac435046e..2962a4673 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
+++ b/drivers/gpu/nvgpu/common/linux/debug.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011-2017 NVIDIA Corporation.  All rights reserved.
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -12,26 +12,23 @@
  *
  */
 
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
-#include <linux/seq_file.h>
-#include <linux/io.h>
-#include <linux/fs.h>
+#include "debug_cde.h"
+#include "debug_ce.h"
+#include "debug_fifo.h"
+#include "debug_gr.h"
+#include "debug_mm.h"
+#include "debug_allocator.h"
+#include "debug_kmem.h"
+#include "debug_pmu.h"
+#include "debug_sched.h"
 
-#include <nvgpu/log.h>
-#include <nvgpu/kmem.h>
-#include <nvgpu/semaphore.h>
-#include <nvgpu/log.h>
-
-#include "gk20a.h"
+#include "gk20a/gk20a.h"
 #include "gk20a/platform_gk20a.h"
-#include "debug_gk20a.h"
 
-#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
-#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
-#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
-#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <nvgpu/debug.h>
 
 unsigned int gk20a_debug_trace_cmdbuf;
 
@@ -59,81 +56,22 @@ void gk20a_debug_output(struct gk20a_debug_output *o,
 	o->fn(o->ctx, o->buf, len);
 }
 
-static void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
-		 struct gk20a_debug_output *o)
-{
-	struct fifo_gk20a *f = &g->fifo;
-	u32 chid;
-	struct ch_state **ch_state;
-
-	ch_state = nvgpu_kzalloc(g, sizeof(*ch_state) * f->num_channels);
-	if (!ch_state) {
-		gk20a_debug_output(o, "cannot alloc memory for channels\n");
-		return;
-	}
-
-	for (chid = 0; chid < f->num_channels; chid++) {
-		struct channel_gk20a *ch = &f->channel[chid];
-		if (gk20a_channel_get(ch)) {
-			ch_state[chid] =
-				nvgpu_kmalloc(g, sizeof(struct ch_state) +
-					ram_in_alloc_size_v());
-			/* ref taken stays to below loop with
-			 * successful allocs */
-			if (!ch_state[chid])
-				gk20a_channel_put(ch);
-		}
-	}
-
-	for (chid = 0; chid < f->num_channels; chid++) {
-		struct channel_gk20a *ch = &f->channel[chid];
-		if (!ch_state[chid])
-			continue;
-
-		ch_state[chid]->pid = ch->pid;
-		ch_state[chid]->refs = atomic_read(&ch->ref_count);
-		nvgpu_mem_rd_n(g, &ch->inst_block, 0,
-				&ch_state[chid]->inst_block[0],
-				ram_in_alloc_size_v());
-		gk20a_channel_put(ch);
-	}
-	for (chid = 0; chid < f->num_channels; chid++) {
-		if (ch_state[chid]) {
-			g->ops.fifo.dump_channel_status_ramfc(g, o, chid,
-						 ch_state[chid]);
-			nvgpu_kfree(g, ch_state[chid]);
-		}
-	}
-	nvgpu_kfree(g, ch_state);
-}
-
-void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o)
-{
-	g->ops.fifo.dump_pbdma_status(g, o);
-	g->ops.fifo.dump_eng_status(g, o);
-
-	gk20a_debug_dump_all_channel_status_ramfc(g, o);
-}
-
-static int gk20a_gr_dump_regs(struct device *dev,
+static int gk20a_gr_dump_regs(struct gk20a *g,
 		struct gk20a_debug_output *o)
 {
-	struct gk20a_platform *platform = gk20a_get_platform(dev);
-	struct gk20a *g = platform->g;
-
 	if (g->ops.gr.dump_gr_regs)
 		gr_gk20a_elpg_protected_call(g, g->ops.gr.dump_gr_regs(g, o));
 
 	return 0;
 }
 
-int gk20a_gr_debug_dump(struct device *dev)
+int gk20a_gr_debug_dump(struct gk20a *g)
 {
 	struct gk20a_debug_output o = {
 		.fn = gk20a_debug_write_printk
 	};
 
-	gk20a_gr_dump_regs(dev, &o);
+	gk20a_gr_dump_regs(g, &o);
 
 	return 0;
 }
@@ -154,23 +92,22 @@ static int gk20a_gr_debug_show(struct seq_file *s, void *unused)
 		return -EINVAL;
 	}
 
-	gk20a_gr_dump_regs(dev, &o);
+	gk20a_gr_dump_regs(g, &o);
 
 	gk20a_idle(g);
 
 	return 0;
 }
 
-void gk20a_debug_dump(struct device *dev)
+void gk20a_debug_dump(struct gk20a *g)
 {
-	struct gk20a_platform *platform = gk20a_get_platform(dev);
-	struct gk20a *g = platform->g;
+	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
 	struct gk20a_debug_output o = {
 		.fn = gk20a_debug_write_printk
 	};
 
 	if (platform->dump_platform_dependencies)
-		platform->dump_platform_dependencies(dev);
+		platform->dump_platform_dependencies(g->dev);
 
 	/* HAL only initialized after 1st power-on */
 	if (g->ops.debug.show_dump)
@@ -227,22 +164,28 @@ static const struct file_operations gk20a_debug_fops = {
 	.release	= single_release,
 };
 
+void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o)
+{	/* Default debug.show_dump hook: PBDMA, engine, then channel dumps to @o. */
+	g->ops.fifo.dump_pbdma_status(g, o);
+	g->ops.fifo.dump_eng_status(g, o);
+	/* NOTE(review): this patch drops the static helper from this file; confirm a declaration is visible. */
+	gk20a_debug_dump_all_channel_status_ramfc(g, o);
+}
+
 void gk20a_init_debug_ops(struct gpu_ops *gops)
 {
 	gops->debug.show_dump = gk20a_debug_show_dump;
 }
 
-#ifdef CONFIG_DEBUG_FS
 static int railgate_residency_show(struct seq_file *s, void *data)
 {
-	struct device *dev = s->private;
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
+	struct gk20a *g = s->private;
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
 	unsigned long time_since_last_state_transition_ms;
 	unsigned long total_rail_gate_time_ms;
 	unsigned long total_rail_ungate_time_ms;
 
-	if (platform->is_railgated(dev)) {
+	if (platform->is_railgated(g->dev)) {
 		time_since_last_state_transition_ms =
 				jiffies_to_msecs(jiffies -
 				g->pstats.last_rail_gate_complete);
@@ -282,30 +225,27 @@ static const struct file_operations railgate_residency_fops = {
 	.release	= single_release,
 };
 
-int gk20a_railgating_debugfs_init(struct device *dev)
+static int gk20a_railgating_debugfs_init(struct gk20a *g)
 {
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
 	struct dentry *d;
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
 
 	if (!g->can_railgate)
 		return 0;
 
 	d = debugfs_create_file(
-		"railgate_residency", S_IRUGO|S_IWUSR, platform->debugfs, dev,
+		"railgate_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
 						&railgate_residency_fops);
 	if (!d)
 		return -ENOMEM;
 
 	return 0;
 }
-#endif
 
-void gk20a_debug_init(struct device *dev, const char *debugfs_symlink)
+void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink)
 {
-#ifdef CONFIG_DEBUG_FS
+	struct device *dev = g->dev;
 	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = platform->g;
 
 	platform->debugfs = debugfs_create_dir(dev_name(dev), NULL);
 	if (!platform->debugfs)
@@ -409,17 +349,28 @@ void gk20a_debug_init(struct device *dev, const char *debugfs_symlink)
 #endif
 
 	gr_gk20a_debugfs_init(g);
-	gk20a_pmu_debugfs_init(g->dev);
-	gk20a_railgating_debugfs_init(g->dev);
-	gk20a_cde_debugfs_init(g->dev);
-	gk20a_ce_debugfs_init(g->dev);
-	nvgpu_alloc_debugfs_init(g->dev);
-	gk20a_mm_debugfs_init(g->dev);
-	gk20a_fifo_debugfs_init(g->dev);
-	gk20a_sched_debugfs_init(g->dev);
+	gk20a_pmu_debugfs_init(g);
+	gk20a_railgating_debugfs_init(g);
+	gk20a_cde_debugfs_init(g);
+	gk20a_ce_debugfs_init(g);
+	nvgpu_alloc_debugfs_init(g);
+	gk20a_mm_debugfs_init(g);
+	gk20a_fifo_debugfs_init(g);
+	gk20a_sched_debugfs_init(g);
 #ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
-	nvgpu_kmem_debugfs_init(g->dev);
+	nvgpu_kmem_debugfs_init(g);
 #endif
-#endif
-
+}
+
+void gk20a_debug_deinit(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+
+	if (!platform->debugfs)	/* never created, or already torn down */
+		return;
+	gk20a_fifo_debugfs_deinit(g);	/* release fifo profiling state first */
+	debugfs_remove_recursive(platform->debugfs);
+	debugfs_remove_recursive(platform->debugfs_alias);
+	platform->debugfs = NULL;	/* keep the guard above valid on a repeat call */
+	platform->debugfs_alias = NULL;
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_allocator.c b/drivers/gpu/nvgpu/common/linux/debug_allocator.c
new file mode 100644
index 000000000..3d4a2bb23
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_allocator.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_allocator.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <nvgpu/allocator.h>
+
+u32 nvgpu_alloc_tracing_on;
+
+void nvgpu_alloc_print_stats(struct nvgpu_allocator *__a,
+			     struct seq_file *s, int lock)
+{	/* Delegate to the allocator's print_stats op; @s and @lock pass through. */
+	__a->ops->print_stats(__a, s, lock);
+}
+
+/* seq_file show callback: print stats for the allocator held in s->private. */
+static int __alloc_show(struct seq_file *s, void *unused)
+{
+	struct nvgpu_allocator *allocator = s->private;
+
+	nvgpu_alloc_print_stats(allocator, s, 1);	/* 1 is the 'lock' argument */
+	return 0;
+}
+
+static int __alloc_open(struct inode *inode, struct file *file)
+{	/* i_private is handed to __alloc_show() as the seq_file private data. */
+	return single_open(file, __alloc_show, inode->i_private);
+}
+
+static const struct file_operations __alloc_fops = {	/* seq_file backed, no write handler */
+	.open = __alloc_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,	/* pairs with single_open() in __alloc_open() */
+};
+
+/* Create a read-only debugfs file, named after @a, in the allocators dir. */
+void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a)
+{
+	struct dentry *parent = g->debugfs_allocators;
+
+	if (parent)	/* NULL when the "allocators" directory does not exist */
+		a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO,
+						       parent, a, &__alloc_fops);
+}
+
+void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a)
+{	/* Remove @a's debugfs file, if one was successfully created. */
+	if (!IS_ERR_OR_NULL(a->debugfs_entry))
+		debugfs_remove(a->debugfs_entry);
+}
+
+/* Create the "allocators" debugfs directory and its "tracing" u32 switch. */
+void nvgpu_alloc_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+	struct dentry *d = debugfs_create_dir("allocators", platform->debugfs);
+
+	if (IS_ERR_OR_NULL(d)) {
+		g->debugfs_allocators = NULL;	/* nvgpu_init_alloc_debug() tests this */
+		return;
+	}
+	g->debugfs_allocators = d;
+	debugfs_create_u32("tracing", 0664, d, &nvgpu_alloc_tracing_on);
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_allocator.h b/drivers/gpu/nvgpu/common/linux/debug_allocator.h
new file mode 100644
index 000000000..1b21cfc5d
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_allocator.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_ALLOCATOR_H__
+#define __NVGPU_DEBUG_ALLOCATOR_H__
+
+struct gk20a;	/* forward declaration; only a pointer is used below */
+void nvgpu_alloc_debugfs_init(struct gk20a *g);	/* creates the "allocators" debugfs dir */
+
+#endif /* __NVGPU_DEBUG_ALLOCATOR_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_cde.c b/drivers/gpu/nvgpu/common/linux/debug_cde.c
new file mode 100644
index 000000000..eb7c33e23
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_cde.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_cde.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+
+
+/* Write handler of "reload_cde_firmware": any write calls gk20a_cde_reload(). */
+static ssize_t gk20a_cde_reload_write(struct file *file,
+	const char __user *userbuf, size_t count, loff_t *ppos)
+{
+	gk20a_cde_reload(file->private_data);	/* the struct gk20a * of the file */
+	return count;	/* the written bytes are not inspected, only counted */
+}
+
+static const struct file_operations gk20a_cde_reload_fops = {	/* no read handler */
+	.open		= simple_open,	/* copies i_private into file->private_data */
+	.write		= gk20a_cde_reload_write,
+};
+
+void gk20a_cde_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+	struct dentry *root = platform->debugfs;
+	const umode_t rw = S_IWUSR | S_IRUGO;
+
+	if (!platform->has_cde)	/* nothing to expose on platforms without CDE */
+		return;
+	debugfs_create_u32("cde_parameter", rw, root,
+			   &g->cde_app.shader_parameter);
+	debugfs_create_u32("cde_ctx_count", rw, root, &g->cde_app.ctx_count);
+	debugfs_create_u32("cde_ctx_usecount", rw, root,
+			   &g->cde_app.ctx_usecount);
+	debugfs_create_u32("cde_ctx_count_top", rw, root,
+			   &g->cde_app.ctx_count_top);
+	debugfs_create_file("reload_cde_firmware", S_IWUSR, root, g,
+			    &gk20a_cde_reload_fops);
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_cde.h b/drivers/gpu/nvgpu/common/linux/debug_cde.h
new file mode 100644
index 000000000..4895edd69
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_cde.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_CDE_H__
+#define __NVGPU_DEBUG_CDE_H__
+
+struct gk20a;	/* forward declaration; only a pointer is used below */
+void gk20a_cde_debugfs_init(struct gk20a *g);	/* no-op unless the platform has CDE */
+
+#endif /* __NVGPU_DEBUG_CDE_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_ce.c b/drivers/gpu/nvgpu/common/linux/debug_ce.c
new file mode 100644
index 000000000..9c50870eb
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_ce.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_ce.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+
+void gk20a_ce_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+	struct dentry *root = platform->debugfs;
+	const umode_t rw = S_IWUSR | S_IRUGO;	/* owner-writable, world-readable */
+
+	debugfs_create_u32("ce_app_ctx_count", rw, root, &g->ce_app.ctx_count);
+	debugfs_create_u32("ce_app_state", rw, root, &g->ce_app.app_state);
+	debugfs_create_u32("ce_app_next_ctx_id", rw, root,
+			   &g->ce_app.next_ctx_id);
+}
diff --git a/drivers/gpu/nvgpu/gm20b/debug_gm20b.h b/drivers/gpu/nvgpu/common/linux/debug_ce.h
similarity index 68%
rename from drivers/gpu/nvgpu/gm20b/debug_gm20b.h
rename to drivers/gpu/nvgpu/common/linux/debug_ce.h
index c3c5fed6c..2a8750c4f 100644
--- a/drivers/gpu/nvgpu/gm20b/debug_gm20b.h
+++ b/drivers/gpu/nvgpu/common/linux/debug_ce.h
@@ -1,7 +1,5 @@
 /*
- * GM20B Debug functionality
- *
- * Copyright (C) 2015 NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -14,11 +12,10 @@
  *
  */
 
-#ifndef _DEBUG_GM20B_H_
-#define _DEBUG_GM20B_H_
+#ifndef __NVGPU_DEBUG_CE_H__
+#define __NVGPU_DEBUG_CE_H__
 
-struct gpu_ops;
+struct gk20a;
+void gk20a_ce_debugfs_init(struct gk20a *g);
 
-void gm20b_init_debug_ops(struct gpu_ops *gops);
-
-#endif
+#endif /* __NVGPU_DEBUG_CE_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_fifo.c b/drivers/gpu/nvgpu/common/linux/debug_fifo.c
new file mode 100644
index 000000000..6a28b1a52
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_fifo.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_fifo.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <nvgpu/sort.h>
+
+void __gk20a_fifo_profile_free(struct kref *ref);	/* kref release callback, defined below */
+
+/* seq_file start: map *pos to a channel slot, or NULL once past the last. */
+static void *gk20a_fifo_sched_debugfs_seq_start(
+		struct seq_file *s, loff_t *pos)
+{
+	struct gk20a *g = s->private;
+	struct fifo_gk20a *fifo = &g->fifo;
+	loff_t idx = *pos;
+
+	/* The iterator cursor is simply the index into fifo->channel[]. */
+	return idx < fifo->num_channels ? &fifo->channel[idx] : NULL;
+}
+
+/* seq_file next: advance *pos and return that channel, NULL at the end. */
+static void *gk20a_fifo_sched_debugfs_seq_next(
+		struct seq_file *s, void *v, loff_t *pos)
+{
+	struct gk20a *g = s->private;
+	struct fifo_gk20a *fifo = &g->fifo;
+	loff_t idx = *pos + 1;
+
+	*pos = idx;	/* the cursor moves even when iteration ends */
+
+	return idx < fifo->num_channels ? &fifo->channel[idx] : NULL;
+}
+
+static void gk20a_fifo_sched_debugfs_seq_stop(
+		struct seq_file *s, void *v)
+{	/* Nothing to undo: the start/next callbacks take no lock or reference. */
+}
+
+static int gk20a_fifo_sched_debugfs_seq_show(
+		struct seq_file *s, void *v)
+{	/* Print one table row for channel @v; the first slot also prints the header. */
+	struct gk20a *g = s->private;
+	struct fifo_gk20a *f = &g->fifo;
+	struct channel_gk20a *ch = v;
+	struct tsg_gk20a *tsg = NULL;
+
+	struct fifo_engine_info_gk20a *engine_info;
+	struct fifo_runlist_info_gk20a *runlist;
+	u32 runlist_id;
+	int ret = SEQ_SKIP;
+	u32 engine_id;
+	/* Channels are filtered against the runlist of the GR engine. */
+	engine_id = gk20a_fifo_get_gr_engine_id(g);
+	engine_info = (f->engine_info + engine_id);
+	runlist_id = engine_info->runlist_id;
+	runlist = &f->runlist_info[runlist_id];
+	/* The first channel slot doubles as the cue to emit the table header. */
+	if (ch == f->channel) {
+		seq_puts(s, "chid     tsgid    pid      timeslice  timeout  interleave graphics_preempt compute_preempt\n");
+		seq_puts(s, "                            (usecs)   (msecs)\n");
+		ret = 0;
+	}
+	/* Not on the runlist: SEQ_SKIP drops the record, 0 keeps the header. */
+	if (!test_bit(ch->hw_chid, runlist->active_channels))
+		return ret;
+	/* Timeslice and interleave come from the TSG when the channel has one. */
+	if (gk20a_channel_get(ch)) {
+		if (gk20a_is_channel_marked_as_tsg(ch))
+			tsg = &f->tsg[ch->tsgid];
+
+		seq_printf(s, "%-8d %-8d %-8d %-9d %-8d %-10d %-8d %-8d\n",
+				ch->hw_chid,
+				ch->tsgid,
+				ch->tgid,	/* printed under the "pid" column */
+				tsg ? tsg->timeslice_us : ch->timeslice_us,
+				ch->timeout_ms_max,
+				tsg ? tsg->interleave_level : ch->interleave_level,
+				ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->graphics_preempt_mode : U32_MAX,
+				ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->compute_preempt_mode : U32_MAX);
+		gk20a_channel_put(ch);	/* balances gk20a_channel_get() above */
+	}
+	return 0;
+}
+
+static const struct seq_operations gk20a_fifo_sched_debugfs_seq_ops = {	/* walks f->channel[] */
+	.start = gk20a_fifo_sched_debugfs_seq_start,
+	.next = gk20a_fifo_sched_debugfs_seq_next,
+	.stop = gk20a_fifo_sched_debugfs_seq_stop,
+	.show = gk20a_fifo_sched_debugfs_seq_show
+};
+
+static int gk20a_fifo_sched_debugfs_open(struct inode *inode,
+	struct file *file)
+{
+	struct seq_file *seq;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))	/* restrict to privileged callers */
+		return -EPERM;
+
+	err = seq_open(file, &gk20a_fifo_sched_debugfs_seq_ops);
+	if (err)
+		return err;
+	gk20a_dbg(gpu_dbg_info, "i_private=%p", inode->i_private);
+	seq = file->private_data;	/* seq_file set up by seq_open() */
+	seq->private = inode->i_private;	/* read back as struct gk20a * by the seq ops */
+	return 0;
+}
+
+/*
+ * The file operations structure pairs our open function with the
+ * canned seq_file read/llseek/release helpers.
+ */
+static const struct file_operations gk20a_fifo_sched_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = gk20a_fifo_sched_debugfs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+
+/* debugfs "enable" setter: 0 stops profiling, non-zero starts it. */
+static int gk20a_fifo_profile_enable(void *data, u64 val)
+{
+	struct gk20a *g = data;
+	struct fifo_gk20a *f = &g->fifo;
+	int err = 0;
+
+	nvgpu_mutex_acquire(&f->profile.lock);
+	if (val == 0) {
+		if (f->profile.enabled) {
+			f->profile.enabled = false;
+			kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
+		}
+	} else if (!f->profile.enabled) {
+		/*
+		 * Take a reference rather than calling kref_init() blindly:
+		 * an enable/disable/enable sequence can race with a kickoff
+		 * that is still in flight.
+		 */
+		if (!kref_get_unless_zero(&f->profile.ref)) {
+			/* Allocate with the API that nvgpu_vfree() pairs with. */
+			f->profile.data = nvgpu_vzalloc(g, FIFO_PROFILING_ENTRIES *
+					sizeof(struct fifo_profile_gk20a));
+			f->profile.sorted = nvgpu_vzalloc(g, FIFO_PROFILING_ENTRIES *
+					sizeof(u64));
+			if (!(f->profile.data && f->profile.sorted)) {
+				nvgpu_vfree(g, f->profile.data);
+				nvgpu_vfree(g, f->profile.sorted);
+				err = -ENOMEM;
+				goto out;
+			}
+			kref_init(&f->profile.ref);
+		}
+		atomic_set(&f->profile.get, 0);
+		f->profile.enabled = true;
+	}
+out:
+	nvgpu_mutex_release(&f->profile.lock);
+	return err;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(
+	gk20a_fifo_profile_enable_debugfs_fops,
+	NULL,	/* no getter is provided for this attribute */
+	gk20a_fifo_profile_enable,
+	"%llu\n"
+);
+
+/* sort() comparator for u64 samples; a truncated difference could mis-sort. */
+static int __profile_cmp(const void *a, const void *b)
+{
+	const u64 lhs = *(const u64 *)a, rhs = *(const u64 *)b;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/* 100 / 5 = 20 buckets; the stats handler keeps five u64 tables (~800B stack). */
+#define PERCENTILE_WIDTH	5
+#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)
+
+/*
+ * Collect the positive (timestamp[index_end] - timestamp[index_start]) deltas
+ * of all profile entries, sort them and fill @percentiles with one value per
+ * PERCENTILE_WIDTH step. Returns the number of valid samples; with none, the
+ * table is zero-filled.
+ */
+static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
+		u64 *percentiles, u32 index_end, u32 index_start)
+{
+	u64 *sorted = g->fifo.profile.sorted;
+	unsigned int nelem = 0;
+	unsigned int index;
+
+	for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
+		struct fifo_profile_gk20a *p = &g->fifo.profile.data[index];
+
+		/* Only entries whose end stamp is after the start are valid. */
+		if (p->timestamp[index_end] > p->timestamp[index_start])
+			sorted[nelem++] = p->timestamp[index_end] -
+					  p->timestamp[index_start];
+	}
+	sort(sorted, nelem, sizeof(*sorted), __profile_cmp, NULL);
+
+	for (index = 0; index < PERCENTILE_RANGES; index++) {
+		unsigned int rank = PERCENTILE_WIDTH * (index + 1) * nelem / 100;
+
+		/* rank is 0 for small sample counts: never index sorted[-1]. */
+		percentiles[index] = nelem ? sorted[rank ? rank - 1 : 0] : 0;
+	}
+	return nelem;
+}
+
+/* seq_file show handler for "stats": one percentile table row per 5% step. */
+static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
+{
+	struct gk20a *g = s->private;
+	unsigned int nelem, index;
+	/*
+	 * 800B in the stack, but function is declared statically and only
+	 * called from debugfs handler
+	 */
+	u64 percentiles_ioctl[PERCENTILE_RANGES];
+	u64 percentiles_kickoff[PERCENTILE_RANGES];
+	u64 percentiles_jobtracking[PERCENTILE_RANGES];
+	u64 percentiles_append[PERCENTILE_RANGES];
+	u64 percentiles_userd[PERCENTILE_RANGES];
+
+	/* Pin the sample buffers for the duration of the dump. */
+	if (!kref_get_unless_zero(&g->fifo.profile.ref)) {
+		seq_puts(s, "Profiling disabled\n");
+		return 0;
+	}
+
+	__gk20a_fifo_create_stats(g, percentiles_ioctl,
+		PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_kickoff,
+		PROFILE_END, PROFILE_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_jobtracking,
+		PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
+	__gk20a_fifo_create_stats(g, percentiles_append,
+		PROFILE_APPEND, PROFILE_JOB_TRACKING);
+	nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
+		PROFILE_END, PROFILE_APPEND);
+
+	/* nelem is the sample count of the last series (APPEND to END). */
+	seq_printf(s, "Number of kickoffs: %u\n", nelem);
+	seq_puts(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
+
+	for (index = 0; index < PERCENTILE_RANGES; index++)
+		seq_printf(s, "[%2upc]\t%8llu\t%8llu\t%8llu\t%8llu\t%8llu\n",
+			PERCENTILE_WIDTH * (index+1),
+			percentiles_ioctl[index],
+			percentiles_kickoff[index],
+			percentiles_append[index],
+			percentiles_jobtracking[index],
+			percentiles_userd[index]);
+
+	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+	return 0;
+}
+
+static int gk20a_fifo_profile_stats_open(struct inode *inode, struct file *file)
+{	/* i_private is passed on as the seq_file private data (read as struct gk20a *). */
+	return single_open(file, gk20a_fifo_profile_stats, inode->i_private);
+}
+
+static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
+	.open		= gk20a_fifo_profile_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,	/* pairs with single_open() in the open handler */
+};
+
+
+/* Create fifo/sched and fifo/profile/{enable,stats} under the GPU root. */
+void gk20a_fifo_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+	struct dentry *gpu_root = platform->debugfs;
+	struct dentry *fifo_root;
+	struct dentry *profile_root;
+
+	/* Initialise before any early return below, because
+	 * gk20a_fifo_debugfs_deinit() takes profile.lock unconditionally.
+	 */
+	nvgpu_mutex_init(&g->fifo.profile.lock);
+	g->fifo.profile.enabled = false;
+	atomic_set(&g->fifo.profile.get, 0);
+	atomic_set(&g->fifo.profile.ref.refcount, 0);
+
+	fifo_root = debugfs_create_dir("fifo", gpu_root);
+	if (IS_ERR_OR_NULL(fifo_root))
+		return;
+
+	gk20a_dbg(gpu_dbg_info, "g=%p", g);
+	debugfs_create_file("sched", 0600, fifo_root, g,
+		&gk20a_fifo_sched_debugfs_fops);
+
+	profile_root = debugfs_create_dir("profile", fifo_root);
+	if (IS_ERR_OR_NULL(profile_root))
+		return;
+
+	debugfs_create_file("enable", 0600, profile_root, g,
+		&gk20a_fifo_profile_enable_debugfs_fops);
+	debugfs_create_file("stats", 0600, profile_root, g,
+		&gk20a_fifo_profile_stats_debugfs_fops);
+}
+
+void __gk20a_fifo_profile_free(struct kref *ref)
+{	/* kref release callback: frees the profile data and sorted buffers. */
+	struct fifo_gk20a *f = container_of(ref, struct fifo_gk20a,
+						profile.ref);
+	nvgpu_vfree(f->g, f->profile.data);
+	nvgpu_vfree(f->g, f->profile.sorted);
+}
+
+/*
+ * Take a reference on the profiling buffers and hand out the next slot of
+ * the sample ring; NULL means profiling is off (the kref is zero).
+ */
+struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g)
+{
+	struct fifo_gk20a *fifo = &g->fifo;
+	unsigned int slot;
+
+	if (!kref_get_unless_zero(&fifo->profile.ref))
+		return NULL;
+
+	/* The counter only grows; the modulo folds it onto the ring. */
+	slot = atomic_inc_return(&fifo->profile.get);
+
+	return &fifo->profile.data[slot % FIFO_PROFILING_ENTRIES];
+}
+
+/* Drop the reference taken by gk20a_fifo_profile_acquire(); @profile is unused. */
+void gk20a_fifo_profile_release(struct gk20a *g,
+					struct fifo_profile_gk20a *profile)
+{	/* The final put runs __gk20a_fifo_profile_free(), so cleanup can be deferred. */
+	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+}
+
+/* Stop profiling if it is enabled and drop the reference that enabling took. */
+void gk20a_fifo_debugfs_deinit(struct gk20a *g)
+{
+	struct fifo_gk20a *fifo = &g->fifo;
+
+	nvgpu_mutex_acquire(&fifo->profile.lock);
+	if (fifo->profile.enabled)
+		kref_put(&fifo->profile.ref, __gk20a_fifo_profile_free);
+	fifo->profile.enabled = false;
+	nvgpu_mutex_release(&fifo->profile.lock);
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_fifo.h b/drivers/gpu/nvgpu/common/linux/debug_fifo.h
new file mode 100644
index 000000000..46ac853e6
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_fifo.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_FIFO_H__
+#define __NVGPU_DEBUG_FIFO_H__
+
+struct gk20a;	/* forward declaration; only a pointer is used below */
+void gk20a_fifo_debugfs_init(struct gk20a *g);	/* creates the "fifo" debugfs tree */
+void gk20a_fifo_debugfs_deinit(struct gk20a *g);	/* stops profiling if enabled */
+
+#endif /* __NVGPU_DEBUG_FIFO_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_gr.c b/drivers/gpu/nvgpu/common/linux/debug_gr.c
new file mode 100644
index 000000000..56b8612ee
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_gr.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_gr.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+
+/* Expose gr.attrib_cb_default_size as a debugfs u32; always returns 0. */
+int gr_gk20a_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+	struct dentry *entry;
+
+	entry = debugfs_create_u32("gr_default_attrib_cb_size", S_IRUGO|S_IWUSR,
+				   platform->debugfs, &g->gr.attrib_cb_default_size);
+	g->debugfs_gr_default_attrib_cb_size = entry;
+	return 0;
+}
+
diff --git a/drivers/gpu/nvgpu/gm20b/debug_gm20b.c b/drivers/gpu/nvgpu/common/linux/debug_gr.h
similarity index 68%
rename from drivers/gpu/nvgpu/gm20b/debug_gm20b.c
rename to drivers/gpu/nvgpu/common/linux/debug_gr.h
index b266200c0..4b46acbb1 100644
--- a/drivers/gpu/nvgpu/gm20b/debug_gm20b.c
+++ b/drivers/gpu/nvgpu/common/linux/debug_gr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -12,10 +12,10 @@
  *
  */
 
-#include "gk20a/gk20a.h"
-#include "debug_gm20b.h"
+#ifndef __NVGPU_DEBUG_GR_H__
+#define __NVGPU_DEBUG_GR_H__
 
-void gm20b_init_debug_ops(struct gpu_ops *gops)
-{
-	gops->debug.show_dump = gk20a_debug_show_dump;
-}
+struct gk20a;
+int gr_gk20a_debugfs_init(struct gk20a *g);
+
+#endif /* __NVGPU_DEBUG_GR_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_kmem.c b/drivers/gpu/nvgpu/common/linux/debug_kmem.c
new file mode 100644
index 000000000..2ee542a83
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_kmem.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_kmem.h"
+#include "kmem_priv.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
+/**
+ * __to_human_readable_bytes - Scale a byte count to a binary unit.
+ *
+ * @bytes - Number of bytes to generate a suffix for.
+ * @hr_bytes [out] - The human readable number of bytes.
+ * @hr_suffix [out] - The suffix for the HR number of bytes.
+ *
+ * Divides @bytes by 1024 until it drops below 1024 or the largest known unit
+ * is reached. The scaled value is stored in @hr_bytes and the matching unit
+ * string in @hr_suffix. Fractions are truncated. This returns the following
+ * ranges:
+ *
+ *   0 - 1023 B
+ *   1 - 1023 KB
+ *   1 - 1023 MB
+ *   1 - 1023 GB
+ *   1 - 1023 TB
+ *   1 - ...  PB
+ */
+static void __to_human_readable_bytes(u64 bytes, u64 *hr_bytes,
+				      const char **hr_suffix)
+{
+	static const char *suffixes[] =
+		{ "B", "KB", "MB", "GB", "TB", "PB" };
+	const u64 last_ind = ARRAY_SIZE(suffixes) - 1;
+	u64 suffix_ind = 0;
+
+	/*
+	 * Stop scaling once the last suffix is selected. Bounding the loop by
+	 * ARRAY_SIZE() alone allows one shift too many, which used to turn
+	 * 1024 PB into "1 PB". Anything above 1023 PB is now reported as a
+	 * plain PB count.
+	 */
+	while (suffix_ind < last_ind && bytes >= 1024) {
+		bytes >>= 10;
+		suffix_ind++;
+	}
+
+	*hr_bytes = bytes;
+	*hr_suffix = suffixes[suffix_ind];
+}
+
+/**
+ * print_hr_bytes - Print human readable bytes
+ *
+ * @s - A seq_file to print to. May be NULL.
+ * @msg - A message to print before the bytes.
+ * @bytes - Number of bytes.
+ *
+ * Print @msg followed by the human readable decomposition of the passed number
+ * of bytes. The value is a u64, hence the unsigned %llu conversion.
+ *
+ * If @s is NULL then the line is printed to the kernel log instead.
+ */
+static void print_hr_bytes(struct seq_file *s, const char *msg, u64 bytes)
+{
+	const char *unit;
+	u64 scaled;
+
+	__to_human_readable_bytes(bytes, &scaled, &unit);
+	__pstat(s, "%s%llu %s\n", msg, scaled, unit);
+}
+
+/**
+ * print_histogram - Build a histogram of the memory usage.
+ *
+ * @tracker The tracking to pull data from.
+ * @s       A seq_file to dump info into.
+ */
+static void print_histogram(struct nvgpu_mem_alloc_tracker *tracker,
+			    struct seq_file *s)
+{
+	int i;
+	u64 pot_min, pot_max;
+	u64 nr_buckets;
+	unsigned int *buckets;
+	unsigned int total_allocs;
+	struct nvgpu_rbtree_node *node;
+	static const char histogram_line[] =
+		"++++++++++++++++++++++++++++++++++++++++";
+
+	/* The pow-of-two helpers and __ffs() are undefined for 0: bail out. */
+	if (!tracker->min_alloc || tracker->max_alloc < tracker->min_alloc)
+		return;
+
+	/*
+	 * pot_min is min_alloc rounded down to a power of two and starts the
+	 * histogram; pot_max is max_alloc rounded up to one. Each bucket spans
+	 * one power of two, so the histogram buckets are exponential.
+	 */
+	pot_min = (u64)rounddown_pow_of_two(tracker->min_alloc);
+	pot_max = (u64)roundup_pow_of_two(tracker->max_alloc);
+
+	/* pot_min == pot_max (one power-of-two size) still needs a bucket. */
+	nr_buckets = max_t(u64, __ffs(pot_max) - __ffs(pot_min), 1);
+
+	buckets = kcalloc(nr_buckets, sizeof(*buckets), GFP_KERNEL);
+	if (!buckets) {
+		__pstat(s, "OOM: could not allocate bucket storage!?\n");
+		return;
+	}
+
+	/* Bin each alloc by its size rounded down to a power of two. */
+	nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
+	while (node) {
+		u64 b = 0;
+		u64 bucket_min;
+		struct nvgpu_mem_alloc *alloc =
+			nvgpu_mem_alloc_from_rbtree_node(node);
+
+		/* Sizes at or below pot_min (including 0) land in bucket 0. */
+		bucket_min = alloc->size ?
+			(u64)rounddown_pow_of_two(alloc->size) : 0;
+		if (bucket_min > pot_min)
+			b = __ffs(bucket_min) - __ffs(pot_min);
+
+		/*
+		 * The last bucket has an _inclusive_ maximum: an alloc exactly
+		 * as big as pot_max belongs there, not one past the array.
+		 */
+		if (b >= nr_buckets)
+			b = nr_buckets - 1;
+
+		buckets[b]++;
+
+		nvgpu_rbtree_enum_next(&node, node);
+	}
+
+	total_allocs = 0;
+	for (i = 0; i < (int)nr_buckets; i++)
+		total_allocs += buckets[i];
+
+	__pstat(s, "Alloc histogram:\n");
+
+	/* Actually compute the histogram lines. */
+	for (i = 0; i < (int)nr_buckets; i++) {
+		char this_line[sizeof(histogram_line) + 1];
+		u64 line_length = 0;
+		u64 hr_bytes;
+		const char *hr_suffix;
+
+		memset(this_line, 0, sizeof(this_line));
+
+		/*
+		 * Compute the normalized line length. Can't use floating point
+		 * so we will just multiply everything by 1000 and use fixed
+		 * point. With no outstanding allocs every bar stays empty.
+		 */
+		if (total_allocs) {
+			line_length = (1000 * buckets[i]) / total_allocs;
+			line_length *= sizeof(histogram_line);
+			line_length /= 1000;
+		}
+
+		memset(this_line, '+', line_length);
+
+		/* 64-bit shift: bucket bounds can exceed what an int holds. */
+		__to_human_readable_bytes(1ULL << (__ffs(pot_min) + i),
+					  &hr_bytes, &hr_suffix);
+		__pstat(s, "  [%-4llu %-4llu] %-2s %5u | %s\n",
+			hr_bytes, hr_bytes << 1,
+			hr_suffix, buckets[i], this_line);
+	}
+
+	kfree(buckets);
+}
+
+/**
+ * nvgpu_kmem_print_stats - Print kmem tracking stats.
+ *
+ * @tracker The tracking to pull data from.
+ * @s       A seq_file to dump info into.
+ *
+ * Dumps the counters and the size histogram of @tracker while holding the
+ * tracker lock. With a non-NULL @s seq_printf() is used, else pr_info().
+ */
+void nvgpu_kmem_print_stats(struct nvgpu_mem_alloc_tracker *tracker,
+			    struct seq_file *s)
+{
+	nvgpu_lock_tracker(tracker);
+
+	/* Header and raw event counters. */
+	__pstat(s, "Mem tracker: %s\n\n", tracker->name);
+	__pstat(s, "Basic Stats:\n");
+	__pstat(s, "  Number of allocs        %lld\n", tracker->nr_allocs);
+	__pstat(s, "  Number of frees         %lld\n", tracker->nr_frees);
+
+	/* Sizes and byte totals, scaled to a readable unit. */
+	print_hr_bytes(s, "  Smallest alloc          ", tracker->min_alloc);
+	print_hr_bytes(s, "  Largest alloc           ", tracker->max_alloc);
+	print_hr_bytes(s, "  Bytes allocated         ", tracker->bytes_alloced);
+	print_hr_bytes(s, "  Bytes freed             ", tracker->bytes_freed);
+	print_hr_bytes(s, "  Bytes allocated (real)  ",
+		       tracker->bytes_alloced_real);
+	print_hr_bytes(s, "  Bytes freed (real)      ",
+		       tracker->bytes_freed_real);
+	__pstat(s, "\n");
+
+	print_histogram(tracker, s);
+
+	nvgpu_unlock_tracker(tracker);
+}
+
+/* seq_file show callback: dump the tracker stored in s->private. */
+static int __kmem_tracking_show(struct seq_file *s, void *unused)
+{
+	struct nvgpu_mem_alloc_tracker *mem_tracker = s->private;
+
+	nvgpu_kmem_print_stats(mem_tracker, s);
+	return 0;
+}
+
+static int __kmem_tracking_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, __kmem_tracking_show, inode->i_private);
+}
+
+static const struct file_operations __kmem_tracking_fops = {
+	.open = __kmem_tracking_open,
+	.release = single_release,	/* pairs with single_open() */
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static int __kmem_traces_dump_tracker(struct gk20a *g,
+				      struct nvgpu_mem_alloc_tracker *tracker,
+				      struct seq_file *s)
+{
+	struct nvgpu_rbtree_node *cur;
+	struct nvgpu_mem_alloc *record;
+
+	/* Print the record behind each node found in tracker->allocs. */
+	nvgpu_rbtree_enum_start(0, &cur, tracker->allocs);
+	while (cur) {
+		record = nvgpu_mem_alloc_from_rbtree_node(cur);
+		kmem_print_mem_alloc(g, record, s);
+
+		nvgpu_rbtree_enum_next(&cur, cur);
+	}
+
+	return 0;
+}
+
+static int __kmem_traces_show(struct seq_file *s, void *unused)
+{
+	struct gk20a *gpu = s->private;
+
+	/* vmalloc records first; each tracker is dumped under its own lock. */
+	nvgpu_lock_tracker(gpu->vmallocs);
+	seq_puts(s, "Oustanding vmallocs:\n");
+	__kmem_traces_dump_tracker(gpu, gpu->vmallocs, s);
+	seq_puts(s, "\n");
+	nvgpu_unlock_tracker(gpu->vmallocs);
+
+	nvgpu_lock_tracker(gpu->kmallocs);
+	seq_puts(s, "Oustanding kmallocs:\n");
+	__kmem_traces_dump_tracker(gpu, gpu->kmallocs, s);
+	nvgpu_unlock_tracker(gpu->kmallocs);
+	return 0;
+}
+
+static int __kmem_traces_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, __kmem_traces_show, inode->i_private);
+}
+
+static const struct file_operations __kmem_traces_fops = {
+	.open = __kmem_traces_open,
+	.release = single_release,	/* pairs with single_open() */
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+void nvgpu_kmem_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+
+	g->debugfs_kmem = debugfs_create_dir("kmem_tracking", platform->debugfs);
+	if (IS_ERR_OR_NULL(g->debugfs_kmem))
+		return;
+
+	/*
+	 * One stats node per tracker plus a combined "traces" node. These are
+	 * best effort: a node that fails to appear is not treated as an error.
+	 */
+	debugfs_create_file(g->vmallocs->name, S_IRUGO, g->debugfs_kmem,
+			    g->vmallocs, &__kmem_tracking_fops);
+	debugfs_create_file(g->kmallocs->name, S_IRUGO, g->debugfs_kmem,
+			    g->kmallocs, &__kmem_tracking_fops);
+	debugfs_create_file("traces", S_IRUGO, g->debugfs_kmem,
+			    g, &__kmem_traces_fops);
+}
+#endif
diff --git a/drivers/gpu/nvgpu/common/linux/debug_kmem.h b/drivers/gpu/nvgpu/common/linux/debug_kmem.h
new file mode 100644
index 000000000..44322b533
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_kmem.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_KMEM_H__
+#define __NVGPU_DEBUG_KMEM_H__
+
+struct gk20a;
+#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
+void nvgpu_kmem_debugfs_init(struct gk20a *g);
+#endif
+
+#endif /* __NVGPU_DEBUG_KMEM_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_mm.c b/drivers/gpu/nvgpu/common/linux/debug_mm.c
new file mode 100644
index 000000000..1e260f898
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_mm.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_mm.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+
+void gk20a_mm_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *plat = dev_get_drvdata(g->dev);
+	struct dentry *root = plat->debugfs;
+
+	debugfs_create_bool("force_pramin", 0664, root, &g->mm.force_pramin);
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_mm.h b/drivers/gpu/nvgpu/common/linux/debug_mm.h
new file mode 100644
index 000000000..bf7bc9851
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_mm.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_MM_H__
+#define __NVGPU_DEBUG_MM_H__
+
+struct gk20a;
+void gk20a_mm_debugfs_init(struct gk20a *g);
+
+#endif /* __NVGPU_DEBUG_MM_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_pmu.c b/drivers/gpu/nvgpu/common/linux/debug_pmu.c
new file mode 100644
index 000000000..f19f51398
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_pmu.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_pmu.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+
+static int lpwr_debug_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+
+	/* ELPG-only view when GR reports plain power gating (or no hook). */
+	if (!g->ops.pmu.pmu_pg_engines_feature_list ||
+	    g->ops.pmu.pmu_pg_engines_feature_list(g,
+			PMU_PG_ELPG_ENGINE_ID_GRAPHICS) ==
+	    PMU_PG_FEATURE_GR_POWER_GATING_ENABLED) {
+		seq_printf(s, "ELPG Enabled: %u\n"
+			"ELPG ref count: %u\n"
+			"ELPG state: %u\n",
+			g->elpg_enabled, g->pmu.elpg_refcnt,
+			g->pmu.elpg_stat);
+		return 0;
+	}
+
+	seq_printf(s, "PSTATE: %u\n"
+		"RPPG Enabled: %u\n"
+		"RPPG ref count: %u\n"
+		"RPPG state: %u\n"
+		"MSCG Enabled: %u\n"
+		"MSCG pstate state: %u\n"
+		"MSCG transition state: %u\n",
+		g->ops.clk_arb.get_current_pstate(g),
+		g->elpg_enabled, g->pmu.elpg_refcnt,
+		g->pmu.elpg_stat, g->mscg_enabled,
+		g->pmu.mscg_stat, g->pmu.mscg_transition_state);
+	return 0;
+}
+
+static int lpwr_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lpwr_debug_show, inode->i_private);
+}
+
+static const struct file_operations lpwr_debug_fops = {
+	.open		= lpwr_debug_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/*
+ * debugfs "mscg_residency": MS gating times, residency and entry statistics.
+ */
+static int mscg_stat_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+	struct pmu_pg_stats_data stats = { 0 };
+	u64 time_in, time_out, total;
+	u64 residency = 0;
+	int err;
+
+	/*
+	 * Only query the PMU when the GPU is already powered; otherwise the
+	 * zero-initialised stats are used and just the totals in g show up.
+	 */
+	if (g->power_on) {
+		err = gk20a_busy(g);
+		if (err)
+			return err;
+
+		gk20a_pmu_get_pg_stats(g, PMU_PG_ELPG_ENGINE_ID_MS, &stats);
+		gk20a_idle(g);
+	}
+
+	time_in = g->pg_ingating_time_us + (u64)stats.ingating_time;
+	time_out = g->pg_ungating_time_us + (u64)stats.ungating_time;
+	total = time_in + time_out;
+
+	/* Residency is reported in thousandths; 0 when no time was logged. */
+	if (total)
+		residency = div64_u64(time_in * 1000, total);
+
+	seq_printf(s,
+			"Time in MSCG: %llu us\n"
+			"Time out of MSCG: %llu us\n"
+			"MSCG residency ratio: %llu\n"
+			"MSCG Entry Count: %u\n"
+			"MSCG Avg Entry latency %u\n"
+			"MSCG Avg Exit latency %u\n",
+			time_in, time_out, residency,
+			stats.gating_cnt,
+			stats.avg_entry_latency_us,
+			stats.avg_exit_latency_us);
+
+	return 0;
+}
+
+static int mscg_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mscg_stat_show, inode->i_private);
+}
+
+static const struct file_operations mscg_stat_fops = {
+	.open		= mscg_stat_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/* debugfs "mscg_transitions": g->pg_gating_cnt plus the PMU's MS count. */
+static int mscg_transitions_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+	struct pmu_pg_stats_data stats = { 0 };
+	u32 transitions;
+	int err;
+
+	/* The PMU is only queried when the GPU is already powered on. */
+	if (g->power_on) {
+		err = gk20a_busy(g);
+		if (err)
+			return err;
+
+		gk20a_pmu_get_pg_stats(g, PMU_PG_ELPG_ENGINE_ID_MS, &stats);
+		gk20a_idle(g);
+	}
+
+	transitions = g->pg_gating_cnt + stats.gating_cnt;
+	seq_printf(s, "%u\n", transitions);
+	return 0;
+}
+
+static int mscg_transitions_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mscg_transitions_show, inode->i_private);
+}
+
+static const struct file_operations mscg_transitions_fops = {
+	.open		= mscg_transitions_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/*
+ * debugfs "elpg_residency": GR gating times, residency and entry statistics.
+ */
+static int elpg_stat_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+	struct pmu_pg_stats_data pg = { 0 };
+	u64 time_in, time_out, total;
+	u64 residency = 0;
+	int err;
+
+	/*
+	 * Only query the PMU when the GPU is already powered; otherwise the
+	 * zero-initialised stats are used and just the totals in g show up.
+	 */
+	if (g->power_on) {
+		err = gk20a_busy(g);
+		if (err)
+			return err;
+
+		gk20a_pmu_get_pg_stats(g, PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg);
+		gk20a_idle(g);
+	}
+
+	time_in = g->pg_ingating_time_us + (u64)pg.ingating_time;
+	time_out = g->pg_ungating_time_us + (u64)pg.ungating_time;
+	total = time_in + time_out;
+
+	/* Residency is reported in thousandths; 0 when no time was logged. */
+	if (total)
+		residency = div64_u64(time_in * 1000, total);
+
+	seq_printf(s,
+			"Time in ELPG: %llu us\n"
+			"Time out of ELPG: %llu us\n"
+			"ELPG residency ratio: %llu\n"
+			"ELPG Entry Count: %u\n"
+			"ELPG Avg Entry latency %u us\n"
+			"ELPG Avg Exit latency %u us\n",
+			time_in, time_out, residency,
+			pg.gating_cnt,
+			pg.avg_entry_latency_us,
+			pg.avg_exit_latency_us);
+	return 0;
+}
+
+static int elpg_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, elpg_stat_show, inode->i_private);
+}
+
+static const struct file_operations elpg_stat_fops = {
+	.open		= elpg_stat_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/* debugfs "elpg_transitions": g->pg_gating_cnt plus the PMU's GR count. */
+static int elpg_transitions_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+	struct pmu_pg_stats_data pg = { 0 };
+	u32 transitions;
+	int err;
+
+	/* The PMU is only queried when the GPU is already powered on. */
+	if (g->power_on) {
+		err = gk20a_busy(g);
+		if (err)
+			return err;
+
+		gk20a_pmu_get_pg_stats(g, PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg);
+		gk20a_idle(g);
+	}
+
+	transitions = g->pg_gating_cnt + pg.gating_cnt;
+	seq_printf(s, "%u\n", transitions);
+	return 0;
+}
+
+static int elpg_transitions_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, elpg_transitions_show, inode->i_private);
+}
+
+static const struct file_operations elpg_transitions_fops = {
+	.open		= elpg_transitions_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+static int falc_trace_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+	struct pmu_gk20a *pmu = &g->pmu;
+	u32 i = 0, j = 0, k, l, m;
+	char part_str[40];
+	void *tracebuffer;
+	char *trace;
+	u32 *trace1;
+
+	/* allocate system memory to copy pmu trace buffer */
+	tracebuffer = nvgpu_kzalloc(g, GK20A_PMU_TRACE_BUFSIZE);
+	if (tracebuffer == NULL)
+		return -ENOMEM;
+
+	/* read pmu traces into system memory buffer */
+	nvgpu_mem_rd_n(g, &pmu->trace_buf,
+		       0, tracebuffer, GK20A_PMU_TRACE_BUFSIZE);
+
+	trace = (char *)tracebuffer;
+	trace1 = (u32 *)tracebuffer;
+
+	for (i = 0; i < GK20A_PMU_TRACE_BUFSIZE; i += 0x40) {
+		for (j = 0; j < 0x40 / 4; j++)	/* words of this record only */
+			if (trace1[(i / 4) + j])
+				break;
+		if (j == 0x40 / 4)
+			break;
+		seq_printf(s, "Index %x: ", trace1[(i / 4)]);
+		l = 0;
+		m = 0;
+		while (nvgpu_find_hex_in_string((trace+i+20+m), g, &k)) {
+			if (k >= 40)
+				break;
+			strncpy(part_str, (trace+i+20+m), k);
+			part_str[k] = 0;
+			seq_printf(s, "%s0x%x", part_str,
+					trace1[(i / 4) + 1 + l]);
+			l++;
+			m += k + 2;
+		}
+		seq_printf(s, "%s", (trace+i+20+m));
+	}
+
+	nvgpu_kfree(g, tracebuffer);
+	return 0;
+}
+
+static int falc_trace_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, falc_trace_show, inode->i_private);
+}
+
+static const struct file_operations falc_trace_fops = {
+	.open		= falc_trace_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+static int perfmon_events_enable_show(struct seq_file *s, void *data)
+{
+	struct gk20a *g = s->private;
+	unsigned int flag = g->pmu.perfmon_sampling_enabled ? 1 : 0;
+
+	seq_printf(s, "%u\n", flag);	/* 1 = sampling enabled, 0 = off */
+	return 0;
+}
+
+static int perfmon_events_enable_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, perfmon_events_enable_show, inode->i_private);
+}
+
+static ssize_t perfmon_events_enable_write(struct file *file,
+	const char __user *userbuf, size_t count, loff_t *ppos)
+{
+	struct seq_file *s = file->private_data;
+	struct gk20a *g = s->private;
+	char buf[40] = { 0 };
+	unsigned long val = 0;
+	int buf_size;
+	int err;
+
+	/* Copy at most sizeof(buf) - 1 bytes so buf stays NUL terminated. */
+	buf_size = min(count, (sizeof(buf)-1));
+	if (copy_from_user(buf, userbuf, buf_size))
+		return -EFAULT;
+
+	if (kstrtoul(buf, 10, &val) < 0)
+		return -EINVAL;
+
+	/* GPU is off: only record the request, don't power it up for this. */
+	if (!g->power_on) {
+		g->pmu.perfmon_sampling_enabled = val ? true : false;
+		return count;
+	}
+
+	err = gk20a_busy(g);
+	if (err)
+		return err;
+
+	if (val && !g->pmu.perfmon_sampling_enabled) {
+		g->pmu.perfmon_sampling_enabled = true;
+		nvgpu_pmu_perfmon_start_sampling(&(g->pmu));
+	} else if (!val && g->pmu.perfmon_sampling_enabled) {
+		g->pmu.perfmon_sampling_enabled = false;
+		nvgpu_pmu_perfmon_stop_sampling(&(g->pmu));
+	}
+	gk20a_idle(g);
+
+	return count;
+}
+
+static const struct file_operations perfmon_events_enable_fops = {
+	.open		= perfmon_events_enable_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.write		= perfmon_events_enable_write,
+	.llseek		= seq_lseek,
+};
+
+/* debugfs "perfmon_events_count": prints g->pmu.perfmon_events_cnt. */
+static int perfmon_events_count_show(struct seq_file *s, void *data)
+{
+	struct gk20a *gpu = s->private;
+
+	seq_printf(s, "%lu\n", gpu->pmu.perfmon_events_cnt);
+	return 0;
+}
+
+static int perfmon_events_count_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, perfmon_events_count_show, inode->i_private);
+}
+
+static const struct file_operations perfmon_events_count_fops = {
+	.open		= perfmon_events_count_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/* debugfs "pmu_security": prints g->pmu.pmu_mode as a decimal number. */
+static int security_show(struct seq_file *s, void *data)
+{
+	struct gk20a *gpu = s->private;
+
+	seq_printf(s, "%d\n", gpu->pmu.pmu_mode);
+	return 0;
+}
+
+static int security_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, security_show, inode->i_private);
+}
+
+static const struct file_operations security_fops = {
+	.open		= security_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/*
+ * Create the PMU debugfs nodes directly under the platform debugfs directory.
+ * Returns 0 on success. If any node cannot be created the whole platform
+ * debugfs directory is removed and -ENOMEM is returned.
+ */
+int gk20a_pmu_debugfs_init(struct gk20a *g)
+{
+	struct dentry *d;
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
+	struct dentry *root = platform->debugfs;
+
+	d = debugfs_create_file("lpwr_debug", S_IRUGO|S_IWUSR, root, g,
+				&lpwr_debug_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("mscg_residency", S_IRUGO|S_IWUSR, root, g,
+				&mscg_stat_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("mscg_transitions", S_IRUGO, root, g,
+				&mscg_transitions_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("elpg_residency", S_IRUGO|S_IWUSR, root, g,
+				&elpg_stat_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("elpg_transitions", S_IRUGO, root, g,
+				&elpg_transitions_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("falc_trace", S_IRUGO, root, g,
+				&falc_trace_fops);
+	if (!d)
+		goto err_out;
+
+	/* This node has a write handler, so it must be owner-writable. */
+	d = debugfs_create_file("perfmon_events_enable", S_IRUGO|S_IWUSR,
+				root, g, &perfmon_events_enable_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("perfmon_events_count", S_IRUGO, root, g,
+				&perfmon_events_count_fops);
+	if (!d)
+		goto err_out;
+
+	d = debugfs_create_file("pmu_security", S_IRUGO, root, g,
+				&security_fops);
+	if (!d)
+		goto err_out;
+
+	return 0;
+
+err_out:
+	pr_err("%s: Failed to make debugfs node\n", __func__);
+	debugfs_remove_recursive(platform->debugfs);
+	return -ENOMEM;
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_pmu.h b/drivers/gpu/nvgpu/common/linux/debug_pmu.h
new file mode 100644
index 000000000..c4e3243d8
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_pmu.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_PMU_H__
+#define __NVGPU_DEBUG_PMU_H__
+
+struct gk20a;
+int gk20a_pmu_debugfs_init(struct gk20a *g);
+
+#endif /* __NVGPU_DEBUG_PMU_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/debug_sched.c b/drivers/gpu/nvgpu/common/linux/debug_sched.c
new file mode 100644
index 000000000..40b93149c
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_sched.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "debug_sched.h"
+#include "gk20a/platform_gk20a.h"
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static int gk20a_sched_debugfs_show(struct seq_file *s, void *unused)
+{
+	struct gk20a *g = s->private;
+	struct gk20a_sched_ctrl *ctrl = &g->sched_ctrl;
+	int nr_words = ctrl->bitmap_size / sizeof(u64);
+	bool busy;
+	int w;
+	int err;
+
+	err = gk20a_busy(g);
+	if (err)
+		return err;
+
+	/* "busy" means busy_lock could not be taken right now. */
+	busy = !nvgpu_mutex_tryacquire(&ctrl->busy_lock);
+	if (!busy)
+		nvgpu_mutex_release(&ctrl->busy_lock);
+
+	seq_printf(s, "control_locked=%d\n", ctrl->control_locked);
+	seq_printf(s, "busy=%d\n", busy);
+	seq_printf(s, "bitmap_size=%zu\n", ctrl->bitmap_size);
+
+	/* Both bitmaps are dumped as 64-bit words under status_lock. */
+	nvgpu_mutex_acquire(&ctrl->status_lock);
+
+	seq_puts(s, "active_tsg_bitmap\n");
+	for (w = 0; w < nr_words; w++)
+		seq_printf(s, "\t0x%016llx\n", ctrl->active_tsg_bitmap[w]);
+
+	seq_puts(s, "recent_tsg_bitmap\n");
+	for (w = 0; w < nr_words; w++)
+		seq_printf(s, "\t0x%016llx\n", ctrl->recent_tsg_bitmap[w]);
+
+	nvgpu_mutex_release(&ctrl->status_lock);
+
+	gk20a_idle(g);
+
+	return 0;
+}
+
+static int gk20a_sched_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, gk20a_sched_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations gk20a_sched_debugfs_fops = {
+	.open		= gk20a_sched_debugfs_open,
+	.release	= single_release,	/* pairs with single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+void gk20a_sched_debugfs_init(struct gk20a *g)
+{
+	struct gk20a_platform *plat = dev_get_drvdata(g->dev);
+
+	debugfs_create_file("sched_ctrl", S_IRUGO, plat->debugfs, g,
+			    &gk20a_sched_debugfs_fops);	/* read-only node */
+}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_sched.h b/drivers/gpu/nvgpu/common/linux/debug_sched.h
new file mode 100644
index 000000000..34a8f55f4
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/debug_sched.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __NVGPU_DEBUG_SCHED_H__
+#define __NVGPU_DEBUG_SCHED_H__
+
+struct gk20a;
+void gk20a_sched_debugfs_init(struct gk20a *g);
+
+#endif /* __NVGPU_DEBUG_SCHED_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/driver_common.c b/drivers/gpu/nvgpu/common/linux/driver_common.c
index 80e7698b7..f85016d4c 100644
--- a/drivers/gpu/nvgpu/common/linux/driver_common.c
+++ b/drivers/gpu/nvgpu/common/linux/driver_common.c
@@ -21,6 +21,7 @@
 #include <nvgpu/soc.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a/gk20a_scale.h"
 #include "gk20a/gk20a.h"
@@ -182,7 +183,7 @@ int nvgpu_probe(struct gk20a *g,
 	nvgpu_init_mm_vars(g);
 
 	gk20a_create_sysfs(g->dev);
-	gk20a_debug_init(g->dev, debugfs_symlink);
+	gk20a_debug_init(g, debugfs_symlink);
 
 	g->dbg_regops_tmp_buf = nvgpu_kzalloc(g, SZ_4K);
 	if (!g->dbg_regops_tmp_buf) {
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
index 2502ff30b..d81328f06 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
@@ -26,9 +26,9 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/log.h>
 #include <nvgpu/list.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a/gk20a.h"
-#include "gk20a/debug_gk20a.h"
 #include "gk20a/ctxsw_trace_gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
 #include "gk20a/fence_gk20a.h"
diff --git a/drivers/gpu/nvgpu/common/linux/kmem.c b/drivers/gpu/nvgpu/common/linux/kmem.c
index d058eba5e..41aaa7294 100644
--- a/drivers/gpu/nvgpu/common/linux/kmem.c
+++ b/drivers/gpu/nvgpu/common/linux/kmem.c
@@ -134,19 +134,19 @@ void __nvgpu_vfree(struct gk20a *g, void *addr)
 
 #ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
 
-static void lock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
+void nvgpu_lock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
 {
 	nvgpu_mutex_acquire(&tracker->lock);
 }
 
-static void unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
+void nvgpu_unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker)
 {
 	nvgpu_mutex_release(&tracker->lock);
 }
 
-static void kmem_print_mem_alloc(struct gk20a *g,
-				 struct nvgpu_mem_alloc *alloc,
-				 struct seq_file *s)
+void kmem_print_mem_alloc(struct gk20a *g,
+			 struct nvgpu_mem_alloc *alloc,
+			 struct seq_file *s)
 {
 #ifdef __NVGPU_SAVE_KALLOC_STACK_TRACES
 	int i;
@@ -231,7 +231,7 @@ static int __nvgpu_save_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
 	alloc->stack_length = stack_trace.nr_entries;
 #endif
 
-	lock_tracker(tracker);
+	nvgpu_lock_tracker(tracker);
 	tracker->bytes_alloced += size;
 	tracker->bytes_alloced_real += real_size;
 	tracker->nr_allocs++;
@@ -246,10 +246,10 @@ static int __nvgpu_save_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
 	if (ret) {
 		WARN(1, "Duplicate alloc??? 0x%llx\n", addr);
 		kfree(alloc);
-		unlock_tracker(tracker);
+		nvgpu_unlock_tracker(tracker);
 		return ret;
 	}
-	unlock_tracker(tracker);
+	nvgpu_unlock_tracker(tracker);
 
 	return 0;
 }
@@ -259,17 +259,17 @@ static int __nvgpu_free_kmem_alloc(struct nvgpu_mem_alloc_tracker *tracker,
 {
 	struct nvgpu_mem_alloc *alloc;
 
-	lock_tracker(tracker);
+	nvgpu_lock_tracker(tracker);
 	alloc = nvgpu_rem_alloc(tracker, addr);
 	if (WARN(!alloc, "Possible double-free detected: 0x%llx!", addr)) {
-		unlock_tracker(tracker);
+		nvgpu_unlock_tracker(tracker);
 		return -EINVAL;
 	}
 
 	tracker->nr_frees++;
 	tracker->bytes_freed += alloc->size;
 	tracker->bytes_freed_real += alloc->real_size;
-	unlock_tracker(tracker);
+	nvgpu_unlock_tracker(tracker);
 
 	return 0;
 }
@@ -407,307 +407,6 @@ void __nvgpu_track_kfree(struct gk20a *g, void *addr)
 	__nvgpu_free_kmem_alloc(g->kmallocs, (u64)(uintptr_t)addr);
 }
 
-/**
- * to_human_readable_bytes - Determine  suffix for passed size.
- *
- * @bytes - Number of bytes to generate a suffix for.
- * @hr_bytes [out] - The human readable number of bytes.
- * @hr_suffix [out] - The suffix for the HR number of bytes.
- *
- * Computes a human readable decomposition of the passed number of bytes. The
- * suffix for the bytes is passed back through the @hr_suffix pointer. The right
- * number of bytes is then passed back in @hr_bytes. This returns the following
- * ranges:
- *
- *   0 - 1023 B
- *   1 - 1023 KB
- *   1 - 1023 MB
- *   1 - 1023 GB
- *   1 - 1023 TB
- *   1 - ...  PB
- */
-static void __to_human_readable_bytes(u64 bytes, u64 *hr_bytes,
-				      const char **hr_suffix)
-{
-	static const char *suffixes[] =
-		{ "B", "KB", "MB", "GB", "TB", "PB" };
-
-	u64 suffix_ind = 0;
-
-	while (suffix_ind < ARRAY_SIZE(suffixes) && bytes >= 1024) {
-		bytes >>= 10;
-		suffix_ind++;
-	}
-
-	/*
-	 * Handle case where bytes > 1023PB.
-	 */
-	suffix_ind = suffix_ind < ARRAY_SIZE(suffixes) ?
-		suffix_ind : ARRAY_SIZE(suffixes) - 1;
-
-	*hr_bytes = bytes;
-	*hr_suffix = suffixes[suffix_ind];
-}
-
-/**
- * print_hr_bytes - Print human readable bytes
- *
- * @s - A seq_file to print to. May be NULL.
- * @msg - A message to print before the bytes.
- * @bytes - Number of bytes.
- *
- * Print @msg followed by the human readable decomposition of the passed number
- * of bytes.
- *
- * If @s is NULL then this prints will be made to the kernel log.
- */
-static void print_hr_bytes(struct seq_file *s, const char *msg, u64 bytes)
-{
-	u64 hr_bytes;
-	const char *hr_suffix;
-
-	__to_human_readable_bytes(bytes, &hr_bytes, &hr_suffix);
-	__pstat(s, "%s%lld %s\n", msg, hr_bytes, hr_suffix);
-}
-
-/**
- * print_histogram - Build a histogram of the memory usage.
- *
- * @tracker The tracking to pull data from.
- * @s       A seq_file to dump info into.
- */
-static void print_histogram(struct nvgpu_mem_alloc_tracker *tracker,
-			    struct seq_file *s)
-{
-	int i;
-	u64 pot_min, pot_max;
-	u64 nr_buckets;
-	unsigned int *buckets;
-	unsigned int total_allocs;
-	struct nvgpu_rbtree_node *node;
-	static const char histogram_line[] =
-		"++++++++++++++++++++++++++++++++++++++++";
-
-	/*
-	 * pot_min is essentially a round down to the nearest power of 2. This
-	 * is the start of the histogram. pot_max is just a round up to the
-	 * nearest power of two. Each histogram bucket is one power of two so
-	 * the histogram buckets are exponential.
-	 */
-	pot_min = (u64)rounddown_pow_of_two(tracker->min_alloc);
-	pot_max = (u64)roundup_pow_of_two(tracker->max_alloc);
-
-	nr_buckets = __ffs(pot_max) - __ffs(pot_min);
-
-	buckets = kzalloc(sizeof(*buckets) * nr_buckets, GFP_KERNEL);
-	if (!buckets) {
-		__pstat(s, "OOM: could not allocate bucket storage!?\n");
-		return;
-	}
-
-	/*
-	 * Iterate across all of the allocs and determine what bucket they
-	 * should go in. Round the size down to the nearest power of two to
-	 * find the right bucket.
-	 */
-	nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
-	while (node) {
-		int b;
-		u64 bucket_min;
-		struct nvgpu_mem_alloc *alloc =
-			nvgpu_mem_alloc_from_rbtree_node(node);
-
-		bucket_min = (u64)rounddown_pow_of_two(alloc->size);
-		if (bucket_min < tracker->min_alloc)
-			bucket_min = tracker->min_alloc;
-
-		b = __ffs(bucket_min) - __ffs(pot_min);
-
-		/*
-		 * Handle the one case were there's an alloc exactly as big as
-		 * the maximum bucket size of the largest bucket. Most of the
-		 * buckets have an inclusive minimum and exclusive maximum. But
-		 * the largest bucket needs to have an _inclusive_ maximum as
-		 * well.
-		 */
-		if (b == (int)nr_buckets)
-			b--;
-
-		buckets[b]++;
-
-		nvgpu_rbtree_enum_next(&node, node);
-	}
-
-	total_allocs = 0;
-	for (i = 0; i < (int)nr_buckets; i++)
-		total_allocs += buckets[i];
-
-	__pstat(s, "Alloc histogram:\n");
-
-	/*
-	 * Actually compute the histogram lines.
-	 */
-	for (i = 0; i < (int)nr_buckets; i++) {
-		char this_line[sizeof(histogram_line) + 1];
-		u64 line_length;
-		u64 hr_bytes;
-		const char *hr_suffix;
-
-		memset(this_line, 0, sizeof(this_line));
-
-		/*
-		 * Compute the normalized line length. Cant use floating point
-		 * so we will just multiply everything by 1000 and use fixed
-		 * point.
-		 */
-		line_length = (1000 * buckets[i]) / total_allocs;
-		line_length *= sizeof(histogram_line);
-		line_length /= 1000;
-
-		memset(this_line, '+', line_length);
-
-		__to_human_readable_bytes(1 << (__ffs(pot_min) + i),
-					  &hr_bytes, &hr_suffix);
-		__pstat(s, "  [%-4lld %-4lld] %-2s %5u | %s\n",
-			hr_bytes, hr_bytes << 1,
-			hr_suffix, buckets[i], this_line);
-	}
-}
-
-#ifdef CONFIG_DEBUG_FS
-/**
- * nvgpu_kmem_print_stats - Print kmem tracking stats.
- *
- * @tracker The tracking to pull data from.
- * @s       A seq_file to dump info into.
- *
- * Print stats from a tracker. If @s is non-null then seq_printf() will be
- * used with @s. Otherwise the stats are pr_info()ed.
- */
-void nvgpu_kmem_print_stats(struct nvgpu_mem_alloc_tracker *tracker,
-			    struct seq_file *s)
-{
-	lock_tracker(tracker);
-
-	__pstat(s, "Mem tracker: %s\n\n", tracker->name);
-
-	__pstat(s, "Basic Stats:\n");
-	__pstat(s,        "  Number of allocs        %lld\n",
-		tracker->nr_allocs);
-	__pstat(s,        "  Number of frees         %lld\n",
-		tracker->nr_frees);
-	print_hr_bytes(s, "  Smallest alloc          ", tracker->min_alloc);
-	print_hr_bytes(s, "  Largest alloc           ", tracker->max_alloc);
-	print_hr_bytes(s, "  Bytes allocated         ", tracker->bytes_alloced);
-	print_hr_bytes(s, "  Bytes freed             ", tracker->bytes_freed);
-	print_hr_bytes(s, "  Bytes allocated (real)  ",
-		       tracker->bytes_alloced_real);
-	print_hr_bytes(s, "  Bytes freed (real)      ",
-		       tracker->bytes_freed_real);
-	__pstat(s, "\n");
-
-	print_histogram(tracker, s);
-
-	unlock_tracker(tracker);
-}
-
-static int __kmem_tracking_show(struct seq_file *s, void *unused)
-{
-	struct nvgpu_mem_alloc_tracker *tracker = s->private;
-
-	nvgpu_kmem_print_stats(tracker, s);
-
-	return 0;
-}
-
-static int __kmem_tracking_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, __kmem_tracking_show, inode->i_private);
-}
-
-static const struct file_operations __kmem_tracking_fops = {
-	.open = __kmem_tracking_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static int __kmem_traces_dump_tracker(struct gk20a *g,
-				      struct nvgpu_mem_alloc_tracker *tracker,
-				      struct seq_file *s)
-{
-	struct nvgpu_rbtree_node *node;
-
-	nvgpu_rbtree_enum_start(0, &node, tracker->allocs);
-	while (node) {
-		struct nvgpu_mem_alloc *alloc =
-			nvgpu_mem_alloc_from_rbtree_node(node);
-
-		kmem_print_mem_alloc(g, alloc, s);
-
-		nvgpu_rbtree_enum_next(&node, node);
-	}
-
-	return 0;
-}
-
-static int __kmem_traces_show(struct seq_file *s, void *unused)
-{
-	struct gk20a *g = s->private;
-
-	lock_tracker(g->vmallocs);
-	seq_puts(s, "Oustanding vmallocs:\n");
-	__kmem_traces_dump_tracker(g, g->vmallocs, s);
-	seq_puts(s, "\n");
-	unlock_tracker(g->vmallocs);
-
-	lock_tracker(g->kmallocs);
-	seq_puts(s, "Oustanding kmallocs:\n");
-	__kmem_traces_dump_tracker(g, g->kmallocs, s);
-	unlock_tracker(g->kmallocs);
-
-	return 0;
-}
-
-static int __kmem_traces_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, __kmem_traces_show, inode->i_private);
-}
-
-static const struct file_operations __kmem_traces_fops = {
-	.open = __kmem_traces_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-void nvgpu_kmem_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *plat = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
-	struct dentry *gpu_root = plat->debugfs;
-	struct dentry *node;
-
-	g->debugfs_kmem = debugfs_create_dir("kmem_tracking", gpu_root);
-	if (IS_ERR_OR_NULL(g->debugfs_kmem))
-		return;
-
-	node = debugfs_create_file(g->vmallocs->name, S_IRUGO,
-				   g->debugfs_kmem,
-				   g->vmallocs, &__kmem_tracking_fops);
-	node = debugfs_create_file(g->kmallocs->name, S_IRUGO,
-				   g->debugfs_kmem,
-				   g->kmallocs, &__kmem_tracking_fops);
-	node = debugfs_create_file("traces", S_IRUGO,
-				   g->debugfs_kmem,
-				   g, &__kmem_traces_fops);
-}
-#else
-void nvgpu_kmem_debugfs_init(struct device *dev)
-{
-}
-#endif
-
 static int __do_check_for_outstanding_allocs(
 	struct gk20a *g,
 	struct nvgpu_mem_alloc_tracker *tracker,
diff --git a/drivers/gpu/nvgpu/common/linux/kmem_priv.h b/drivers/gpu/nvgpu/common/linux/kmem_priv.h
index d3abb3784..a41762af8 100644
--- a/drivers/gpu/nvgpu/common/linux/kmem_priv.h
+++ b/drivers/gpu/nvgpu/common/linux/kmem_priv.h
@@ -20,6 +20,8 @@
 #include <nvgpu/rbtree.h>
 #include <nvgpu/lock.h>
 
+struct seq_file;
+
 #define __pstat(s, fmt, msg...)				\
 	do {						\
 		if (s)					\
@@ -92,6 +94,12 @@ struct nvgpu_mem_alloc_tracker {
 	unsigned long max_alloc;
 };
 
+void nvgpu_lock_tracker(struct nvgpu_mem_alloc_tracker *tracker);
+void nvgpu_unlock_tracker(struct nvgpu_mem_alloc_tracker *tracker);
+
+void kmem_print_mem_alloc(struct gk20a *g,
+			 struct nvgpu_mem_alloc *alloc,
+			 struct seq_file *s);
 #endif /* CONFIG_NVGPU_TRACK_MEM_USAGE */
 
 #endif /* __KMEM_PRIV_H__ */
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index d5fc40de4..4f7fc3fa9 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -29,6 +29,7 @@
 #include <nvgpu/nvgpu_common.h>
 #include <nvgpu/soc.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/platform_gk20a.h"
@@ -970,10 +971,7 @@ static int __exit gk20a_remove(struct platform_device *pdev)
 
 	gk20a_user_deinit(dev, &nvgpu_class);
 
-#ifdef CONFIG_DEBUG_FS
-	debugfs_remove_recursive(platform->debugfs);
-	debugfs_remove_recursive(platform->debugfs_alias);
-#endif
+	gk20a_debug_deinit(g);
 
 	gk20a_remove_sysfs(dev);
 
diff --git a/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c b/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c
index 40ee199a3..eae0475ac 100644
--- a/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c
@@ -411,7 +411,9 @@ int nvgpu_bitmap_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
 	wmb();
 	a->inited = true;
 
+#ifdef CONFIG_DEBUG_FS
 	nvgpu_init_alloc_debug(g, __a);
+#endif
 	alloc_dbg(__a, "New allocator: type      bitmap\n");
 	alloc_dbg(__a, "               base      0x%llx\n", a->base);
 	alloc_dbg(__a, "               bit_offs  0x%llx\n", a->bit_offs);
diff --git a/drivers/gpu/nvgpu/common/mm/buddy_allocator.c b/drivers/gpu/nvgpu/common/mm/buddy_allocator.c
index 34bc51dfe..0ef94c10f 100644
--- a/drivers/gpu/nvgpu/common/mm/buddy_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/buddy_allocator.c
@@ -251,7 +251,9 @@ static void nvgpu_buddy_allocator_destroy(struct nvgpu_allocator *__a)
 
 	alloc_lock(__a);
 
+#ifdef CONFIG_DEBUG_FS
 	nvgpu_fini_alloc_debug(__a);
+#endif
 
 	/*
 	 * Free the fixed allocs first.
@@ -1290,7 +1292,9 @@ int __nvgpu_buddy_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
 	wmb();
 	a->initialized = 1;
 
+#ifdef CONFIG_DEBUG_FS
 	nvgpu_init_alloc_debug(g, __a);
+#endif
 	alloc_dbg(__a, "New allocator: type      buddy\n");
 	alloc_dbg(__a, "               base      0x%llx\n", a->base);
 	alloc_dbg(__a, "               size      0x%llx\n", a->length);
diff --git a/drivers/gpu/nvgpu/common/mm/lockless_allocator.c b/drivers/gpu/nvgpu/common/mm/lockless_allocator.c
index 234ae4a39..944b4b0ff 100644
--- a/drivers/gpu/nvgpu/common/mm/lockless_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/lockless_allocator.c
@@ -99,7 +99,9 @@ static void nvgpu_lockless_alloc_destroy(struct nvgpu_allocator *a)
 {
 	struct nvgpu_lockless_allocator *pa = a->priv;
 
+#ifdef CONFIG_DEBUG_FS
 	nvgpu_fini_alloc_debug(a);
+#endif
 
 	nvgpu_vfree(a->g, pa->next);
 	nvgpu_kfree(nvgpu_alloc_to_gpu(a), pa);
@@ -191,7 +193,9 @@ int nvgpu_lockless_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
 	wmb();
 	a->inited = true;
 
+#ifdef CONFIG_DEBUG_FS
 	nvgpu_init_alloc_debug(g, __a);
+#endif
 	alloc_dbg(__a, "New allocator: type          lockless\n");
 	alloc_dbg(__a, "               base          0x%llx\n", a->base);
 	alloc_dbg(__a, "               nodes         %d\n", a->nr_nodes);
diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c b/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c
index 211b353bb..1646d2b1a 100644
--- a/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c
@@ -20,11 +20,6 @@
 
 #include "gk20a/gk20a.h"
 #include "gk20a/mm_gk20a.h"
-#ifdef CONFIG_DEBUG_FS
-#include "gk20a/platform_gk20a.h"
-#endif
-
-u32 nvgpu_alloc_tracing_on;
 
 u64 nvgpu_alloc_length(struct nvgpu_allocator *a)
 {
@@ -151,68 +146,3 @@ int __nvgpu_alloc_common_init(struct nvgpu_allocator *a, struct gk20a *g,
 
 	return 0;
 }
-
-#ifdef CONFIG_DEBUG_FS
-void nvgpu_alloc_print_stats(struct nvgpu_allocator *__a,
-			     struct seq_file *s, int lock)
-{
-	__a->ops->print_stats(__a, s, lock);
-}
-
-static int __alloc_show(struct seq_file *s, void *unused)
-{
-	struct nvgpu_allocator *a = s->private;
-
-	nvgpu_alloc_print_stats(a, s, 1);
-
-	return 0;
-}
-
-static int __alloc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, __alloc_show, inode->i_private);
-}
-
-static const struct file_operations __alloc_fops = {
-	.open = __alloc_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-#endif
-
-void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a)
-{
-#ifdef CONFIG_DEBUG_FS
-	if (!g->debugfs_allocators)
-		return;
-
-	a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO,
-					       g->debugfs_allocators,
-					       a, &__alloc_fops);
-#endif
-}
-
-void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a)
-{
-#ifdef CONFIG_DEBUG_FS
-	if (!IS_ERR_OR_NULL(a->debugfs_entry))
-		debugfs_remove(a->debugfs_entry);
-#endif
-}
-
-#ifdef CONFIG_DEBUG_FS
-void nvgpu_alloc_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct dentry *gpu_root = platform->debugfs;
-	struct gk20a *g = get_gk20a(dev);
-
-	g->debugfs_allocators = debugfs_create_dir("allocators", gpu_root);
-	if (IS_ERR_OR_NULL(g->debugfs_allocators))
-		return;
-
-	debugfs_create_u32("tracing", 0664, g->debugfs_allocators,
-			   &nvgpu_alloc_tracing_on);
-}
-#endif
diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c
index 14b5da3c5..3f4f37069 100644
--- a/drivers/gpu/nvgpu/common/mm/page_allocator.c
+++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c
@@ -916,7 +916,9 @@ int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a,
 	if (err)
 		goto fail;
 
+#ifdef CONFIG_DEBUG_FS
 	nvgpu_init_alloc_debug(g, __a);
+#endif
 	palloc_dbg(a, "New allocator: type      page\n");
 	palloc_dbg(a, "               base      0x%llx\n", a->base);
 	palloc_dbg(a, "               size      0x%llx\n", a->length);
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index a01602748..084f17934 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -18,9 +18,6 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/fs.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
 #include <linux/dma-buf.h>
 
 #include <trace/events/gk20a.h>
@@ -40,8 +37,6 @@
 #include "cde_gk20a.h"
 #include "fence_gk20a.h"
 #include "gr_gk20a.h"
-#include "debug_gk20a.h"
-#include "platform_gk20a.h"
 
 #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
@@ -1585,8 +1580,7 @@ int gk20a_prepare_compressible_read(
 	if (IS_ERR(dmabuf))
 		return -EINVAL;
 
-	err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g),
-				     offset, &state);
+	err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
 	if (err) {
 		dma_buf_put(dmabuf);
 		return err;
@@ -1650,7 +1644,7 @@ int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
 		return -EINVAL;
 	}
 
-	err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state);
+	err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
 	if (err) {
 		nvgpu_err(g, "could not get state from dmabuf");
 		dma_buf_put(dmabuf);
@@ -1671,38 +1665,3 @@ int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
 	dma_buf_put(dmabuf);
 	return 0;
 }
-
-#ifdef CONFIG_DEBUG_FS
-static ssize_t gk20a_cde_reload_write(struct file *file,
-	const char __user *userbuf, size_t count, loff_t *ppos)
-{
-	struct gk20a *g = file->private_data;
-	gk20a_cde_reload(g);
-	return count;
-}
-
-static const struct file_operations gk20a_cde_reload_fops = {
-	.open		= simple_open,
-	.write		= gk20a_cde_reload_write,
-};
-
-void gk20a_cde_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
-
-	if (!platform->has_cde)
-		return;
-
-	debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->cde_app.shader_parameter);
-	debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->cde_app.ctx_count);
-	debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->cde_app.ctx_usecount);
-	debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->cde_app.ctx_count_top);
-	debugfs_create_file("reload_cde_firmware", S_IWUSR, platform->debugfs,
-			    g, &gk20a_cde_reload_fops);
-}
-#endif
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index ffd55b4dd..4f400bf3b 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -295,7 +295,6 @@ int gk20a_cde_convert(struct gk20a *g,
 		struct nvgpu_fence *fence,
 		u32 __flags, struct gk20a_cde_param *params,
 		int num_params, struct gk20a_fence **fence_out);
-void gk20a_cde_debugfs_init(struct device *dev);
 
 int gk20a_prepare_compressible_read(
 		struct gk20a *g, u32 buffer_fd, u32 request, u64 offset,
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 1ed90b145..c905bedb2 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -13,15 +13,10 @@
  * more details.
  */
 
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
-
 #include <nvgpu/kmem.h>
 #include <nvgpu/dma.h>
 
 #include "gk20a.h"
-#include "debug_gk20a.h"
 
 #include <nvgpu/log.h>
 
@@ -33,10 +28,6 @@
 #include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
 
-#ifdef CONFIG_DEBUG_FS
-#include "platform_gk20a.h"
-#endif
-
 static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr)
 {
 	gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n");
@@ -728,18 +719,3 @@ void gk20a_ce_delete_context_priv(struct gk20a *g,
 	return;
 }
 EXPORT_SYMBOL(gk20a_ce_delete_context);
-
-#ifdef CONFIG_DEBUG_FS
-void gk20a_ce_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
-
-	debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->ce_app.ctx_count);
-	debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->ce_app.app_state);
-	debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
-			   platform->debugfs, &g->ce_app.next_ctx_id);
-}
-#endif
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index dfd190195..f972e1758 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -156,10 +156,4 @@ void gk20a_ce_delete_context_priv(struct gk20a *g,
 void gk20a_ce_delete_context(struct gk20a *g,
 		u32 ce_ctx_id);
 
-
-#ifdef CONFIG_DEBUG_FS
-/* CE app debugfs api */
-void gk20a_ce_debugfs_init(struct device *dev);
-#endif
-
 #endif /*__CE2_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 571570d8d..13abed955 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -30,9 +30,9 @@
 #include <nvgpu/circ_buf.h>
 #include <nvgpu/cond.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a.h"
-#include "debug_gk20a.h"
 #include "ctxsw_trace_gk20a.h"
 #include "dbg_gpu_gk20a.h"
 #include "fence_gk20a.h"
@@ -1403,6 +1403,7 @@ static u32 get_gp_free_count(struct channel_gk20a *c)
 	return gp_free_count(c);
 }
 
+#ifdef CONFIG_DEBUG_FS
 static void trace_write_pushbuffer(struct channel_gk20a *c,
 				   struct nvgpu_gpfifo *g)
 {
@@ -1439,6 +1440,7 @@ static void trace_write_pushbuffer(struct channel_gk20a *c,
 		dma_buf_vunmap(dmabuf, mem);
 	}
 }
+#endif
 
 static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 					 struct nvgpu_gpfifo *g,
@@ -1446,6 +1448,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 					 int offset,
 					 int count)
 {
+#ifdef CONFIG_DEBUG_FS
 	u32 size;
 	int i;
 	struct nvgpu_gpfifo *gp;
@@ -1478,6 +1481,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 
 	if (gpfifo_allocated)
 		nvgpu_big_free(c->g, g);
+#endif
 }
 
 static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
@@ -1629,8 +1633,8 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 	nvgpu_err(g, "Job on channel %d timed out",
 		  ch->hw_chid);
 
-	gk20a_debug_dump(g->dev);
-	gk20a_gr_debug_dump(g->dev);
+	gk20a_debug_dump(g);
+	gk20a_gr_debug_dump(g);
 
 	g->ops.fifo.force_reset_ch(ch,
 		NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT, true);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index ac3a3d57a..46560a560 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -29,12 +29,11 @@
 #include <nvgpu/log.h>
 #include <nvgpu/soc.h>
 #include <nvgpu/atomic.h>
-#include <nvgpu/sort.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/log2.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a.h"
-#include "debug_gk20a.h"
 #include "ctxsw_trace_gk20a.h"
 #include "mm_gk20a.h"
 
@@ -46,10 +45,6 @@
 #include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
 
-#ifdef CONFIG_DEBUG_FS
-#include "platform_gk20a.h"
-#endif
-
 #define FECS_METHOD_WFI_RESTORE 0x80000
 
 static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
@@ -57,10 +52,6 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    bool wait_for_finish);
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 
-#ifdef CONFIG_DEBUG_FS
-static void __gk20a_fifo_profile_free(struct kref *ref);
-#endif
-
 u32 gk20a_fifo_get_engine_ids(struct gk20a *g,
 		u32 engine_id[], u32 engine_id_sz,
 		u32 engine_enum)
@@ -562,14 +553,6 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
 	f->engine_info = NULL;
 	nvgpu_kfree(g, f->active_engines_list);
 	f->active_engines_list = NULL;
-#ifdef CONFIG_DEBUG_FS
-	nvgpu_mutex_acquire(&f->profile.lock);
-	if (f->profile.enabled) {
-		f->profile.enabled = false;
-		kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
-	}
-	nvgpu_mutex_release(&f->profile.lock);
-#endif
 }
 
 /* reads info from hardware and fills in pbmda exception info record */
@@ -1543,7 +1526,7 @@ static bool gk20a_fifo_handle_mmu_fault(
 	} else {
 		fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
 		fake_fault = false;
-		gk20a_debug_dump(g->dev);
+		gk20a_debug_dump(g);
 	}
 
 
@@ -1833,7 +1816,7 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose)
 			gk20a_channel_abort(ch, false);
 
 			if (gk20a_fifo_error_ch(g, ch))
-				gk20a_debug_dump(g->dev);
+				gk20a_debug_dump(g);
 
 			gk20a_channel_put(ch);
 		}
@@ -1860,7 +1843,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 
 		if (gk20a_fifo_error_tsg(g, tsg))
-			gk20a_debug_dump(g->dev);
+			gk20a_debug_dump(g);
 
 		gk20a_fifo_abort_tsg(g, tsgid, false);
 	}
@@ -1957,7 +1940,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 	unsigned int id_type;
 
 	if (verbose)
-		gk20a_debug_dump(g->dev);
+		gk20a_debug_dump(g);
 
 	if (g->ops.ltc.flush)
 		g->ops.ltc.flush(g);
@@ -3441,345 +3424,6 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
 		return NULL;
 }
 
-#ifdef CONFIG_DEBUG_FS
-
-/* Get the next element in the ring buffer of profile entries
- * and grab a reference to the structure
- */
-struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g)
-{
-	struct fifo_gk20a *f = &g->fifo;
-	struct fifo_profile_gk20a *profile;
-	unsigned int index;
-
-	/* If kref is zero, profiling is not enabled */
-	if (!kref_get_unless_zero(&f->profile.ref))
-		return NULL;
-	index = atomic_inc_return(&f->profile.get);
-	profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
-
-	return profile;
-}
-
-/* Free the reference to the structure. This allows deferred cleanups */
-void gk20a_fifo_profile_release(struct gk20a *g,
-					struct fifo_profile_gk20a *profile)
-{
-	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
-}
-
-static void *gk20a_fifo_sched_debugfs_seq_start(
-		struct seq_file *s, loff_t *pos)
-{
-	struct gk20a *g = s->private;
-	struct fifo_gk20a *f = &g->fifo;
-
-	if (*pos >= f->num_channels)
-		return NULL;
-
-	return &f->channel[*pos];
-}
-
-static void *gk20a_fifo_sched_debugfs_seq_next(
-		struct seq_file *s, void *v, loff_t *pos)
-{
-	struct gk20a *g = s->private;
-	struct fifo_gk20a *f = &g->fifo;
-
-	++(*pos);
-	if (*pos >= f->num_channels)
-		return NULL;
-
-	return &f->channel[*pos];
-}
-
-static void gk20a_fifo_sched_debugfs_seq_stop(
-		struct seq_file *s, void *v)
-{
-}
-
-static int gk20a_fifo_sched_debugfs_seq_show(
-		struct seq_file *s, void *v)
-{
-	struct gk20a *g = s->private;
-	struct fifo_gk20a *f = &g->fifo;
-	struct channel_gk20a *ch = v;
-	struct tsg_gk20a *tsg = NULL;
-
-	struct fifo_engine_info_gk20a *engine_info;
-	struct fifo_runlist_info_gk20a *runlist;
-	u32 runlist_id;
-	int ret = SEQ_SKIP;
-	u32 engine_id;
-
-	engine_id = gk20a_fifo_get_gr_engine_id(g);
-	engine_info = (f->engine_info + engine_id);
-	runlist_id = engine_info->runlist_id;
-	runlist = &f->runlist_info[runlist_id];
-
-	if (ch == f->channel) {
-		seq_puts(s, "chid     tsgid    pid      timeslice  timeout  interleave graphics_preempt compute_preempt\n");
-		seq_puts(s, "                            (usecs)   (msecs)\n");
-		ret = 0;
-	}
-
-	if (!test_bit(ch->hw_chid, runlist->active_channels))
-		return ret;
-
-	if (gk20a_channel_get(ch)) {
-		if (gk20a_is_channel_marked_as_tsg(ch))
-			tsg = &f->tsg[ch->tsgid];
-
-		seq_printf(s, "%-8d %-8d %-8d %-9d %-8d %-10d %-8d %-8d\n",
-				ch->hw_chid,
-				ch->tsgid,
-				ch->tgid,
-				tsg ? tsg->timeslice_us : ch->timeslice_us,
-				ch->timeout_ms_max,
-				tsg ? tsg->interleave_level : ch->interleave_level,
-				ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->graphics_preempt_mode : U32_MAX,
-				ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->compute_preempt_mode : U32_MAX);
-		gk20a_channel_put(ch);
-	}
-	return 0;
-}
-
-static const struct seq_operations gk20a_fifo_sched_debugfs_seq_ops = {
-	.start = gk20a_fifo_sched_debugfs_seq_start,
-	.next = gk20a_fifo_sched_debugfs_seq_next,
-	.stop = gk20a_fifo_sched_debugfs_seq_stop,
-	.show = gk20a_fifo_sched_debugfs_seq_show
-};
-
-static int gk20a_fifo_sched_debugfs_open(struct inode *inode,
-	struct file *file)
-{
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = seq_open(file, &gk20a_fifo_sched_debugfs_seq_ops);
-	if (err)
-		return err;
-
-	gk20a_dbg(gpu_dbg_info, "i_private=%p", inode->i_private);
-
-	((struct seq_file *)file->private_data)->private = inode->i_private;
-	return 0;
-};
-
-/*
- * The file operations structure contains our open function along with
- * set of the canned seq_ ops.
- */
-static const struct file_operations gk20a_fifo_sched_debugfs_fops = {
-	.owner = THIS_MODULE,
-	.open = gk20a_fifo_sched_debugfs_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release
-};
-
-static void __gk20a_fifo_profile_free(struct kref *ref)
-{
-	struct fifo_gk20a *f = container_of(ref, struct fifo_gk20a,
-						profile.ref);
-	nvgpu_vfree(f->g, f->profile.data);
-	nvgpu_vfree(f->g, f->profile.sorted);
-}
-
-static int gk20a_fifo_profile_enable(void *data, u64 val)
-{
-	struct gk20a *g = (struct gk20a *) data;
-	struct fifo_gk20a *f = &g->fifo;
-
-
-	nvgpu_mutex_acquire(&f->profile.lock);
-	if (val == 0) {
-		if (f->profile.enabled) {
-			f->profile.enabled = false;
-			kref_put(&f->profile.ref, __gk20a_fifo_profile_free);
-		}
-	} else {
-		if (!f->profile.enabled) {
-			/* not kref init as it can have a running condition if
-			 * we enable/disable/enable while kickoff is happening
-			 */
-			if (!kref_get_unless_zero(&f->profile.ref)) {
-				f->profile.data = vzalloc(
-							FIFO_PROFILING_ENTRIES *
-					sizeof(struct fifo_profile_gk20a));
-				f->profile.sorted  = vzalloc(
-							FIFO_PROFILING_ENTRIES *
-							sizeof(u64));
-				if (!(f->profile.data && f->profile.sorted)) {
-					nvgpu_vfree(g, f->profile.data);
-					nvgpu_vfree(g, f->profile.sorted);
-					nvgpu_mutex_release(&f->profile.lock);
-					return -ENOMEM;
-				}
-				kref_init(&f->profile.ref);
-			}
-			atomic_set(&f->profile.get, 0);
-			f->profile.enabled = true;
-		}
-	}
-	nvgpu_mutex_release(&f->profile.lock);
-
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(
-	gk20a_fifo_profile_enable_debugfs_fops,
-	NULL,
-	gk20a_fifo_profile_enable,
-	"%llu\n"
-);
-
-static int __profile_cmp(const void *a, const void *b)
-{
-	return *((unsigned long long *) a) - *((unsigned long long *) b);
-}
-
-/*
- * This uses about 800b in the stack, but the function using it is not part
- * of a callstack where much memory is being used, so it is fine
- */
-#define PERCENTILE_WIDTH	5
-#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)
-
-static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
-		u64 *percentiles, u32 index_end, u32 index_start)
-{
-	unsigned int nelem = 0;
-	unsigned int index;
-	struct fifo_profile_gk20a *profile;
-
-	for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
-		profile = &g->fifo.profile.data[index];
-
-		if (profile->timestamp[index_end] >
-				profile->timestamp[index_start]) {
-			/* This is a valid element */
-			g->fifo.profile.sorted[nelem] =
-						profile->timestamp[index_end] -
-						profile->timestamp[index_start];
-			nelem++;
-		}
-	}
-
-	/* sort it */
-	sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
-		__profile_cmp, NULL);
-
-	/* build ranges */
-	for (index = 0; index < PERCENTILE_RANGES; index++)
-		percentiles[index] =
-			g->fifo.profile.sorted[(PERCENTILE_WIDTH * (index + 1) *
-						nelem)/100 - 1];
-	return nelem;
-}
-
-static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
-{
-	struct gk20a *g = s->private;
-	unsigned int get, nelem, index;
-	/*
-	 * 800B in the stack, but function is declared statically and only
-	 * called from debugfs handler
-	 */
-	u64 percentiles_ioctl[PERCENTILE_RANGES];
-	u64 percentiles_kickoff[PERCENTILE_RANGES];
-	u64 percentiles_jobtracking[PERCENTILE_RANGES];
-	u64 percentiles_append[PERCENTILE_RANGES];
-	u64 percentiles_userd[PERCENTILE_RANGES];
-
-	if (!kref_get_unless_zero(&g->fifo.profile.ref)) {
-		seq_printf(s, "Profiling disabled\n");
-		return 0;
-	}
-
-	get = atomic_read(&g->fifo.profile.get);
-
-	__gk20a_fifo_create_stats(g, percentiles_ioctl,
-		PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_kickoff,
-		PROFILE_END, PROFILE_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_jobtracking,
-		PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_append,
-		PROFILE_APPEND, PROFILE_JOB_TRACKING);
-	nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
-		PROFILE_END, PROFILE_APPEND);
-
-	seq_printf(s, "Number of kickoffs: %d\n", nelem);
-	seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
-
-	for (index = 0; index < PERCENTILE_RANGES; index++)
-		seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
-			PERCENTILE_WIDTH * (index+1),
-			percentiles_ioctl[index],
-			percentiles_kickoff[index],
-			percentiles_append[index],
-			percentiles_jobtracking[index],
-			percentiles_userd[index]);
-
-	kref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
-
-	return 0;
-}
-
-static int gk20a_fifo_profile_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, gk20a_fifo_profile_stats, inode->i_private);
-}
-
-static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
-	.open		= gk20a_fifo_profile_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-
-void gk20a_fifo_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
-
-	struct dentry *gpu_root = platform->debugfs;
-	struct dentry *fifo_root;
-	struct dentry *profile_root;
-
-
-	fifo_root = debugfs_create_dir("fifo", gpu_root);
-	if (IS_ERR_OR_NULL(fifo_root))
-		return;
-
-	gk20a_dbg(gpu_dbg_info, "g=%p", g);
-
-	debugfs_create_file("sched", 0600, fifo_root, g,
-		&gk20a_fifo_sched_debugfs_fops);
-
-	profile_root = debugfs_create_dir("profile", fifo_root);
-	if (IS_ERR_OR_NULL(profile_root))
-		return;
-
-	nvgpu_mutex_init(&g->fifo.profile.lock);
-	g->fifo.profile.enabled = false;
-	atomic_set(&g->fifo.profile.get, 0);
-	atomic_set(&g->fifo.profile.ref.refcount, 0);
-
-	debugfs_create_file("enable", 0600, profile_root, g,
-		&gk20a_fifo_profile_enable_debugfs_fops);
-
-	debugfs_create_file("stats", 0600, profile_root, g,
-		&gk20a_fifo_profile_stats_debugfs_fops);
-
-}
-#endif /* CONFIG_DEBUG_FS */
-
 static const char * const ccsr_chan_status_str[] = {
 	"idle",
 	"pending",
@@ -3901,6 +3545,74 @@ void gk20a_dump_channel_status_ramfc(struct gk20a *g,
 	gk20a_debug_output(o, "\n");
 }
 
+/*
+ * Print the state of every channel that can currently be referenced.
+ *
+ * Three sweeps are made over the channel table:
+ *   1. take a reference on each channel and allocate a snapshot for it
+ *      (if the allocation fails the reference is dropped immediately
+ *      and the channel is left out of the dump),
+ *   2. copy pid, ref count and instance block into each snapshot, then
+ *      release the channel reference,
+ *   3. pass each snapshot to g->ops.fifo.dump_channel_status_ramfc()
+ *      and free it.
+ *
+ * All channel references are released before any channel is dumped.
+ */
+void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
+		 struct gk20a_debug_output *o)
+{
+	struct fifo_gk20a *fifo = &g->fifo;
+	struct ch_state **snap;
+	u32 i;
+
+	snap = nvgpu_kzalloc(g, sizeof(*snap) * fifo->num_channels);
+	if (snap == NULL) {
+		gk20a_debug_output(o, "cannot alloc memory for channels\n");
+		return;
+	}
+
+	/* Sweep 1: pin channels and allocate their snapshots. */
+	for (i = 0; i < fifo->num_channels; i++) {
+		struct channel_gk20a *ch = &fifo->channel[i];
+
+		if (!gk20a_channel_get(ch))
+			continue;
+
+		snap[i] = nvgpu_kmalloc(g, sizeof(*snap[i]) +
+					ram_in_alloc_size_v());
+		/* On success the reference is kept until sweep 2. */
+		if (snap[i] == NULL)
+			gk20a_channel_put(ch);
+	}
+
+	/* Sweep 2: record state, then unpin. Slots left NULL are skipped. */
+	for (i = 0; i < fifo->num_channels; i++) {
+		struct channel_gk20a *ch = &fifo->channel[i];
+		struct ch_state *s = snap[i];
+
+		if (s != NULL) {
+			s->pid = ch->pid;
+			s->refs = atomic_read(&ch->ref_count);
+			nvgpu_mem_rd_n(g, &ch->inst_block, 0,
+					&s->inst_block[0],
+					ram_in_alloc_size_v());
+			gk20a_channel_put(ch);
+		}
+	}
+
+	/* Sweep 3: print and release each snapshot. */
+	for (i = 0; i < fifo->num_channels; i++) {
+		if (snap[i] == NULL)
+			continue;
+
+		g->ops.fifo.dump_channel_status_ramfc(g, o, i, snap[i]);
+		nvgpu_kfree(g, snap[i]);
+	}
+
+	nvgpu_kfree(g, snap);
+}
+
 void gk20a_dump_pbdma_status(struct gk20a *g,
 				 struct gk20a_debug_output *o)
 {
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 6c8868a24..228e5130c 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -23,10 +23,11 @@
 
 #include "channel_gk20a.h"
 #include "tsg_gk20a.h"
-#include "debug_gk20a.h"
 
 #include <nvgpu/kref.h>
 
+struct gk20a_debug_output;
+
 #define MAX_RUNLIST_BUFFERS		2
 
 #define FIFO_INVAL_ENGINE_ID		((u32)~0)
@@ -287,8 +288,6 @@ int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
 int gk20a_fifo_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
 
 
-void gk20a_fifo_debugfs_init(struct device *dev);
-
 const char *gk20a_fifo_interleave_level_name(u32 interleave_level);
 
 int gk20a_fifo_engine_enum_from_type(struct gk20a *g, u32 engine_type,
@@ -341,6 +340,8 @@ void gk20a_dump_channel_status_ramfc(struct gk20a *g,
 				     struct gk20a_debug_output *o,
 				     u32 hw_chid,
 				     struct ch_state *ch_state);
+void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
+		 struct gk20a_debug_output *o);
 void gk20a_dump_pbdma_status(struct gk20a *g,
 				 struct gk20a_debug_output *o);
 void gk20a_dump_eng_status(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 689fafb11..899c1d6a3 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -30,6 +30,7 @@ struct acr_desc;
 struct nvgpu_mem_alloc_tracker;
 struct dbg_profiler_object_data;
 struct ecc_gk20a;
+struct gk20a_debug_output;
 
 #include <linux/sched.h>
 #include <nvgpu/lock.h>
@@ -61,7 +62,6 @@ struct ecc_gk20a;
 #include "therm_gk20a.h"
 #include "gm20b/acr_gm20b.h"
 #include "cde_gk20a.h"
-#include "debug_gk20a.h"
 #include "sched_gk20a.h"
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 #include "clk/clk.h"
@@ -1544,10 +1544,6 @@ void nvgpu_wait_for_deferred_interrupts(struct gk20a *g);
 struct gk20a * __must_check gk20a_get(struct gk20a *g);
 void gk20a_put(struct gk20a *g);
 
-#ifdef CONFIG_DEBUG_FS
-int gk20a_railgating_debugfs_init(struct device *dev);
-#endif
-
 static inline bool gk20a_platform_has_syncpoints(struct gk20a *g)
 {
 #ifdef CONFIG_TEGRA_GK20A_NVHOST
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 2188618c0..982cfac86 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -30,6 +30,7 @@
 #include <nvgpu/bug.h>
 #include <nvgpu/firmware.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a.h"
 #include "kind_gk20a.h"
@@ -37,13 +38,8 @@
 #include "gr_pri_gk20a.h"
 #include "regops_gk20a.h"
 #include "dbg_gpu_gk20a.h"
-#include "debug_gk20a.h"
 #include "ctxsw_trace_gk20a.h"
 
-#ifdef CONFIG_DEBUG_FS
-#include "platform_gk20a.h"
-#endif
-
 #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
@@ -514,7 +510,7 @@ int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
 		nvgpu_err(g,
 			   "timeout waiting on ucode response");
 		gk20a_fecs_dump_falcon_stats(g);
-		gk20a_gr_debug_dump(g->dev);
+		gk20a_gr_debug_dump(g);
 		return -1;
 	} else if (check == WAIT_UCODE_ERROR) {
 		nvgpu_err(g,
@@ -9032,20 +9028,6 @@ static int gr_gk20a_dump_gr_status_regs(struct gk20a *g,
 	return 0;
 }
 
-#ifdef CONFIG_DEBUG_FS
-int gr_gk20a_debugfs_init(struct gk20a *g)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
-
-	g->debugfs_gr_default_attrib_cb_size =
-		debugfs_create_u32("gr_default_attrib_cb_size",
-				   S_IRUGO|S_IWUSR, platform->debugfs,
-				   &g->gr.attrib_cb_default_size);
-
-	return 0;
-}
-#endif
-
 static void gr_gk20a_init_cyclestats(struct gk20a *g)
 {
 #if defined(CONFIG_GK20A_CYCLE_STATS)
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 79aeb42f3..deb8ea9c0 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -653,7 +653,6 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
 void gr_gk20a_free_gr_ctx(struct gk20a *g,
 			  struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx);
 int gr_gk20a_halt_pipe(struct gk20a *g);
-int gr_gk20a_debugfs_init(struct gk20a *g);
 
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 int gr_gk20a_css_attach(struct channel_gk20a *ch,   /* in - main hw structure */
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
index 8a3beb39e..b19398a6a 100644
--- a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -37,6 +37,7 @@
 #include "pramin_gk20a.h"
 #include "priv_ring_gk20a.h"
 
+#include <nvgpu/debug.h>
 #include <nvgpu/log.h>
 #include <nvgpu/bug.h>
 
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 53d22a7d3..08e2e9cc1 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -2563,13 +2563,13 @@ priv_exist_or_err:
 	return 0;
 }
 
-int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct device *dev,
+int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct gk20a *g,
 			   u64 offset, struct gk20a_buffer_state **state)
 {
 	int err = 0;
 	struct gk20a_dmabuf_priv *priv;
 	struct gk20a_buffer_state *s;
-	struct gk20a *g = get_gk20a(dev);
+	struct device *dev = g->dev;
 
 	if (WARN_ON(offset >= (u64)dmabuf->size))
 		return -EINVAL;
@@ -3123,18 +3123,6 @@ static bool gk20a_mm_is_bar1_supported(struct gk20a *g)
 	return true;
 }
 
-#ifdef CONFIG_DEBUG_FS
-void gk20a_mm_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct dentry *gpu_root = platform->debugfs;
-	struct gk20a *g = gk20a_get_platform(dev)->g;
-
-	debugfs_create_bool("force_pramin", 0664, gpu_root,
-			   &g->mm.force_pramin);
-}
-#endif
-
 void gk20a_init_mm(struct gpu_ops *gops)
 {
 	gops->mm.gmmu_map = gk20a_locked_gmmu_map;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 79b553712..5d90cbf6c 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -146,7 +146,6 @@ struct channel_gk20a;
 int gk20a_init_mm_support(struct gk20a *g);
 int gk20a_init_mm_setup_sw(struct gk20a *g);
 int gk20a_init_mm_setup_hw(struct gk20a *g);
-void gk20a_mm_debugfs_init(struct device *dev);
 void gk20a_init_mm_ce_context(struct gk20a *g);
 
 int gk20a_mm_fb_flush(struct gk20a *g);
@@ -437,7 +436,7 @@ dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr);
 
 int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
 
-int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct device *dev,
+int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct gk20a *g,
 			   u64 offset, struct gk20a_buffer_state **state);
 
 int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry);
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index a9e039437..552d5d735 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -35,12 +35,6 @@
 #include "nvgpu_gpuid_t19x.h"
 #endif
 
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
-#include "platform_gk20a.h"
-#endif
-
 #define GK20A_PMU_UCODE_IMAGE	"gpmu_ucode.bin"
 
 #define PMU_MEM_SCRUBBING_TIMEOUT_MAX 1000
@@ -49,7 +43,7 @@
 #define gk20a_dbg_pmu(fmt, arg...) \
 	gk20a_dbg(gpu_dbg_pmu, fmt, ##arg)
 
-static int gk20a_pmu_get_pg_stats(struct gk20a *g,
+int gk20a_pmu_get_pg_stats(struct gk20a *g,
 		u32 pg_engine_id,
 		struct pmu_pg_stats_data *pg_stat_data);
 static void ap_callback_init_and_enable_ctrl(
@@ -281,7 +275,7 @@ static void set_pmu_cmdline_args_falctracesize_v1(
 	pmu->args_v1.falc_trace_size = size;
 }
 
-static bool find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos)
+bool nvgpu_find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos)
 {
 	u32 i = 0, j = strlen(strings);
 	for (; i < j; i++) {
@@ -326,7 +320,7 @@ static void printtrace(struct pmu_gk20a *pmu)
 		count = scnprintf(buf, 0x40, "Index %x: ", trace1[(i / 4)]);
 		l = 0;
 		m = 0;
-		while (find_hex_in_string((trace+i+20+m), g, &k)) {
+		while (nvgpu_find_hex_in_string((trace+i+20+m), g, &k)) {
 			if (k >= 40)
 				break;
 			strncpy(part_str, (trace+i+20+m), k);
@@ -4141,7 +4135,7 @@ void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
 		nvgpu_err(g, "ZBC save timeout");
 }
 
-static int pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
+int nvgpu_pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
 {
 	struct gk20a *g = gk20a_from_pmu(pmu);
 	struct pmu_v *pv = &g->ops.pmu_ver;
@@ -4185,7 +4179,7 @@ static int pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
 	return 0;
 }
 
-static int pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu)
+int nvgpu_pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu)
 {
 	struct gk20a *g = gk20a_from_pmu(pmu);
 	struct pmu_cmd cmd;
@@ -4231,7 +4225,7 @@ static int pmu_handle_perfmon_event(struct pmu_gk20a *pmu,
 
 	/* restart sampling */
 	if (pmu->perfmon_sampling_enabled)
-		return pmu_perfmon_start_sampling(pmu);
+		return nvgpu_pmu_perfmon_start_sampling(pmu);
 	return 0;
 }
 
@@ -5173,9 +5167,9 @@ int gk20a_pmu_perfmon_enable(struct gk20a *g, bool enable)
 	gk20a_dbg_fn("");
 
 	if (enable)
-		err = pmu_perfmon_start_sampling(pmu);
+		err = nvgpu_pmu_perfmon_start_sampling(pmu);
 	else
-		err = pmu_perfmon_stop_sampling(pmu);
+		err = nvgpu_pmu_perfmon_stop_sampling(pmu);
 
 	return err;
 }
@@ -5293,7 +5287,7 @@ void gk20a_pmu_elpg_statistics(struct gk20a *g, u32 pg_engine_id,
 	pg_stat_data->avg_exit_latency_us = stats.pg_avg_exit_time_us;
 }
 
-static int gk20a_pmu_get_pg_stats(struct gk20a *g,
+int gk20a_pmu_get_pg_stats(struct gk20a *g,
 		u32 pg_engine_id,
 		struct pmu_pg_stats_data *pg_stat_data)
 {
@@ -5463,466 +5457,3 @@ int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id)
 	status = gk20a_pmu_ap_send_command(g, &ap_cmd, true);
 	return status;
 }
-
-#ifdef CONFIG_DEBUG_FS
-static int lpwr_debug_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-
-	if (g->ops.pmu.pmu_pg_engines_feature_list &&
-		g->ops.pmu.pmu_pg_engines_feature_list(g,
-		PMU_PG_ELPG_ENGINE_ID_GRAPHICS) !=
-		PMU_PG_FEATURE_GR_POWER_GATING_ENABLED) {
-		seq_printf(s, "PSTATE: %u\n"
-			"RPPG Enabled: %u\n"
-			"RPPG ref count: %u\n"
-			"RPPG state: %u\n"
-			"MSCG Enabled: %u\n"
-			"MSCG pstate state: %u\n"
-			"MSCG transition state: %u\n",
-			g->ops.clk_arb.get_current_pstate(g),
-			g->elpg_enabled, g->pmu.elpg_refcnt,
-			g->pmu.elpg_stat, g->mscg_enabled,
-			g->pmu.mscg_stat, g->pmu.mscg_transition_state);
-
-	} else
-		seq_printf(s, "ELPG Enabled: %u\n"
-			"ELPG ref count: %u\n"
-			"ELPG state: %u\n",
-			g->elpg_enabled, g->pmu.elpg_refcnt,
-			g->pmu.elpg_stat);
-
-	return 0;
-
-}
-
-static int lpwr_debug_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, lpwr_debug_show, inode->i_private);
-}
-
-static const struct file_operations lpwr_debug_fops = {
-	.open		= lpwr_debug_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int mscg_stat_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-	u64 total_ingating, total_ungating, residency, divisor, dividend;
-	struct pmu_pg_stats_data pg_stat_data = { 0 };
-	int err;
-
-	/* Don't unnecessarily power on the device */
-	if (g->power_on) {
-		err = gk20a_busy(g);
-		if (err)
-			return err;
-
-		gk20a_pmu_get_pg_stats(g,
-			PMU_PG_ELPG_ENGINE_ID_MS, &pg_stat_data);
-		gk20a_idle(g);
-	}
-	total_ingating = g->pg_ingating_time_us +
-			(u64)pg_stat_data.ingating_time;
-	total_ungating = g->pg_ungating_time_us +
-			(u64)pg_stat_data.ungating_time;
-
-	divisor = total_ingating + total_ungating;
-
-	/* We compute the residency on a scale of 1000 */
-	dividend = total_ingating * 1000;
-
-	if (divisor)
-		residency = div64_u64(dividend, divisor);
-	else
-		residency = 0;
-
-	seq_printf(s,
-			"Time in MSCG: %llu us\n"
-			"Time out of MSCG: %llu us\n"
-			"MSCG residency ratio: %llu\n"
-			"MSCG Entry Count: %u\n"
-			"MSCG Avg Entry latency %u\n"
-			"MSCG Avg Exit latency %u\n",
-			total_ingating, total_ungating,
-			residency, pg_stat_data.gating_cnt,
-			pg_stat_data.avg_entry_latency_us,
-			pg_stat_data.avg_exit_latency_us);
-	return 0;
-
-}
-
-static int mscg_stat_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mscg_stat_show, inode->i_private);
-}
-
-static const struct file_operations mscg_stat_fops = {
-	.open		= mscg_stat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int mscg_transitions_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-	struct pmu_pg_stats_data pg_stat_data = { 0 };
-	u32 total_gating_cnt;
-	int err;
-
-	if (g->power_on) {
-		err = gk20a_busy(g);
-		if (err)
-			return err;
-
-		gk20a_pmu_get_pg_stats(g,
-			PMU_PG_ELPG_ENGINE_ID_MS, &pg_stat_data);
-		gk20a_idle(g);
-	}
-	total_gating_cnt = g->pg_gating_cnt + pg_stat_data.gating_cnt;
-
-	seq_printf(s, "%u\n", total_gating_cnt);
-	return 0;
-
-}
-
-static int mscg_transitions_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mscg_transitions_show, inode->i_private);
-}
-
-static const struct file_operations mscg_transitions_fops = {
-	.open		= mscg_transitions_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int elpg_stat_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-	struct pmu_pg_stats_data pg_stat_data = { 0 };
-	u64 total_ingating, total_ungating, residency, divisor, dividend;
-	int err;
-
-	/* Don't unnecessarily power on the device */
-	if (g->power_on) {
-		err = gk20a_busy(g);
-		if (err)
-			return err;
-
-		gk20a_pmu_get_pg_stats(g,
-			PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg_stat_data);
-		gk20a_idle(g);
-	}
-	total_ingating = g->pg_ingating_time_us +
-			(u64)pg_stat_data.ingating_time;
-	total_ungating = g->pg_ungating_time_us +
-			(u64)pg_stat_data.ungating_time;
-	divisor = total_ingating + total_ungating;
-
-	/* We compute the residency on a scale of 1000 */
-	dividend = total_ingating * 1000;
-
-	if (divisor)
-		residency = div64_u64(dividend, divisor);
-	else
-		residency = 0;
-
-	seq_printf(s,
-			"Time in ELPG: %llu us\n"
-			"Time out of ELPG: %llu us\n"
-			"ELPG residency ratio: %llu\n"
-			"ELPG Entry Count: %u\n"
-			"ELPG Avg Entry latency %u us\n"
-			"ELPG Avg Exit latency %u us\n",
-			total_ingating, total_ungating,
-			residency, pg_stat_data.gating_cnt,
-			pg_stat_data.avg_entry_latency_us,
-			pg_stat_data.avg_exit_latency_us);
-	return 0;
-
-}
-
-static int elpg_stat_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, elpg_stat_show, inode->i_private);
-}
-
-static const struct file_operations elpg_stat_fops = {
-	.open		= elpg_stat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int elpg_transitions_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-	struct pmu_pg_stats_data pg_stat_data = { 0 };
-	u32 total_gating_cnt;
-	int err;
-
-	if (g->power_on) {
-		err = gk20a_busy(g);
-		if (err)
-			return err;
-
-		gk20a_pmu_get_pg_stats(g,
-			PMU_PG_ELPG_ENGINE_ID_GRAPHICS, &pg_stat_data);
-		gk20a_idle(g);
-	}
-	total_gating_cnt = g->pg_gating_cnt + pg_stat_data.gating_cnt;
-
-	seq_printf(s, "%u\n", total_gating_cnt);
-	return 0;
-
-}
-
-static int elpg_transitions_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, elpg_transitions_show, inode->i_private);
-}
-
-static const struct file_operations elpg_transitions_fops = {
-	.open		= elpg_transitions_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int falc_trace_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-	struct pmu_gk20a *pmu = &g->pmu;
-	u32 i = 0, j = 0, k, l, m;
-	char part_str[40];
-	void *tracebuffer;
-	char *trace;
-	u32 *trace1;
-
-	/* allocate system memory to copy pmu trace buffer */
-	tracebuffer = nvgpu_kzalloc(g, GK20A_PMU_TRACE_BUFSIZE);
-	if (tracebuffer == NULL)
-		return -ENOMEM;
-
-	/* read pmu traces into system memory buffer */
-	nvgpu_mem_rd_n(g, &pmu->trace_buf,
-		       0, tracebuffer, GK20A_PMU_TRACE_BUFSIZE);
-
-	trace = (char *)tracebuffer;
-	trace1 = (u32 *)tracebuffer;
-
-	for (i = 0; i < GK20A_PMU_TRACE_BUFSIZE; i += 0x40) {
-		for (j = 0; j < 0x40; j++)
-			if (trace1[(i / 4) + j])
-				break;
-		if (j == 0x40)
-			break;
-		seq_printf(s, "Index %x: ", trace1[(i / 4)]);
-		l = 0;
-		m = 0;
-		while (find_hex_in_string((trace+i+20+m), g, &k)) {
-			if (k >= 40)
-				break;
-			strncpy(part_str, (trace+i+20+m), k);
-			part_str[k] = 0;
-			seq_printf(s, "%s0x%x", part_str,
-					trace1[(i / 4) + 1 + l]);
-			l++;
-			m += k + 2;
-		}
-		seq_printf(s, "%s", (trace+i+20+m));
-	}
-
-	nvgpu_kfree(g, tracebuffer);
-	return 0;
-}
-
-static int falc_trace_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, falc_trace_show, inode->i_private);
-}
-
-static const struct file_operations falc_trace_fops = {
-	.open		= falc_trace_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int perfmon_events_enable_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-
-	seq_printf(s, "%u\n", g->pmu.perfmon_sampling_enabled ? 1 : 0);
-	return 0;
-
-}
-
-static int perfmon_events_enable_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, perfmon_events_enable_show, inode->i_private);
-}
-
-static ssize_t perfmon_events_enable_write(struct file *file,
-	const char __user *userbuf, size_t count, loff_t *ppos)
-{
-	struct seq_file *s = file->private_data;
-	struct gk20a *g = s->private;
-	unsigned long val = 0;
-	char buf[40];
-	int buf_size;
-	int err;
-
-	memset(buf, 0, sizeof(buf));
-	buf_size = min(count, (sizeof(buf)-1));
-
-	if (copy_from_user(buf, userbuf, buf_size))
-		return -EFAULT;
-
-	if (kstrtoul(buf, 10, &val) < 0)
-		return -EINVAL;
-
-	/* Don't turn on gk20a unnecessarily */
-	if (g->power_on) {
-		err = gk20a_busy(g);
-		if (err)
-			return err;
-
-		if (val && !g->pmu.perfmon_sampling_enabled) {
-			g->pmu.perfmon_sampling_enabled = true;
-			pmu_perfmon_start_sampling(&(g->pmu));
-		} else if (!val && g->pmu.perfmon_sampling_enabled) {
-			g->pmu.perfmon_sampling_enabled = false;
-			pmu_perfmon_stop_sampling(&(g->pmu));
-		}
-		gk20a_idle(g);
-	} else {
-		g->pmu.perfmon_sampling_enabled = val ? true : false;
-	}
-
-	return count;
-}
-
-static const struct file_operations perfmon_events_enable_fops = {
-	.open		= perfmon_events_enable_open,
-	.read		= seq_read,
-	.write		= perfmon_events_enable_write,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int perfmon_events_count_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-
-	seq_printf(s, "%lu\n", g->pmu.perfmon_events_cnt);
-	return 0;
-
-}
-
-static int perfmon_events_count_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, perfmon_events_count_show, inode->i_private);
-}
-
-static const struct file_operations perfmon_events_count_fops = {
-	.open		= perfmon_events_count_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int security_show(struct seq_file *s, void *data)
-{
-	struct gk20a *g = s->private;
-
-	seq_printf(s, "%d\n", g->pmu.pmu_mode);
-	return 0;
-
-}
-
-static int security_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, security_show, inode->i_private);
-}
-
-static const struct file_operations security_fops = {
-	.open		= security_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-int gk20a_pmu_debugfs_init(struct device *dev)
-{
-	struct dentry *d;
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-	struct gk20a *g = get_gk20a(dev);
-
-	d = debugfs_create_file(
-		"lpwr_debug", S_IRUGO|S_IWUSR, platform->debugfs, g,
-						&lpwr_debug_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"mscg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
-						&mscg_stat_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"mscg_transitions", S_IRUGO, platform->debugfs, g,
-						&mscg_transitions_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"elpg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
-						&elpg_stat_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"elpg_transitions", S_IRUGO, platform->debugfs, g,
-						&elpg_transitions_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"falc_trace", S_IRUGO, platform->debugfs, g,
-						&falc_trace_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"perfmon_events_enable", S_IRUGO, platform->debugfs, g,
-						&perfmon_events_enable_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"perfmon_events_count", S_IRUGO, platform->debugfs, g,
-						&perfmon_events_count_fops);
-	if (!d)
-		goto err_out;
-
-	d = debugfs_create_file(
-		"pmu_security", S_IRUGO, platform->debugfs, g,
-						&security_fops);
-	if (!d)
-		goto err_out;
-	return 0;
-err_out:
-	pr_err("%s: Failed to make debugfs node\n", __func__);
-	debugfs_remove_recursive(platform->debugfs);
-	return -ENOMEM;
-}
-
-#endif
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
index e7a8b7c27..cefb6577e 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
@@ -426,7 +426,6 @@ int pmu_mutex_release(struct pmu_gk20a *pmu, u32 id, u32 *token);
 int gk20a_pmu_destroy(struct gk20a *g);
 int gk20a_pmu_load_norm(struct gk20a *g, u32 *load);
 int gk20a_pmu_load_update(struct gk20a *g);
-int gk20a_pmu_debugfs_init(struct device *dev);
 void gk20a_pmu_reset_load_counters(struct gk20a *g);
 void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
 		u32 *total_cycles);
@@ -468,5 +467,11 @@ int gk20a_pmu_vidmem_surface_alloc(struct gk20a *g, struct nvgpu_mem *mem,
 		u32 size);
 int gk20a_pmu_sysmem_surface_alloc(struct gk20a *g, struct nvgpu_mem *mem,
 		u32 size);
+int gk20a_pmu_get_pg_stats(struct gk20a *g,
+		u32 pg_engine_id, struct pmu_pg_stats_data *pg_stat_data);
+bool nvgpu_find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos);
+
+int nvgpu_pmu_perfmon_start_sampling(struct pmu_gk20a *pmu);
+int nvgpu_pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu);
 
 #endif /*__PMU_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/sched_gk20a.c b/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
index b7edf3f05..3f3119afd 100644
--- a/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
@@ -13,10 +13,6 @@
 
 #include <asm/barrier.h>
 #include <linux/wait.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#include "platform_gk20a.h"
-#endif
 #include <linux/uaccess.h>
 #include <linux/poll.h>
 #include <uapi/linux/nvgpu.h>
@@ -523,69 +519,6 @@ int gk20a_sched_dev_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-#ifdef CONFIG_DEBUG_FS
-static int gk20a_sched_debugfs_show(struct seq_file *s, void *unused)
-{
-	struct device *dev = s->private;
-	struct gk20a *g = gk20a_get_platform(dev)->g;
-	struct gk20a_sched_ctrl *sched = &g->sched_ctrl;
-	bool sched_busy = true;
-
-	int n = sched->bitmap_size / sizeof(u64);
-	int i;
-	int err;
-
-	err = gk20a_busy(g);
-	if (err)
-		return err;
-
-	if (nvgpu_mutex_tryacquire(&sched->busy_lock)) {
-		sched_busy = false;
-		nvgpu_mutex_release(&sched->busy_lock);
-	}
-
-	seq_printf(s, "control_locked=%d\n", sched->control_locked);
-	seq_printf(s, "busy=%d\n", sched_busy);
-	seq_printf(s, "bitmap_size=%zu\n", sched->bitmap_size);
-
-	nvgpu_mutex_acquire(&sched->status_lock);
-
-	seq_puts(s, "active_tsg_bitmap\n");
-	for (i = 0; i < n; i++)
-		seq_printf(s, "\t0x%016llx\n", sched->active_tsg_bitmap[i]);
-
-	seq_puts(s, "recent_tsg_bitmap\n");
-	for (i = 0; i < n; i++)
-		seq_printf(s, "\t0x%016llx\n", sched->recent_tsg_bitmap[i]);
-
-	nvgpu_mutex_release(&sched->status_lock);
-
-	gk20a_idle(g);
-
-	return 0;
-}
-
-static int gk20a_sched_debugfs_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, gk20a_sched_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations gk20a_sched_debugfs_fops = {
-	.open		= gk20a_sched_debugfs_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-void gk20a_sched_debugfs_init(struct device *dev)
-{
-	struct gk20a_platform *platform = dev_get_drvdata(dev);
-
-	debugfs_create_file("sched_ctrl", S_IRUGO, platform->debugfs,
-			dev, &gk20a_sched_debugfs_fops);
-}
-#endif /* CONFIG_DEBUG_FS */
-
 void gk20a_sched_ctrl_tsg_added(struct gk20a *g, struct tsg_gk20a *tsg)
 {
 	struct gk20a_sched_ctrl *sched = &g->sched_ctrl;
diff --git a/drivers/gpu/nvgpu/gk20a/sched_gk20a.h b/drivers/gpu/nvgpu/gk20a/sched_gk20a.h
index 4f6d15105..776f689dd 100644
--- a/drivers/gpu/nvgpu/gk20a/sched_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/sched_gk20a.h
@@ -48,7 +48,6 @@ void gk20a_sched_ctrl_tsg_added(struct gk20a *, struct tsg_gk20a *);
 void gk20a_sched_ctrl_tsg_removed(struct gk20a *, struct tsg_gk20a *);
 int gk20a_sched_ctrl_init(struct gk20a *);
 
-void gk20a_sched_debugfs_init(struct device *dev);
 void gk20a_sched_ctrl_cleanup(struct gk20a *g);
 
 #endif /* __SCHED_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 82c587f95..c6e451e1f 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -20,6 +20,7 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/log.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/gr_gk20a.h"
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index f5328f035..831fd5dad 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -33,11 +33,11 @@
 #include "clk_gm20b.h"
 #include "mc_gm20b.h"
 #include "regops_gm20b.h"
-#include "debug_gm20b.h"
 #include "cde_gm20b.h"
 #include "therm_gm20b.h"
 #include "hal_gm20b.h"
 
+#include <nvgpu/debug.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/enabled.h>
 
@@ -234,7 +234,7 @@ int gm20b_init_hal(struct gk20a *g)
 	gm20b_init_pmu_ops(gops);
 	gm20b_init_clk_ops(gops);
 	gm20b_init_regops(gops);
-	gm20b_init_debug_ops(gops);
+	gk20a_init_debug_ops(gops);
 	gk20a_init_dbg_session_ops(gops);
 	gm20b_init_cde_ops(gops);
 	gm20b_init_therm_ops(gops);
diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c
index f28ff45f0..d923e5e92 100644
--- a/drivers/gpu/nvgpu/gp106/hal_gp106.c
+++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c
@@ -53,6 +53,7 @@
 
 #include "hal_gp106.h"
 
+#include <nvgpu/debug.h>
 #include <nvgpu/bug.h>
 
 #include <nvgpu/hw/gp106/hw_proj_gp106.h>
diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
index 98a8be2ff..9a30ad7c2 100644
--- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
@@ -23,6 +23,7 @@
 #include <nvgpu/gmmu.h>
 #include <nvgpu/dma.h>
 #include <nvgpu/bug.h>
+#include <nvgpu/debug.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/gr_gk20a.h"
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index e2a931bed..a1906a088 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -44,6 +44,7 @@
 #include "gp10b.h"
 #include "hal_gp10b.h"
 
+#include <nvgpu/debug.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/enabled.h>
 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/allocator.h b/drivers/gpu/nvgpu/include/nvgpu/allocator.h
index 3579b0fb5..567c44220 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/allocator.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/allocator.h
@@ -256,11 +256,13 @@ static inline struct gk20a *nvgpu_alloc_to_gpu(struct nvgpu_allocator *a)
 	return a->g;
 }
 
+#ifdef CONFIG_DEBUG_FS
 /*
  * Common functionality for the internals of the allocators.
  */
 void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a);
 void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a);
+#endif
 
 int  __nvgpu_alloc_common_init(struct nvgpu_allocator *a, struct gk20a *g,
 			       const char *name, void *priv, bool dbg,
@@ -281,11 +283,6 @@ static inline void nvgpu_alloc_disable_dbg(struct nvgpu_allocator *a)
  */
 extern u32 nvgpu_alloc_tracing_on;
 
-#ifdef CONFIG_DEBUG_FS
-struct device;
-void nvgpu_alloc_debugfs_init(struct device *dev);
-#endif
-
 #define nvgpu_alloc_trace_func()			\
 	do {						\
 		if (nvgpu_alloc_tracing_on)		\
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/debug.h
similarity index 54%
rename from drivers/gpu/nvgpu/gk20a/debug_gk20a.h
rename to drivers/gpu/nvgpu/include/nvgpu/debug.h
index 213922b33..70a039786 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/debug.h
@@ -14,28 +14,42 @@
  *
  */
 
-#ifndef _DEBUG_GK20A_H_
-#define _DEBUG_GK20A_H_
+#ifndef __NVGPU_DEBUG_H__
+#define __NVGPU_DEBUG_H__
 
-struct platform_device;
 struct gk20a;
 struct gpu_ops;
 
-extern unsigned int gk20a_debug_trace_cmdbuf;
-
 struct gk20a_debug_output {
 	void (*fn)(void *ctx, const char *str, size_t len);
 	void *ctx;
 	char buf[256];
 };
 
+#ifdef CONFIG_DEBUG_FS
+extern unsigned int gk20a_debug_trace_cmdbuf;
+
 void gk20a_debug_output(struct gk20a_debug_output *o,
 					const char *fmt, ...);
 
-void gk20a_debug_dump(struct device *pdev);
+void gk20a_debug_dump(struct gk20a *g);
 void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o);
-int gk20a_gr_debug_dump(struct device *pdev);
-void gk20a_debug_init(struct device *dev, const char *debugfs_symlink);
+int gk20a_gr_debug_dump(struct gk20a *g);
 void gk20a_init_debug_ops(struct gpu_ops *gops);
-void gk20a_debug_dump_device(void *dev);
+
+void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink);
+void gk20a_debug_deinit(struct gk20a *g);
+#else /* !CONFIG_DEBUG_FS: the debug entry points become empty inline stubs */
+static inline void gk20a_debug_output(struct gk20a_debug_output *o,
+					const char *fmt, ...) {}
+
+static inline void gk20a_debug_dump(struct gk20a *g) {}
+static inline void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) {}
+static inline int gk20a_gr_debug_dump(struct gk20a *g) { return 0;} /* always 0 */
+static inline void gk20a_init_debug_ops(struct gpu_ops *gops) {}
+
+static inline void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink) {}
+static inline void gk20a_debug_deinit(struct gk20a *g) {}
 #endif
+
+#endif /* __NVGPU_DEBUG_H__ */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h b/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h
index dc198a04c..611854f2b 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/linux/kmem.h
@@ -31,12 +31,6 @@ void *__nvgpu_track_kcalloc(struct gk20a *g, size_t n, size_t size,
 			    unsigned long ip);
 void  __nvgpu_track_vfree(struct gk20a *g, void *addr);
 void  __nvgpu_track_kfree(struct gk20a *g, void *addr);
-
-void nvgpu_kmem_debugfs_init(struct device *dev);
-#else
-static inline void nvgpu_kmem_debugfs_init(struct device *dev)
-{
-}
 #endif
 
 /**
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index 02cc5b474..cdd0d378c 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -22,10 +22,10 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/debug.h>
 
 #include "vgpu/vgpu.h"
 #include "vgpu/fecs_trace_vgpu.h"
-#include "gk20a/debug_gk20a.h"
 #include "gk20a/hal_gk20a.h"
 #include "gk20a/ctxsw_trace_gk20a.h"
 #include "gk20a/tsg_gk20a.h"
@@ -667,7 +667,7 @@ int vgpu_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
-	gk20a_debug_init(dev, "gpu.0");
+	gk20a_debug_init(gk20a, "gpu.0");
 
 	/* Set DMA parameters to allow larger sgt lists */
 	dev->dma_parms = &gk20a->dma_parms;