From 70ce67df2dba25bf118cbc81b645149b386ec75c Mon Sep 17 00:00:00 2001
From: Alex Waterman <alexw@nvidia.com>
Date: Mon, 1 Jun 2020 19:56:37 -0500
Subject: [PATCH] gpu: nvgpu: Add a generic profiler

Add a generic profiler based on the channel kickoff profiler. This
aims to provide a mechanism to allow engineers to (more) easily profile
arbitrary software paths within nvgpu.

Usage of this profiler is still primarily through debugfs. Next up is
a generic debugfs interface for this profiler in the Linux code.

The end goal for this is to profile the recovery code and generate
interesting statistics.

JIRA NVGPU-5606

Signed-off-by: Alex Waterman <alexw@nvidia.com>
Change-Id: I99783ec7e5143855845bde4e98760ff43350456d
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2355319
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 arch/nvgpu-common.yaml                        |  10 +-
 drivers/gpu/nvgpu/Makefile                    |   1 +
 drivers/gpu/nvgpu/Makefile.sources            |   1 +
 drivers/gpu/nvgpu/common/fifo/fifo.c          |   9 +
 drivers/gpu/nvgpu/common/fifo/submit.c        |  42 +--
 drivers/gpu/nvgpu/common/swdebug/profile.c    | 284 ++++++++++++++++++
 drivers/gpu/nvgpu/include/nvgpu/channel.h     |   4 +-
 drivers/gpu/nvgpu/include/nvgpu/fifo.h        |  15 +-
 .../gpu/nvgpu/include/nvgpu/fifo/swprofile.h  |  50 +++
 drivers/gpu/nvgpu/include/nvgpu/profile.h     |  70 -----
 drivers/gpu/nvgpu/include/nvgpu/swprofile.h   | 168 +++++++++++
 drivers/gpu/nvgpu/os/linux/channel.h          |   2 +-
 drivers/gpu/nvgpu/os/linux/debug.c            |   2 -
 drivers/gpu/nvgpu/os/linux/debug_fifo.c       | 189 +-----------
 drivers/gpu/nvgpu/os/linux/debug_fifo.h       |   3 +-
 drivers/gpu/nvgpu/os/linux/ioctl_channel.c    |  17 +-
 libs/dgpu/libnvgpu-drv-dgpu_safe.export       |   2 +
 libs/igpu/libnvgpu-drv-igpu_safe.export       |   2 +
 18 files changed, 579 insertions(+), 292 deletions(-)
 create mode 100644 drivers/gpu/nvgpu/common/swdebug/profile.c
 create mode 100644 drivers/gpu/nvgpu/include/nvgpu/fifo/swprofile.h
 delete mode 100644 drivers/gpu/nvgpu/include/nvgpu/profile.h
 create mode 100644 drivers/gpu/nvgpu/include/nvgpu/swprofile.h

diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml
index 6db08f30b..3f57422bc 100644
--- a/arch/nvgpu-common.yaml
+++ b/arch/nvgpu-common.yaml
@@ -357,8 +357,7 @@ fifo:
                  common/fifo/priv_cmdbuf.c,
                  common/fifo/job.c,
                  include/nvgpu/priv_cmdbuf.h,
-                 include/nvgpu/job.h,
-                 include/nvgpu/profile.h ]
+                 include/nvgpu/job.h ]
       deps: [ ]
     runlist:
       safe: yes
@@ -993,6 +992,13 @@ power_features:
       sources: [ common/power_features/pg/pg.c,
                  include/nvgpu/power_features/pg.h ]
 
+swdebug:
+  owner: Alex W
+  safe: no
+  sources: [ common/swdebug/profile.c,
+             include/nvgpu/swprofile.h,
+             include/nvgpu/fifo/swprofile.h ]
+
 ##
 ## HAL units. Currently they are under common but this needs to change.
 ## We are moving these to a top level directory.
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 35718b529..63c2778b0 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -182,6 +182,7 @@ nvgpu-y += \
 	common/utils/rbtree.o \
 	common/utils/string.o \
 	common/utils/worker.o \
+	common/swdebug/profile.o \
 	common/ptimer/ptimer.o \
 	common/perf/perfbuf.o \
 	common/therm/therm.o \
diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources
index 47ca68365..447c1d660 100644
--- a/drivers/gpu/nvgpu/Makefile.sources
+++ b/drivers/gpu/nvgpu/Makefile.sources
@@ -95,6 +95,7 @@ srcs +=	common/device.c \
 	common/utils/rbtree.c \
 	common/utils/string.c \
 	common/utils/worker.c \
+	common/swdebug/profile.c \
 	common/init/nvgpu_init.c \
 	common/mm/allocators/nvgpu_allocator.c \
 	common/mm/allocators/bitmap_allocator.c \
diff --git a/drivers/gpu/nvgpu/common/fifo/fifo.c b/drivers/gpu/nvgpu/common/fifo/fifo.c
index ed83b9db8..c8241c13f 100644
--- a/drivers/gpu/nvgpu/common/fifo/fifo.c
+++ b/drivers/gpu/nvgpu/common/fifo/fifo.c
@@ -35,6 +35,12 @@
 #include <nvgpu/vm_area.h>
 #include <nvgpu/nvgpu_err.h>
 #include <nvgpu/mc.h>
+#include <nvgpu/swprofile.h>
+#include <nvgpu/fifo/swprofile.h>
+
+static const char *nvgpu_fifo_kickoff_profile_events[] = {
+	NVGPU_FIFO_KICKOFF_PROFILE_EVENTS,
+};
 
 void nvgpu_fifo_cleanup_sw_common(struct gk20a *g)
 {
@@ -93,6 +99,9 @@ int nvgpu_fifo_setup_sw_common(struct gk20a *g)
 	nvgpu_mutex_init(&f->deferred_reset_mutex);
 #endif
 
+	nvgpu_swprofile_initialize(g, &f->kickoff_profiler,
+				 nvgpu_fifo_kickoff_profile_events);
+
 	err = nvgpu_channel_setup_sw(g);
 	if (err != 0) {
 		nvgpu_err(g, "failed to init channel support");
diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
index 9e288937e..fe315df29 100644
--- a/drivers/gpu/nvgpu/common/fifo/submit.c
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -34,11 +34,13 @@
 #include <nvgpu/priv_cmdbuf.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/fence.h>
-#include <nvgpu/profile.h>
+#include <nvgpu/swprofile.h>
 #include <nvgpu/vpr.h>
 #include <nvgpu/trace.h>
 #include <nvgpu/nvhost.h>
 
+#include <nvgpu/fifo/swprofile.h>
+
 /*
  * We might need two extra gpfifo entries per submit - one for pre fence and
  * one for post fence.
@@ -340,7 +342,7 @@ static int nvgpu_submit_prepare_gpfifo_track(struct nvgpu_channel *c,
 		u32 flags,
 		struct nvgpu_channel_fence *fence,
 		struct nvgpu_fence_type **fence_out,
-		struct nvgpu_profile *profile,
+		struct nvgpu_swprofiler *profiler,
 		bool need_deferred_cleanup)
 {
 	bool skip_buffer_refcounting = (flags &
@@ -358,7 +360,7 @@ static int nvgpu_submit_prepare_gpfifo_track(struct nvgpu_channel *c,
 		goto clean_up_job;
 	}
 
-	nvgpu_profile_snapshot(profile, PROFILE_JOB_TRACKING);
+	nvgpu_swprofile_snapshot(profiler, PROF_KICKOFF_JOB_TRACKING);
 
 	/*
 	 * wait_cmd can be unset even if flag_fence_wait exists; the
@@ -432,11 +434,11 @@ static int nvgpu_submit_prepare_gpfifo_notrack(struct nvgpu_channel *c,
 		struct nvgpu_gpfifo_userdata userdata,
 		u32 num_entries,
 		struct nvgpu_fence_type **fence_out,
-		struct nvgpu_profile *profile)
+		struct nvgpu_swprofiler *profiler)
 {
 	int err;
 
-	nvgpu_profile_snapshot(profile, PROFILE_JOB_TRACKING);
+	nvgpu_swprofile_snapshot(profiler, PROF_KICKOFF_JOB_TRACKING);
 
 	err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata,
 			num_entries);
@@ -475,7 +477,7 @@ static int nvgpu_do_submit(struct nvgpu_channel *c,
 		u32 flags,
 		struct nvgpu_channel_fence *fence,
 		struct nvgpu_fence_type **fence_out,
-		struct nvgpu_profile *profile,
+		struct nvgpu_swprofiler *profiler,
 		bool need_job_tracking,
 		bool need_deferred_cleanup)
 {
@@ -502,17 +504,17 @@ static int nvgpu_do_submit(struct nvgpu_channel *c,
 	if (need_job_tracking) {
 		err = nvgpu_submit_prepare_gpfifo_track(c, gpfifo,
 				userdata, num_entries, flags, fence,
-				fence_out, profile, need_deferred_cleanup);
+				fence_out, profiler, need_deferred_cleanup);
 	} else {
 		err = nvgpu_submit_prepare_gpfifo_notrack(c, gpfifo,
-				userdata, num_entries, fence_out, profile);
+				userdata, num_entries, fence_out, profiler);
 	}
 
 	if (err != 0) {
 		return err;
 	}
 
-	nvgpu_profile_snapshot(profile, PROFILE_APPEND);
+	nvgpu_swprofile_snapshot(profiler, PROF_KICKOFF_APPEND);
 
 	g->ops.userd.gp_put(g, c);
 
@@ -527,7 +529,7 @@ static int nvgpu_submit_deterministic(struct nvgpu_channel *c,
 				u32 flags,
 				struct nvgpu_channel_fence *fence,
 				struct nvgpu_fence_type **fence_out,
-				struct nvgpu_profile *profile)
+				struct nvgpu_swprofiler *profiler)
 {
 	bool skip_buffer_refcounting = (flags &
 			NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING) != 0U;
@@ -608,7 +610,7 @@ static int nvgpu_submit_deterministic(struct nvgpu_channel *c,
 	}
 
 	err = nvgpu_do_submit(c, gpfifo, userdata, num_entries, flags, fence,
-			fence_out, profile, need_job_tracking, false);
+			fence_out, profiler, need_job_tracking, false);
 	if (err != 0) {
 		goto clean_up;
 	}
@@ -633,7 +635,7 @@ static int nvgpu_submit_nondeterministic(struct nvgpu_channel *c,
 				u32 flags,
 				struct nvgpu_channel_fence *fence,
 				struct nvgpu_fence_type **fence_out,
-				struct nvgpu_profile *profile)
+				struct nvgpu_swprofiler *profiler)
 {
 	bool skip_buffer_refcounting = (flags &
 			NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING) != 0U;
@@ -682,7 +684,7 @@ static int nvgpu_submit_nondeterministic(struct nvgpu_channel *c,
 	}
 
 	err = nvgpu_do_submit(c, gpfifo, userdata, num_entries, flags, fence,
-			fence_out, profile, need_job_tracking, true);
+			fence_out, profiler, need_job_tracking, true);
 	if (err != 0) {
 		goto clean_up;
 	}
@@ -734,7 +736,7 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 				u32 flags,
 				struct nvgpu_channel_fence *fence,
 				struct nvgpu_fence_type **fence_out,
-				struct nvgpu_profile *profile)
+				struct nvgpu_swprofiler *profiler)
 {
 	struct gk20a *g = c->g;
 	int err;
@@ -755,7 +757,7 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 		return -ENOMEM;
 	}
 
-	nvgpu_profile_snapshot(profile, PROFILE_ENTRY);
+	nvgpu_swprofile_snapshot(profiler, PROF_KICKOFF_ENTRY);
 
 	/* update debug settings */
 	nvgpu_ltc_sync_enabled(g);
@@ -765,12 +767,12 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 #ifdef CONFIG_NVGPU_DETERMINISTIC_CHANNELS
 	if (c->deterministic) {
 		err = nvgpu_submit_deterministic(c, gpfifo, userdata,
-				num_entries, flags, fence, fence_out, profile);
+				num_entries, flags, fence, fence_out, profiler);
 	} else
 #endif
 	{
 		err = nvgpu_submit_nondeterministic(c, gpfifo, userdata,
-				num_entries, flags, fence, fence_out, profile);
+				num_entries, flags, fence, fence_out, profiler);
 	}
 
 	if (err != 0) {
@@ -793,7 +795,7 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 	nvgpu_log_info(g, "post-submit put %d, get %d, size %d",
 		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
 
-	nvgpu_profile_snapshot(profile, PROFILE_END);
+	nvgpu_swprofile_snapshot(profiler, PROF_KICKOFF_END);
 
 	nvgpu_log_fn(g, "done");
 	return err;
@@ -805,10 +807,10 @@ int nvgpu_submit_channel_gpfifo_user(struct nvgpu_channel *c,
 				u32 flags,
 				struct nvgpu_channel_fence *fence,
 				struct nvgpu_fence_type **fence_out,
-				struct nvgpu_profile *profile)
+				struct nvgpu_swprofiler *profiler)
 {
 	return nvgpu_submit_channel_gpfifo(c, NULL, userdata, num_entries,
-			flags, fence, fence_out, profile);
+			flags, fence, fence_out, profiler);
 }
 
 int nvgpu_submit_channel_gpfifo_kernel(struct nvgpu_channel *c,
diff --git a/drivers/gpu/nvgpu/common/swdebug/profile.c b/drivers/gpu/nvgpu/common/swdebug/profile.c
new file mode 100644
index 000000000..3bab64696
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/swdebug/profile.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/swprofile.h>
+#include <nvgpu/lock.h>
+#include <nvgpu/kref.h>
+#include <nvgpu/debug.h>
+#include <nvgpu/kmem.h>
+#include <nvgpu/timers.h>
+#include <nvgpu/sort.h>
+#include <nvgpu/log.h>
+
+/*
+ * A simple profiler, capable of generating histograms.
+ */
+
+/*
+ * The sample array is a 1d array comprised of repeating rows of data. To
+ * index the array as though it were a row-major matrix, we need to do some
+ * simple math.
+ */
+static inline u32 matrix_to_linear_index(struct nvgpu_swprofiler *p,
+					 u32 row, u32 col)
+{
+	return (row * p->psample_len) + col;
+}
+
+void nvgpu_swprofile_initialize(struct gk20a *g,
+				struct nvgpu_swprofiler *p,
+				const char *col_names[])
+{
+	if (p->col_names != NULL) {
+		/*
+		 * Profiler is already initialized.
+		 */
+		return;
+	}
+
+	nvgpu_mutex_init(&p->lock);
+	p->g = g;
+
+	p->col_names = col_names;
+
+	p->psample_len = 0U;
+	while (col_names[p->psample_len] != NULL) {
+		p->psample_len++;
+	}
+}
+
+int nvgpu_swprofile_open(struct gk20a *g, struct nvgpu_swprofiler *p)
+{
+	int ret = 0;
+
+	nvgpu_mutex_acquire(&p->lock);
+
+	/*
+	 * If this profiler is already opened, just take a ref and return.
+	 */
+	if (p->samples != NULL) {
+		nvgpu_ref_get(&p->ref);
+		goto done;
+	}
+
+	/*
+	 * Otherwise this is the first open: allocate the sample matrix.
+	 */
+	p->samples = nvgpu_vzalloc(g,
+				   PROFILE_ENTRIES * p->psample_len *
+				   sizeof(*p->samples));
+	if (p->samples == NULL) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	nvgpu_ref_init(&p->ref);
+
+done:
+	nvgpu_mutex_release(&p->lock);
+	return ret;
+}
+
+static void nvgpu_swprofile_free(struct nvgpu_ref *ref)
+{
+	struct nvgpu_swprofiler *p = container_of(ref, struct nvgpu_swprofiler, ref);
+
+	nvgpu_vfree(p->g, p->samples);
+	p->samples = NULL;
+}
+
+void nvgpu_swprofile_close(struct nvgpu_swprofiler *p)
+{
+	nvgpu_ref_put(&p->ref, nvgpu_swprofile_free);
+}
+
+/*
+ * Note: this does _not_ lock the profiler. This is a conscious choice. If we
+ * do lock the profiler then there's the possibility that you get bad data due
+ * to the snapshot blocking on some other user printing the contents of the
+ * profiler.
+ *
+ * Instead, this way, it's possible that someone printing the data in the
+ * profiler gets a sample that's a mix of old and new. That's not great, but
+ * IMO better than a completely bogus sample.
+ *
+ * Also it's really quite unlikely for this race to happen in practice as the
+ * print function is executed as a result of a debugfs call.
+ */
+void nvgpu_swprofile_snapshot(struct nvgpu_swprofiler *p, u32 idx)
+{
+	u32 index;
+
+	/*
+	 * Handle two cases: the first allows calling code to simply skip
+	 * any profiling by passing in a NULL profiler; see the CDE code
+	 * for this. The second case is if a profiler is not "opened".
+	 */
+	if (p == NULL || p->samples == NULL) {
+		return;
+	}
+
+	/*
+	 * p->sample_index is the current row, aka sample, we are writing to.
+	 * idx is the column - i.e the sub-sample.
+	 */
+	index = matrix_to_linear_index(p, p->sample_index, idx);
+
+	p->samples[index] = nvgpu_current_time_ns();
+}
+
+void nvgpu_swprofile_begin_sample(struct nvgpu_swprofiler *p)
+{
+	nvgpu_mutex_acquire(&p->lock);
+	p->sample_index++;
+
+	/* Handle wrap. */
+	if (p->sample_index >= PROFILE_ENTRIES) {
+		p->sample_index = 0U;
+	}
+	nvgpu_mutex_release(&p->lock);
+}
+
+static int profile_cmp(const void *a, const void *b)
+{
+	/* Compare, don't subtract: a u64 delta truncated to int can flip sign. */
+	return (*(const u64 *)a > *(const u64 *)b) - (*(const u64 *)a < *(const u64 *)b);
+}
+#define PERCENTILE_WIDTH	5
+#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)
+
+static u32 nvgpu_swprofile_build_ranges(struct nvgpu_swprofiler *p,
+					u64 *storage,
+					u64 *percentiles,
+					u32 index_end,
+					u32 index_start)
+{
+	u32 i;
+	u32 nelem = 0U;
+
+	/*
+	 * Iterate through a column and build a temporary slice array of samples
+	 * so that we can sort them without corrupting the current data.
+	 *
+	 * Note that we have to first convert the row/column indexes into linear
+	 * indexes to access the underlying sample array.
+	 */
+	for (i = 0; i < PROFILE_ENTRIES; i++) {
+		u32 linear_idx_start = matrix_to_linear_index(p, i, index_start);
+		u32 linear_idx_end = matrix_to_linear_index(p, i, index_end);
+
+		if (p->samples[linear_idx_end] <=
+		    p->samples[linear_idx_start]) {
+			/* This is an invalid element */
+			continue;
+		}
+
+		storage[nelem] = p->samples[linear_idx_end] -
+				 p->samples[linear_idx_start];
+		nelem++;
+	}
+
+	/* sort it */
+	sort(storage, nelem, sizeof(u64), profile_cmp, NULL);
+
+	/* build ranges */
+	for (i = 0; i < PERCENTILE_RANGES; i++) {
+		percentiles[i] = nelem < PERCENTILE_RANGES ? 0 :
+			storage[(PERCENTILE_WIDTH * (i + 1) * nelem)/100 - 1];
+	}
+
+	return nelem;
+}
+
+/*
+ * Print a list of percentiles spaced by 5%. Note that the debug_context needs
+ * to be special here. _Most_ print functions in NvGPU automatically add a new
+ * line to the end of each print statement. This function _specifically_
+ * requires that your debug print function does _NOT_ do this.
+ */
+void nvgpu_swprofile_print_ranges(struct gk20a *g,
+				  struct nvgpu_swprofiler *p,
+				  struct nvgpu_debug_context *o)
+{
+	u32 nelem = 0U, i, j;
+	u64 *sorted_data = NULL;
+	u64 *percentiles = NULL;
+
+	nvgpu_mutex_acquire(&p->lock);
+
+	if (p->samples == NULL) {
+		gk20a_debug_output(o, "Profiler not enabled.\n");
+		goto done;
+	}
+
+	sorted_data = nvgpu_vzalloc(g,
+				    PROFILE_ENTRIES * p->psample_len *
+				    sizeof(u64));
+	percentiles = nvgpu_vzalloc(g,
+				    PERCENTILE_RANGES * p->psample_len *
+				    sizeof(u64));
+	if (!sorted_data || !percentiles) {
+		nvgpu_err(g, "vzalloc: OOM!");
+		goto done;
+	}
+
+	/*
+	 * Loop over each column; sort the column's data and then build
+	 * percentile ranges based on that sorted data.
+	 */
+	for (i = 0U; i < p->psample_len; i++) {
+		nelem = nvgpu_swprofile_build_ranges(p,
+						   &sorted_data[i * PROFILE_ENTRIES],
+						   &percentiles[i * PERCENTILE_RANGES],
+						   i, 0U);
+	}
+
+	gk20a_debug_output(o, "Samples: %u\n", nelem);
+	gk20a_debug_output(o, "%6s", "Perc");
+	for (i = 0U; i < p->psample_len; i++) {
+		gk20a_debug_output(o, " %15s", p->col_names[i]);
+	}
+	gk20a_debug_output(o, "\n");
+	gk20a_debug_output(o, "%6s", "----");
+	for (i = 0U; i < p->psample_len; i++) {
+		gk20a_debug_output(o, " %15s", "---------------");
+	}
+	gk20a_debug_output(o, "\n");
+
+	/*
+	 * percentiles is another matrix, but this time it's using column major indexing.
+	 */
+	for (i = 0U; i < PERCENTILE_RANGES; i++) {
+		gk20a_debug_output(o, "%3upc ", PERCENTILE_WIDTH * (i + 1));
+		for (j = 0U; j < p->psample_len; j++) {
+			gk20a_debug_output(o, " %15llu",
+					   percentiles[(j * PERCENTILE_RANGES) + i]);
+		}
+		gk20a_debug_output(o, "\n");
+	}
+	gk20a_debug_output(o, "\n");
+
+done:
+	nvgpu_vfree(g, sorted_data);
+	nvgpu_vfree(g, percentiles);
+	nvgpu_mutex_release(&p->lock);
+}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 7c2a39cc6..fca50b5ff 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -40,7 +40,7 @@
 struct gk20a;
 struct dbg_session_gk20a;
 struct nvgpu_fence_type;
-struct nvgpu_profile;
+struct nvgpu_swprofiler;
 struct nvgpu_channel_sync;
 struct nvgpu_gpfifo_userdata;
 struct nvgpu_gr_subctx;
@@ -576,7 +576,7 @@ int nvgpu_submit_channel_gpfifo_user(struct nvgpu_channel *c,
 				u32 flags,
 				struct nvgpu_channel_fence *fence,
 				struct nvgpu_fence_type **fence_out,
-				struct nvgpu_profile *profile);
+				struct nvgpu_swprofiler *profiler);
 
 int nvgpu_submit_channel_gpfifo_kernel(struct nvgpu_channel *c,
 				struct nvgpu_gpfifo_entry *gpfifo,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/fifo.h b/drivers/gpu/nvgpu/include/nvgpu/fifo.h
index a74dc8a26..0b9048edf 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/fifo.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/fifo.h
@@ -197,6 +197,8 @@
 #include <nvgpu/lock.h>
 #include <nvgpu/kref.h>
 #include <nvgpu/list.h>
+#include <nvgpu/swprofile.h>
+
 /**
  * H/w defined value for Channel ID type
  */
@@ -231,6 +233,7 @@ struct nvgpu_engine_info;
 struct nvgpu_runlist_info;
 struct nvgpu_channel;
 struct nvgpu_tsg;
+struct nvgpu_swprofiler;
 
 struct nvgpu_fifo {
 	/** Pointer to GPU driver struct. */
@@ -297,16 +300,8 @@ struct nvgpu_fifo {
 	/** Number of active runlists. */
 	u32 num_runlists;
 
-#ifdef CONFIG_DEBUG_FS
-	struct {
-		struct nvgpu_profile *data;
-		nvgpu_atomic_t get;
-		bool enabled;
-		u64 *sorted;
-		struct nvgpu_ref ref;
-		struct nvgpu_mutex lock;
-	} profile;
-#endif
+	struct nvgpu_swprofiler kickoff_profiler;
+
 #ifdef CONFIG_NVGPU_USERD
 	struct nvgpu_mutex userd_mutex;
 	struct nvgpu_mem *userd_slabs;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/fifo/swprofile.h b/drivers/gpu/nvgpu/include/nvgpu/fifo/swprofile.h
new file mode 100644
index 000000000..57169aef9
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/fifo/swprofile.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NVGPU_FIFO_PROFILE_H
+#define NVGPU_FIFO_PROFILE_H
+
+/*
+ * Define these here, not in the C file so that they are closer to the other
+ * macro definitions below. The two lists must be in sync.
+ */
+#define NVGPU_FIFO_KICKOFF_PROFILE_EVENTS	\
+	"ioctl_entry",				\
+	"entry",				\
+	"job_tracking",				\
+	"append",				\
+	"end",					\
+	"ioctl_exit",				\
+	NULL					\
+
+/*
+ * The kickoff profile events; these are used to index into the profile's sample
+ * array.
+ */
+#define PROF_KICKOFF_IOCTL_ENTRY		0U
+#define PROF_KICKOFF_ENTRY			1U
+#define PROF_KICKOFF_JOB_TRACKING		2U
+#define PROF_KICKOFF_APPEND			3U
+#define PROF_KICKOFF_END			4U
+#define PROF_KICKOFF_IOCTL_EXIT			5U
+
+#endif
diff --git a/drivers/gpu/nvgpu/include/nvgpu/profile.h b/drivers/gpu/nvgpu/include/nvgpu/profile.h
deleted file mode 100644
index dc54e7ffe..000000000
--- a/drivers/gpu/nvgpu/include/nvgpu/profile.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2011-2019, NVIDIA CORPORATION.  All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef NVGPU_PROFILE_H
-#define NVGPU_PROFILE_H
-
-/*
- * Number of entries in the kickoff latency buffer, used to calculate
- * the profiling and histogram. This number is calculated to be statistically
- * significative on a histogram on a 5% step
- */
-#ifdef CONFIG_DEBUG_FS
-#define FIFO_PROFILING_ENTRIES	16384U
-#endif
-
-enum {
-	PROFILE_IOCTL_ENTRY = 0U,
-	PROFILE_ENTRY,
-	PROFILE_JOB_TRACKING,
-	PROFILE_APPEND,
-	PROFILE_END,
-	PROFILE_IOCTL_EXIT,
-	PROFILE_MAX
-};
-
-struct nvgpu_profile {
-	u64 timestamp[PROFILE_MAX];
-};
-
-#ifdef CONFIG_DEBUG_FS
-struct nvgpu_profile *nvgpu_profile_acquire(struct gk20a *g);
-void nvgpu_profile_release(struct gk20a *g,
-	struct nvgpu_profile *profile);
-void nvgpu_profile_snapshot(struct nvgpu_profile *profile, int idx);
-#else
-static inline struct nvgpu_profile *
-nvgpu_profile_acquire(struct gk20a *g)
-{
-	return NULL;
-}
-static inline void nvgpu_profile_release(struct gk20a *g,
-	struct nvgpu_profile *profile)
-{
-}
-static inline void nvgpu_profile_snapshot(
-		struct nvgpu_profile *profile, int idx)
-{
-}
-#endif
-
-#endif /* NVGPU_PROFILE_H */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/swprofile.h b/drivers/gpu/nvgpu/include/nvgpu/swprofile.h
new file mode 100644
index 000000000..94bcf28e4
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/swprofile.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NVGPU_PROFILE_H
+#define NVGPU_PROFILE_H
+
+#include <nvgpu/lock.h>
+#include <nvgpu/types.h>
+#include <nvgpu/kref.h>
+
+struct nvgpu_debug_context;
+
+/*
+ * Number of entries in the kickoff latency buffer used to calculate the
+ * profiling and histogram. This number is calculated to be statistically
+ * significant on a histogram on a 5% step.
+ */
+#define PROFILE_ENTRIES		16384U
+
+struct nvgpu_swprofiler {
+	struct nvgpu_mutex    lock;
+
+	/**
+	 * The number of sample components that make up a sample for this
+	 * profiler.
+	 */
+	u32                   psample_len;
+
+	/**
+	 * Sample array: this is essentially a matrix where rows correspond to
+	 * a given sample and columns correspond to a type of sub-sample. Number
+	 * of samples is always %PROFILE_ENTRIES. This 1d array is accessed with
+	 * row-major indexing.
+	 */
+	u64                  *samples;
+
+	/**
+	 * Row (sample) in the sample array currently being written. Wraps back
+	 * to zero at %PROFILE_ENTRIES.
+	 */
+	u32                   sample_index;
+
+	/**
+	 * Column names used for printing the histogram. This is NULL terminated
+	 * so that the profiler can infer the number of subsamples in a
+	 * psample.
+	 */
+	const char          **col_names;
+
+	struct nvgpu_ref      ref;
+
+	/**
+	 * Necessary since we won't have access to a gk20a struct to vfree()
+	 * against when this profiler is freed via an nvgpu_ref.
+	 */
+	struct gk20a         *g;
+};
+
+/**
+ * @brief Create a profiler with the passed column names.
+ *
+ * @param[in] g          The GPU that owns this profiler.
+ * @param[in] p          Pointer to a profiler object to initialize.
+ * @param[in] col_names  %NULL terminated list of column names.
+ *
+ * The sample array length is determined by the NULL terminated %col_names
+ * array. This will not allocate the underlying data; that's controlled by
+ * the open and close functions:
+ *
+ *    nvgpu_swprofile_open()
+ *    nvgpu_swprofile_close()
+ *
+ * Once nvgpu_swprofile_initialize() is called all of the below functions
+ * may also be called. All of the sampling related functions will become
+ * no-ops if the SW profiler is not opened.
+ */
+void nvgpu_swprofile_initialize(struct gk20a *g,
+				struct nvgpu_swprofiler *p,
+				const char **col_names);
+
+/**
+ * @brief Open a profiler for use.
+ *
+ * @param[in] g   The GPU that owns this profiler.
+ * @param[in] p   The profiler to open.
+ *
+ * This function prepares a SW profiler object for actual profiling. Necessary
+ * data structures are allocated and subsequent snapshots will be captured.
+ *
+ * SW profiler objects are reference counted: for each open call made, a
+ * corresponding close call must also be made.
+ *
+ * @return Returns 0 on success, otherwise a negative error code.
+ */
+int nvgpu_swprofile_open(struct gk20a *g, struct nvgpu_swprofiler *p);
+
+/**
+ * @brief Close a profiler.
+ *
+ * @param[in] p  The profiler to close.
+ *
+ * Close call corresponding to nvgpu_swprofile_open().
+ */
+void nvgpu_swprofile_close(struct nvgpu_swprofiler *p);
+
+/**
+ * @brief Begin a series of timestamp samples.
+ *
+ * @param[in] p  The profiler to start sampling with.
+ *
+ * Each iteration through a given SW sequence requires one call to this
+ * function. It essentially just increments (with wraparound) an internal
+ * tracker which points to the sample space in the internal sample array.
+ * Typical usage is to call nvgpu_swprofile_begin_sample() and then a
+ * sequence of calls to nvgpu_swprofile_snapshot().
+ *
+ * Once done with the sequence being profiled nothing needs to happen. When
+ * the next iteration of the sequence is executed this function should be
+ * called again.
+ */
+void nvgpu_swprofile_begin_sample(struct nvgpu_swprofiler *p);
+
+/**
+ * @brief Capture a timestamp sample.
+ *
+ * @param[in] p    The profiler to sample with.
+ * @param[in] idx  The index to the subsample to capture.
+ *
+ * This captures a subsample. Any given run through a SW sequence that is
+ * being profiled will result in one or more subsamples which together make
+ * up a sample.
+ */
+void nvgpu_swprofile_snapshot(struct nvgpu_swprofiler *p, u32 idx);
+
+/**
+ * @brief Print percentile ranges for a SW profiler.
+ *
+ * @param[in] g   The GPU that owns this profiler.
+ * @param[in] p   The profiler to print.
+ * @param[in] o   A debug context object used for printing.
+ *
+ * Print a percentile table for all columns of sub-samples. This gives a
+ * good overview of the collected data.
+ */
+void nvgpu_swprofile_print_ranges(struct gk20a *g,
+				  struct nvgpu_swprofiler *p,
+				  struct nvgpu_debug_context *o);
+
+#endif /* NVGPU_PROFILE_H */
diff --git a/drivers/gpu/nvgpu/os/linux/channel.h b/drivers/gpu/nvgpu/os/linux/channel.h
index d7262c3fe..b9d27abc9 100644
--- a/drivers/gpu/nvgpu/os/linux/channel.h
+++ b/drivers/gpu/nvgpu/os/linux/channel.h
@@ -26,7 +26,7 @@ struct nvgpu_gpfifo;
 struct nvgpu_submit_gpfifo_args;
 struct nvgpu_channel_fence;
 struct nvgpu_fence_type;
-struct nvgpu_profile;
+struct nvgpu_swprofiler;
 struct nvgpu_os_linux;
 
 struct sync_fence;
diff --git a/drivers/gpu/nvgpu/os/linux/debug.c b/drivers/gpu/nvgpu/os/linux/debug.c
index de7bd2ced..e4d99bc49 100644
--- a/drivers/gpu/nvgpu/os/linux/debug.c
+++ b/drivers/gpu/nvgpu/os/linux/debug.c
@@ -442,8 +442,6 @@ void gk20a_debug_deinit(struct gk20a *g)
 	if (!l->debugfs)
 		return;
 
-	gk20a_fifo_debugfs_deinit(g);
-
 	debugfs_remove_recursive(l->debugfs);
 	debugfs_remove(l->debugfs_alias);
 }
diff --git a/drivers/gpu/nvgpu/os/linux/debug_fifo.c b/drivers/gpu/nvgpu/os/linux/debug_fifo.c
index d2752c8c1..4239861bb 100644
--- a/drivers/gpu/nvgpu/os/linux/debug_fifo.c
+++ b/drivers/gpu/nvgpu/os/linux/debug_fifo.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2019 NVIDIA Corporation.  All rights reserved.
+ * Copyright (C) 2017-2020 NVIDIA Corporation.  All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -9,7 +9,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
  */
 
 #include "debug_fifo.h"
@@ -24,7 +23,9 @@
 #include <nvgpu/gr/ctx.h>
 #include <nvgpu/engines.h>
 #include <nvgpu/runlist.h>
-#include <nvgpu/profile.h>
+#include <nvgpu/swprofile.h>
+
+#include <nvgpu/fifo/swprofile.h>
 
 void __gk20a_fifo_profile_free(struct nvgpu_ref *ref);
 
@@ -147,41 +148,12 @@ static int gk20a_fifo_profile_enable(void *data, u64 val)
 	struct gk20a *g = (struct gk20a *) data;
 	struct nvgpu_fifo *f = &g->fifo;
 
-
-	nvgpu_mutex_acquire(&f->profile.lock);
 	if (val == 0) {
-		if (f->profile.enabled) {
-			f->profile.enabled = false;
-			nvgpu_ref_put(&f->profile.ref,
-				__gk20a_fifo_profile_free);
-		}
+		nvgpu_swprofile_close(&f->kickoff_profiler);
+		return 0;
 	} else {
-		if (!f->profile.enabled) {
-			/* not kref init as it can have a running condition if
-			 * we enable/disable/enable while kickoff is happening
-			 */
-			if (!nvgpu_ref_get_unless_zero(&f->profile.ref)) {
-				f->profile.data = nvgpu_vzalloc(g,
-					FIFO_PROFILING_ENTRIES *
-					sizeof(struct nvgpu_profile));
-				f->profile.sorted  = nvgpu_vzalloc(g,
-					FIFO_PROFILING_ENTRIES *
-					sizeof(u64));
-				if (!(f->profile.data && f->profile.sorted)) {
-					nvgpu_vfree(g, f->profile.data);
-					nvgpu_vfree(g, f->profile.sorted);
-					nvgpu_mutex_release(&f->profile.lock);
-					return -ENOMEM;
-				}
-				nvgpu_ref_init(&f->profile.ref);
-			}
-			atomic_set(&f->profile.get.atomic_var, 0);
-			f->profile.enabled = true;
-		}
+		return nvgpu_swprofile_open(g, &f->kickoff_profiler);
 	}
-	nvgpu_mutex_release(&f->profile.lock);
-
-	return 0;
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(
@@ -191,96 +163,20 @@ DEFINE_SIMPLE_ATTRIBUTE(
 	"%llu\n"
 );
 
-static int __profile_cmp(const void *a, const void *b)
+static void gk20a_fifo_write_to_seqfile_no_nl(void *ctx, const char *str)
 {
-	return *((unsigned long long *) a) - *((unsigned long long *) b);
-}
-
-/*
- * This uses about 800b in the stack, but the function using it is not part
- * of a callstack where much memory is being used, so it is fine
- */
-#define PERCENTILE_WIDTH	5
-#define PERCENTILE_RANGES	(100/PERCENTILE_WIDTH)
-
-static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
-		u64 *percentiles, u32 index_end, u32 index_start)
-{
-	unsigned int nelem = 0;
-	unsigned int index;
-	struct nvgpu_profile *profile;
-
-	for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
-		profile = &g->fifo.profile.data[index];
-
-		if (profile->timestamp[index_end] >
-				profile->timestamp[index_start]) {
-			/* This is a valid element */
-			g->fifo.profile.sorted[nelem] =
-						profile->timestamp[index_end] -
-						profile->timestamp[index_start];
-			nelem++;
-		}
-	}
-
-	/* sort it */
-	sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
-		__profile_cmp, NULL);
-
-	/* build ranges */
-	for (index = 0; index < PERCENTILE_RANGES; index++) {
-		percentiles[index] = nelem < PERCENTILE_RANGES ? 0 :
-			g->fifo.profile.sorted[(PERCENTILE_WIDTH * (index + 1) *
-						nelem)/100 - 1];
-	}
-	return nelem;
+	seq_puts((struct seq_file *)ctx, str); /* no varargs: emit verbatim */
 }
 
 static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
 {
 	struct gk20a *g = s->private;
-	unsigned int get, nelem, index;
-	/*
-	 * 800B in the stack, but function is declared statically and only
-	 * called from debugfs handler
-	 */
-	u64 percentiles_ioctl[PERCENTILE_RANGES];
-	u64 percentiles_kickoff[PERCENTILE_RANGES];
-	u64 percentiles_jobtracking[PERCENTILE_RANGES];
-	u64 percentiles_append[PERCENTILE_RANGES];
-	u64 percentiles_userd[PERCENTILE_RANGES];
+	struct nvgpu_debug_context o = {
+		.fn = gk20a_fifo_write_to_seqfile_no_nl,
+		.ctx = s,
+	};
 
-	if (!nvgpu_ref_get_unless_zero(&g->fifo.profile.ref)) {
-		seq_printf(s, "Profiling disabled\n");
-		return 0;
-	}
-
-	get = atomic_read(&g->fifo.profile.get.atomic_var);
-
-	__gk20a_fifo_create_stats(g, percentiles_ioctl,
-		PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_kickoff,
-		PROFILE_END, PROFILE_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_jobtracking,
-		PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_append,
-		PROFILE_APPEND, PROFILE_JOB_TRACKING);
-	nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
-		PROFILE_END, PROFILE_APPEND);
-
-	seq_printf(s, "Number of kickoffs: %d\n", nelem);
-	seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
-
-	for (index = 0; index < PERCENTILE_RANGES; index++)
-		seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
-			PERCENTILE_WIDTH * (index+1),
-			percentiles_ioctl[index],
-			percentiles_kickoff[index],
-			percentiles_append[index],
-			percentiles_jobtracking[index],
-			percentiles_userd[index]);
-
-	nvgpu_ref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+	nvgpu_swprofile_print_ranges(g, &g->fifo.kickoff_profiler, &o);
 
 	return 0;
 }
@@ -297,7 +193,6 @@ static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
 	.release	= single_release,
 };
 
-
 void gk20a_fifo_debugfs_init(struct gk20a *g)
 {
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
@@ -318,11 +213,6 @@ void gk20a_fifo_debugfs_init(struct gk20a *g)
 	if (IS_ERR_OR_NULL(profile_root))
 		return;
 
-	nvgpu_mutex_init(&g->fifo.profile.lock);
-	g->fifo.profile.enabled = false;
-	atomic_set(&g->fifo.profile.get.atomic_var, 0);
-	atomic_set(&g->fifo.profile.ref.refcount.atomic_var, 0);
-
 	debugfs_create_file("enable", 0600, profile_root, g,
 		&gk20a_fifo_profile_enable_debugfs_fops);
 
@@ -330,54 +220,3 @@ void gk20a_fifo_debugfs_init(struct gk20a *g)
 		&gk20a_fifo_profile_stats_debugfs_fops);
 
 }
-
-void nvgpu_profile_snapshot(struct nvgpu_profile *profile, int idx)
-{
-	if (profile)
-		profile->timestamp[idx] = nvgpu_current_time_ns();
-}
-
-void __gk20a_fifo_profile_free(struct nvgpu_ref *ref)
-{
-	struct nvgpu_fifo *f = container_of(ref, struct nvgpu_fifo,
-						profile.ref);
-	nvgpu_vfree(f->g, f->profile.data);
-	nvgpu_vfree(f->g, f->profile.sorted);
-}
-
-/* Get the next element in the ring buffer of profile entries
- * and grab a reference to the structure
- */
-struct nvgpu_profile *nvgpu_profile_acquire(struct gk20a *g)
-{
-	struct nvgpu_fifo *f = &g->fifo;
-	struct nvgpu_profile *profile;
-	unsigned int index;
-
-	/* If kref is zero, profiling is not enabled */
-	if (!nvgpu_ref_get_unless_zero(&f->profile.ref))
-		return NULL;
-	index = atomic_inc_return(&f->profile.get.atomic_var);
-	profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
-
-	return profile;
-}
-
-/* Free the reference to the structure. This allows deferred cleanups */
-void nvgpu_profile_release(struct gk20a *g,
-					struct nvgpu_profile *profile)
-{
-	nvgpu_ref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
-}
-
-void gk20a_fifo_debugfs_deinit(struct gk20a *g)
-{
-	struct nvgpu_fifo *f = &g->fifo;
-
-	nvgpu_mutex_acquire(&f->profile.lock);
-	if (f->profile.enabled) {
-		f->profile.enabled = false;
-		nvgpu_ref_put(&f->profile.ref, __gk20a_fifo_profile_free);
-	}
-	nvgpu_mutex_release(&f->profile.lock);
-}
diff --git a/drivers/gpu/nvgpu/os/linux/debug_fifo.h b/drivers/gpu/nvgpu/os/linux/debug_fifo.h
index 46ac853e6..0c02aa42d 100644
--- a/drivers/gpu/nvgpu/os/linux/debug_fifo.h
+++ b/drivers/gpu/nvgpu/os/linux/debug_fifo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 NVIDIA Corporation.  All rights reserved.
+ * Copyright (C) 2017-2020 NVIDIA Corporation.  All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -17,6 +17,5 @@
 
 struct gk20a;
 void gk20a_fifo_debugfs_init(struct gk20a *g);
-void gk20a_fifo_debugfs_deinit(struct gk20a *g);
 
 #endif /* __NVGPU_DEBUG_FIFO_H__ */
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
index 5d66cc08e..c8288fb47 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
@@ -46,9 +46,11 @@
 #include <nvgpu/gr/obj_ctx.h>
 #include <nvgpu/fence.h>
 #include <nvgpu/preempt.h>
-#include <nvgpu/profile.h>
+#include <nvgpu/swprofile.h>
 #include <nvgpu/nvgpu_init.h>
 
+#include <nvgpu/fifo/swprofile.h>
+
 #include "platform_gk20a.h"
 #include "ioctl_channel.h"
 #include "channel.h"
@@ -792,10 +794,11 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 {
 	struct nvgpu_channel_fence fence;
 	struct nvgpu_fence_type *fence_out;
-	struct nvgpu_profile *profile = NULL;
 	u32 submit_flags = 0;
 	int fd = -1;
 	struct gk20a *g = ch->g;
+	struct nvgpu_fifo *f = &g->fifo;
+	struct nvgpu_swprofiler *kickoff_profiler = &f->kickoff_profiler;
 	struct nvgpu_gpfifo_userdata userdata;
 	bool flag_fence_wait = (args->flags &
 			NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) != 0U;
@@ -807,8 +810,8 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	int ret = 0;
 	nvgpu_log_fn(g, " ");
 
-	profile = nvgpu_profile_acquire(ch->g);
-	nvgpu_profile_snapshot(profile, PROFILE_IOCTL_ENTRY);
+	nvgpu_swprofile_begin_sample(kickoff_profiler);
+	nvgpu_swprofile_snapshot(kickoff_profiler, PROF_KICKOFF_IOCTL_ENTRY);
 
 	if (nvgpu_channel_check_unserviceable(ch)) {
 		return -ETIMEDOUT;
@@ -846,7 +849,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 
 	ret = nvgpu_submit_channel_gpfifo_user(ch,
 			userdata, args->num_entries,
-			submit_flags, &fence, &fence_out, profile);
+			submit_flags, &fence, &fence_out, kickoff_profiler);
 
 	if (ret) {
 		if (fd != -1)
@@ -869,9 +872,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	}
 	nvgpu_fence_put(fence_out);
 
-	nvgpu_profile_snapshot(profile, PROFILE_IOCTL_EXIT);
-	if (profile)
-		nvgpu_profile_release(ch->g, profile);
+	nvgpu_swprofile_snapshot(kickoff_profiler, PROF_KICKOFF_IOCTL_EXIT);
 
 clean_up:
 	return ret;
diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export b/libs/dgpu/libnvgpu-drv-dgpu_safe.export
index 77af1078f..320cb676c 100644
--- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export
+++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export
@@ -725,6 +725,8 @@ nvgpu_spinlock_release
 nvgpu_strnadd_u32
 nvgpu_sw_quiesce
 nvgpu_sw_quiesce_remove_support
+nvgpu_swprofile_initialize
+nvgpu_swprofile_snapshot
 nvgpu_thread_create
 nvgpu_thread_create_priority
 nvgpu_thread_get_fault_injection
diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export
index 8f782f4d3..bc70c3833 100644
--- a/libs/igpu/libnvgpu-drv-igpu_safe.export
+++ b/libs/igpu/libnvgpu-drv-igpu_safe.export
@@ -740,6 +740,8 @@ nvgpu_spinlock_release
 nvgpu_strnadd_u32
 nvgpu_sw_quiesce
 nvgpu_sw_quiesce_remove_support
+nvgpu_swprofile_initialize
+nvgpu_swprofile_snapshot
 nvgpu_thread_create
 nvgpu_thread_create_priority
 nvgpu_thread_get_fault_injection