Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: move submit code to common

To finish OS unification of the submit path, move the gk20a_submit_channel_gpfifo* functions to a file that's accessible also outside Linux code. Also change the prefix of the submit functions from gk20a_ to nvgpu_.

Jira NVGPU-705

Change-Id: I8ca355d1eb69771fb016c7a21fc7f102ca7967d7
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1760421
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Commit 7998233b77 (parent 2ac6fb4253), committed by mobile promotions.
@@ -177,6 +177,7 @@ nvgpu-y += \
	common/clock_gating/gv11b_gating_reglist.o \
	common/sim.o \
	common/sim_pci.o \
	common/fifo/submit.o \
	gk20a/gk20a.o \
	gk20a/ce2_gk20a.o \
	gk20a/fifo_gk20a.o \
@@ -80,6 +80,7 @@ srcs := common/mm/nvgpu_allocator.c \
	common/clock_gating/gv11b_gating_reglist.c \
	common/clock_gating/gp106_gating_reglist.c \
	common/clock_gating/gv100_gating_reglist.c \
	common/fifo/submit.c \
	boardobj/boardobj.c \
	boardobj/boardobjgrp.c \
	boardobj/boardobjgrpmask.c \
drivers/gpu/nvgpu/common/fifo/submit.c (new file, 577 lines)
@@ -0,0 +1,577 @@
/*
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/channel.h>
#include <nvgpu/ltc.h>
#include <nvgpu/os_sched.h>

#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>

#include "gk20a/gk20a.h"
#include "gk20a/channel_gk20a.h"

#include <trace/events/gk20a.h>

/*
 * Handle the submit synchronization - pre-fences and post-fences.
 */
static int nvgpu_submit_prepare_syncs(struct channel_gk20a *c,
				struct nvgpu_channel_fence *fence,
				struct channel_gk20a_job *job,
				struct priv_cmd_entry **wait_cmd,
				struct priv_cmd_entry **incr_cmd,
				struct gk20a_fence **post_fence,
				bool register_irq,
				u32 flags)
{
	struct gk20a *g = c->g;
	bool need_sync_fence = false;
	bool new_sync_created = false;
	int wait_fence_fd = -1;
	int err = 0;
	bool need_wfi = !(flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI);
	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);

	if (g->aggressive_sync_destroy_thresh) {
		nvgpu_mutex_acquire(&c->sync_lock);
		if (!c->sync) {
			c->sync = gk20a_channel_sync_create(c, false);
			if (!c->sync) {
				err = -ENOMEM;
				nvgpu_mutex_release(&c->sync_lock);
				goto fail;
			}
			new_sync_created = true;
		}
		nvgpu_atomic_inc(&c->sync->refcount);
		nvgpu_mutex_release(&c->sync_lock);
	}

	if (g->ops.fifo.resetup_ramfc && new_sync_created) {
		err = g->ops.fifo.resetup_ramfc(c);
		if (err)
			goto fail;
	}

	/*
	 * Optionally insert syncpt/semaphore wait in the beginning of gpfifo
	 * submission when user requested and the wait hasn't expired.
	 */
	if (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) {
		int max_wait_cmds = c->deterministic ? 1 : 0;

		if (!pre_alloc_enabled)
			job->wait_cmd = nvgpu_kzalloc(g,
				sizeof(struct priv_cmd_entry));

		if (!job->wait_cmd) {
			err = -ENOMEM;
			goto fail;
		}

		if (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) {
			wait_fence_fd = fence->id;
			err = c->sync->wait_fd(c->sync, wait_fence_fd,
					job->wait_cmd, max_wait_cmds);
		} else {
			err = c->sync->wait_syncpt(c->sync, fence->id,
					fence->value,
					job->wait_cmd);
		}

		if (err)
			goto clean_up_wait_cmd;

		if (job->wait_cmd->valid)
			*wait_cmd = job->wait_cmd;
	}

	if ((flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) &&
	    (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE))
		need_sync_fence = true;

	/*
	 * Always generate an increment at the end of a GPFIFO submission. This
	 * is used to keep track of method completion for idle railgating. The
	 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
	 */
	job->post_fence = gk20a_alloc_fence(c);
	if (!job->post_fence) {
		err = -ENOMEM;
		goto clean_up_wait_cmd;
	}
	if (!pre_alloc_enabled)
		job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));

	if (!job->incr_cmd) {
		err = -ENOMEM;
		goto clean_up_post_fence;
	}

	if (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET)
		err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
				job->post_fence, need_wfi, need_sync_fence,
				register_irq);
	else
		err = c->sync->incr(c->sync, job->incr_cmd,
				job->post_fence, need_sync_fence,
				register_irq);
	if (!err) {
		*incr_cmd = job->incr_cmd;
		*post_fence = job->post_fence;
	} else
		goto clean_up_incr_cmd;

	return 0;

clean_up_incr_cmd:
	free_priv_cmdbuf(c, job->incr_cmd);
	if (!pre_alloc_enabled)
		job->incr_cmd = NULL;
clean_up_post_fence:
	gk20a_fence_put(job->post_fence);
	job->post_fence = NULL;
clean_up_wait_cmd:
	if (job->wait_cmd)
		free_priv_cmdbuf(c, job->wait_cmd);
	if (!pre_alloc_enabled)
		job->wait_cmd = NULL;
fail:
	*wait_cmd = NULL;
	return err;
}
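The helper above derives its behaviour purely from the submit flag bits and from whether the channel is deterministic. The short, self-contained sketch below (plain userspace C, not driver code) mirrors those decisions for a few flag combinations; the FENCE_WAIT and FENCE_GET bit positions match the header hunk later in this commit, while the SYNC_FENCE and SUPPRESS_WFI positions are placeholders chosen only for this demo.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits; only FENCE_WAIT/FENCE_GET values come from the
 * header change in this commit, the other two are demo placeholders. */
#define DEMO_FLAGS_FENCE_WAIT   (1 << 0)
#define DEMO_FLAGS_FENCE_GET    (1 << 1)
#define DEMO_FLAGS_SYNC_FENCE   (1 << 3)
#define DEMO_FLAGS_SUPPRESS_WFI (1 << 4)

static void decide(unsigned int flags, bool deterministic)
{
	/* Same tests as nvgpu_submit_prepare_syncs() performs above. */
	bool insert_wait = flags & DEMO_FLAGS_FENCE_WAIT;
	bool need_sync_fence = (flags & DEMO_FLAGS_FENCE_GET) &&
			       (flags & DEMO_FLAGS_SYNC_FENCE);
	bool need_wfi = !(flags & DEMO_FLAGS_SUPPRESS_WFI);
	int max_wait_cmds = deterministic ? 1 : 0;

	printf("flags=0x%x det=%d -> wait=%d sync_fence=%d wfi=%d max_wait_cmds=%d\n",
	       flags, deterministic, insert_wait, need_sync_fence, need_wfi,
	       max_wait_cmds);
}

int main(void)
{
	decide(DEMO_FLAGS_FENCE_WAIT | DEMO_FLAGS_SYNC_FENCE, false);
	decide(DEMO_FLAGS_FENCE_GET | DEMO_FLAGS_SYNC_FENCE, true);
	decide(DEMO_FLAGS_FENCE_GET | DEMO_FLAGS_SUPPRESS_WFI, false);
	return 0;
}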
static void nvgpu_submit_append_priv_cmdbuf(struct channel_gk20a *c,
		struct priv_cmd_entry *cmd)
{
	struct gk20a *g = c->g;
	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
	struct nvgpu_gpfifo_entry x = {
		.entry0 = u64_lo32(cmd->gva),
		.entry1 = u64_hi32(cmd->gva) |
			pbdma_gp_entry1_length_f(cmd->size)
	};

	nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
			&x, sizeof(x));

	if (cmd->mem->aperture == APERTURE_SYSMEM)
		trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
				(u32 *)cmd->mem->cpu_va + cmd->off);

	c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
}
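nvgpu_submit_append_priv_cmdbuf() packs a 64-bit GPU virtual address plus a length field into the two 32-bit words of a gpfifo entry. The standalone sketch below reproduces that split; the length shift of 10 bits is an assumption made only for illustration, the real encoding comes from pbdma_gp_entry1_length_f() in the generated hardware headers.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for u64_lo32()/u64_hi32() and
 * pbdma_gp_entry1_length_f(); the shift value is a demo assumption. */
#define DEMO_GP_ENTRY1_LENGTH_SHIFT 10

struct demo_gpfifo_entry {
	uint32_t entry0;
	uint32_t entry1;
};

static struct demo_gpfifo_entry demo_pack(uint64_t gva, uint32_t num_words)
{
	struct demo_gpfifo_entry e = {
		.entry0 = (uint32_t)(gva & 0xffffffffu),   /* low 32 bits  */
		.entry1 = (uint32_t)(gva >> 32) |          /* high 32 bits */
			  (num_words << DEMO_GP_ENTRY1_LENGTH_SHIFT),
	};
	return e;
}

int main(void)
{
	struct demo_gpfifo_entry e = demo_pack(0x1234abcd5678ULL, 32);

	printf("entry0=0x%08x entry1=0x%08x\n", e.entry0, e.entry1);
	return 0;
}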
static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c,
		struct nvgpu_gpfifo_userdata userdata,
		u32 num_entries)
{
	struct gk20a *g = c->g;
	struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va;
	u32 gpfifo_size = c->gpfifo.entry_num;
	u32 len = num_entries;
	u32 start = c->gpfifo.put;
	u32 end = start + len; /* exclusive */
	int err;

	if (end > gpfifo_size) {
		/* wrap-around */
		int length0 = gpfifo_size - start;
		int length1 = len - length0;

		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu + start, userdata,
				0, length0);
		if (err)
			return err;

		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu, userdata,
				length0, length1);
		if (err)
			return err;
	} else {
		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu + start, userdata,
				0, len);
		if (err)
			return err;
	}

	return 0;
}

static void nvgpu_submit_append_gpfifo_common(struct channel_gk20a *c,
		struct nvgpu_gpfifo_entry *src, u32 num_entries)
{
	struct gk20a *g = c->g;
	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
	/* in bytes */
	u32 gpfifo_size =
		c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo_entry);
	u32 len = num_entries * sizeof(struct nvgpu_gpfifo_entry);
	u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo_entry);
	u32 end = start + len; /* exclusive */

	if (end > gpfifo_size) {
		/* wrap-around */
		int length0 = gpfifo_size - start;
		int length1 = len - length0;
		struct nvgpu_gpfifo_entry *src2 = src + length0;

		nvgpu_mem_wr_n(g, gpfifo_mem, start, src, length0);
		nvgpu_mem_wr_n(g, gpfifo_mem, 0, src2, length1);
	} else {
		nvgpu_mem_wr_n(g, gpfifo_mem, start, src, len);
	}
}
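Both append helpers above handle the same ring-buffer wrap-around: when a write would run past the end of the gpfifo, it is split into two copies, one up to the end of the buffer and one restarting at offset zero. A minimal userspace model of that split (hypothetical 8-entry ring, plain memcpy standing in for copy_user_gpfifo()/nvgpu_mem_wr_n()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_RING_ENTRIES 8u  /* a power of two, like entry_num */

static uint32_t ring[DEMO_RING_ENTRIES];

/* Copy 'len' words into the ring starting at 'start', splitting the copy
 * when it would run past the end, just as the driver splits the gpfifo
 * write into two calls. */
static void demo_ring_write(uint32_t start, const uint32_t *src, uint32_t len)
{
	uint32_t end = start + len; /* exclusive */

	if (end > DEMO_RING_ENTRIES) {
		uint32_t length0 = DEMO_RING_ENTRIES - start;
		uint32_t length1 = len - length0;

		memcpy(&ring[start], src, length0 * sizeof(*src));
		memcpy(&ring[0], src + length0, length1 * sizeof(*src));
	} else {
		memcpy(&ring[start], src, len * sizeof(*src));
	}
}

int main(void)
{
	const uint32_t src[5] = { 1, 2, 3, 4, 5 };
	uint32_t i;

	demo_ring_write(6, src, 5); /* wraps: 2 entries at the end, 3 at 0 */

	for (i = 0; i < DEMO_RING_ENTRIES; i++)
		printf("ring[%u] = %u\n", i, ring[i]);
	return 0;
}

The only real difference between the two driver paths is that the user-direct variant counts in entries while the common variant measures offsets in bytes; the split logic is the same.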
/*
 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
 * splitting into two memcpys to handle wrap-around.
 */
static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c,
		struct nvgpu_gpfifo_entry *kern_gpfifo,
		struct nvgpu_gpfifo_userdata userdata,
		u32 num_entries)
{
	struct gk20a *g = c->g;
	int err;

	if (!kern_gpfifo && !c->gpfifo.pipe) {
		/*
		 * This path (from userspace to sysmem) is special in order to
		 * avoid two copies unnecessarily (from user to pipe, then from
		 * pipe to gpu sysmem buffer).
		 */
		err = nvgpu_submit_append_gpfifo_user_direct(c, userdata,
				num_entries);
		if (err)
			return err;
	} else if (!kern_gpfifo) {
		/* from userspace to vidmem, use the common path */
		err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata,
				0, num_entries);
		if (err)
			return err;

		nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe,
				num_entries);
	} else {
		/* from kernel to either sysmem or vidmem, don't need
		 * copy_user_gpfifo so use the common path */
		nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries);
	}

	trace_write_pushbuffers(c, num_entries);

	c->gpfifo.put = (c->gpfifo.put + num_entries) &
		(c->gpfifo.entry_num - 1);

	return 0;
}
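The put pointer above advances with a bitwise AND rather than a modulo, which only works because entry_num is a power of two; the same assumption underlies the "size N holds at most N-1 entries" check performed later in nvgpu_submit_channel_gpfifo(). The small self-contained check below reproduces just that arithmetic (nothing here is driver code):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	const unsigned int entry_num = 512; /* any power of two */
	const unsigned int extra_entries = 2;
	unsigned int num_entries = 509;
	unsigned int put = 0, i;

	/* Masking with (entry_num - 1) equals % entry_num when entry_num is
	 * a power of two; this is how c->gpfifo.put wraps. */
	for (i = 0; i < 3 * entry_num; i++)
		assert(((put + i) & (entry_num - 1)) == (put + i) % entry_num);

	/* A ring of entry_num slots holds at most entry_num - 1 entries
	 * (put == get must mean "empty"), and the kernel may wrap the user's
	 * entries with two private ones, hence the driver's check
	 * entry_num - 1 >= num_entries + extra_entries. */
	printf("%u user entries %s into a %u-entry gpfifo\n",
	       num_entries,
	       (entry_num - 1 >= num_entries + extra_entries) ?
			"fit" : "do not fit",
	       entry_num);
	return 0;
}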
static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile)
{
	struct gk20a *g = c->g;
	struct priv_cmd_entry *wait_cmd = NULL;
	struct priv_cmd_entry *incr_cmd = NULL;
	struct gk20a_fence *post_fence = NULL;
	struct channel_gk20a_job *job = NULL;
	/* we might need two extra gpfifo entries - one for pre fence
	 * and one for post fence. */
	const int extra_entries = 2;
	bool skip_buffer_refcounting = (flags &
			NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING);
	int err = 0;
	bool need_job_tracking;
	bool need_deferred_cleanup = false;

	if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
		return -ENODEV;

	if (c->has_timedout)
		return -ETIMEDOUT;

	if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
		return -ENOMEM;

	/* fifo not large enough for request. Return error immediately.
	 * Kernel can insert gpfifo entries before and after user gpfifos.
	 * So, add extra_entries in user request. Also, HW with fifo size N
	 * can accept only N-1 entries and so the below condition */
	if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
		nvgpu_err(g, "not enough gpfifo space allocated");
		return -ENOMEM;
	}

	if ((flags & (NVGPU_SUBMIT_FLAGS_FENCE_WAIT |
		      NVGPU_SUBMIT_FLAGS_FENCE_GET)) &&
	    !fence)
		return -EINVAL;

	/* an address space needs to have been bound at this point. */
	if (!gk20a_channel_as_bound(c)) {
		nvgpu_err(g,
			"not bound to an address space at time of gpfifo"
			" submission.");
		return -EINVAL;
	}

	gk20a_fifo_profile_snapshot(profile, PROFILE_ENTRY);

	/* update debug settings */
	nvgpu_ltc_sync_enabled(g);

	nvgpu_log_info(g, "channel %d", c->chid);

	/*
	 * Job tracking is necessary for any of the following conditions:
	 *  - pre- or post-fence functionality
	 *  - channel wdt
	 *  - GPU rail-gating with non-deterministic channels
	 *  - buffer refcounting
	 *
	 * If none of the conditions are met, then job tracking is not
	 * required and a fast submit can be done (ie. only need to write
	 * out userspace GPFIFO entries and update GP_PUT).
	 */
	need_job_tracking = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) ||
			(flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) ||
			c->timeout.enabled ||
			(g->can_railgate && !c->deterministic) ||
			!skip_buffer_refcounting;

	if (need_job_tracking) {
		bool need_sync_framework = false;

		/*
		 * If the channel is to have deterministic latency and
		 * job tracking is required, the channel must have
		 * pre-allocated resources. Otherwise, we fail the submit here
		 */
		if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
			return -EINVAL;

		need_sync_framework =
			gk20a_channel_sync_needs_sync_framework(g) ||
			(flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE &&
			 flags & NVGPU_SUBMIT_FLAGS_FENCE_GET);

		/*
		 * Deferred clean-up is necessary for any of the following
		 * conditions:
		 * - channel's deterministic flag is not set
		 * - dependency on sync framework, which could make the
		 *   behavior of the clean-up operation non-deterministic
		 *   (should not be performed in the submit path)
		 * - channel wdt
		 * - GPU rail-gating with non-deterministic channels
		 * - buffer refcounting
		 *
		 * If none of the conditions are met, then deferred clean-up
		 * is not required, and we clean-up one job-tracking
		 * resource in the submit path.
		 */
		need_deferred_cleanup = !c->deterministic ||
					need_sync_framework ||
					c->timeout.enabled ||
					(g->can_railgate &&
					 !c->deterministic) ||
					!skip_buffer_refcounting;

		/*
		 * For deterministic channels, we don't allow deferred clean_up
		 * processing to occur. In cases we hit this, we fail the submit
		 */
		if (c->deterministic && need_deferred_cleanup)
			return -EINVAL;

		if (!c->deterministic) {
			/*
			 * Get a power ref unless this is a deterministic
			 * channel that holds them during the channel lifetime.
			 * This one is released by gk20a_channel_clean_up_jobs,
			 * via syncpt or sema interrupt, whichever is used.
			 */
			err = gk20a_busy(g);
			if (err) {
				nvgpu_err(g,
					"failed to host gk20a to submit gpfifo");
				nvgpu_print_current(g, NULL, NVGPU_ERROR);
				return err;
			}
		}

		if (!need_deferred_cleanup) {
			/* clean up a single job */
			gk20a_channel_clean_up_jobs(c, false);
		}
	}

	/* Grab access to HW to deal with do_idle */
	if (c->deterministic)
		nvgpu_rwsem_down_read(&g->deterministic_busy);

	if (c->deterministic && c->deterministic_railgate_allowed) {
		/*
		 * Nope - this channel has dropped its own power ref. As
		 * deterministic submits don't hold power on per each submitted
		 * job like normal ones do, the GPU might railgate any time now
		 * and thus submit is disallowed.
		 */
		err = -EINVAL;
		goto clean_up;
	}

	trace_gk20a_channel_submit_gpfifo(g->name,
					c->chid,
					num_entries,
					flags,
					fence ? fence->id : 0,
					fence ? fence->value : 0);

	nvgpu_log_info(g, "pre-submit put %d, get %d, size %d",
		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);

	/*
	 * Make sure we have enough space for gpfifo entries. Check cached
	 * values first and then read from HW. If no space, return EAGAIN
	 * and let userspace decide to re-try request or not.
	 */
	if (nvgpu_gp_free_count(c) < num_entries + extra_entries) {
		if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) {
			err = -EAGAIN;
			goto clean_up;
		}
	}

	if (c->has_timedout) {
		err = -ETIMEDOUT;
		goto clean_up;
	}

	if (need_job_tracking) {
		err = channel_gk20a_alloc_job(c, &job);
		if (err)
			goto clean_up;

		err = nvgpu_submit_prepare_syncs(c, fence, job,
				&wait_cmd, &incr_cmd,
				&post_fence,
				need_deferred_cleanup,
				flags);
		if (err)
			goto clean_up_job;
	}

	gk20a_fifo_profile_snapshot(profile, PROFILE_JOB_TRACKING);

	if (wait_cmd)
		nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);

	err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata,
			num_entries);
	if (err)
		goto clean_up_job;

	/*
	 * And here's where we add the incr_cmd we generated earlier. It should
	 * always run!
	 */
	if (incr_cmd)
		nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);

	if (fence_out)
		*fence_out = gk20a_fence_get(post_fence);

	if (need_job_tracking)
		/* TODO! Check for errors... */
		gk20a_channel_add_job(c, job, skip_buffer_refcounting);
	gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND);

	g->ops.fifo.userd_gp_put(g, c);

	/* No hw access beyond this point */
	if (c->deterministic)
		nvgpu_rwsem_up_read(&g->deterministic_busy);

	trace_gk20a_channel_submitted_gpfifo(g->name,
				c->chid,
				num_entries,
				flags,
				post_fence ? post_fence->syncpt_id : 0,
				post_fence ? post_fence->syncpt_value : 0);

	nvgpu_log_info(g, "post-submit put %d, get %d, size %d",
		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);

	gk20a_fifo_profile_snapshot(profile, PROFILE_END);

	nvgpu_log_fn(g, "done");
	return err;

clean_up_job:
	channel_gk20a_free_job(c, job);
clean_up:
	nvgpu_log_fn(g, "fail");
	gk20a_fence_put(post_fence);
	if (c->deterministic)
		nvgpu_rwsem_up_read(&g->deterministic_busy);
	else if (need_deferred_cleanup)
		gk20a_idle(g);

	return err;
}
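The long comment blocks in the function above spell out when full job tracking and deferred clean-up are required. The throwaway program below simply re-evaluates the need_job_tracking condition for a few scenarios so the fast-submit case (no tracking, just write entries and bump GP_PUT) is easy to see; the bit chosen for SKIP_BUFFER_REFCOUNTING is a placeholder for this demo only.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_FENCE_WAIT  (1 << 0) /* matches the header hunk below */
#define DEMO_FENCE_GET   (1 << 1) /* matches the header hunk below */
#define DEMO_SKIP_REFCNT (1 << 5) /* placeholder bit for the demo */

struct demo_case {
	unsigned int flags;
	bool wdt_enabled;   /* c->timeout.enabled */
	bool can_railgate;  /* g->can_railgate */
	bool deterministic; /* c->deterministic */
};

static void evaluate(const struct demo_case *t)
{
	bool skip_refcnt = t->flags & DEMO_SKIP_REFCNT;

	/* Same condition as need_job_tracking in the submit path above. */
	bool need_job_tracking =
		(t->flags & DEMO_FENCE_WAIT) ||
		(t->flags & DEMO_FENCE_GET) ||
		t->wdt_enabled ||
		(t->can_railgate && !t->deterministic) ||
		!skip_refcnt;

	printf("flags=0x%x wdt=%d railgate=%d det=%d -> %s\n",
	       t->flags, t->wdt_enabled, t->can_railgate, t->deterministic,
	       need_job_tracking ? "job tracking" : "fast submit");
}

int main(void)
{
	const struct demo_case cases[] = {
		/* deterministic channel, no fences, wdt off, refcounting
		 * skipped: the fast path */
		{ DEMO_SKIP_REFCNT, false, true, true },
		/* a post-fence is requested: tracking is unavoidable */
		{ DEMO_FENCE_GET | DEMO_SKIP_REFCNT, false, false, false },
		/* the watchdog alone forces tracking too */
		{ DEMO_SKIP_REFCNT, true, false, false },
	};
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		evaluate(&cases[i]);
	return 0;
}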
int nvgpu_submit_channel_gpfifo_user(struct channel_gk20a *c,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile)
{
	return nvgpu_submit_channel_gpfifo(c, NULL, userdata, num_entries,
			flags, fence, fence_out, profile);
}

int nvgpu_submit_channel_gpfifo_kernel(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out)
{
	struct nvgpu_gpfifo_userdata userdata = { NULL, NULL };

	return nvgpu_submit_channel_gpfifo(c, gpfifo, userdata, num_entries,
			flags, fence, fence_out, NULL);
}
@@ -47,7 +47,7 @@ struct fifo_profile_gk20a;
#define NVGPU_GPFIFO_FLAGS_REPLAYABLE_FAULTS_ENABLE (1 << 2)
#define NVGPU_GPFIFO_FLAGS_USERMODE_SUPPORT (1 << 3)

/* Flags to be passed to gk20a_submit_channel_gpfifo() */
/* Flags to be passed to nvgpu_submit_channel_gpfifo() */
#define NVGPU_SUBMIT_FLAGS_FENCE_WAIT (1 << 0)
#define NVGPU_SUBMIT_FLAGS_FENCE_GET (1 << 1)
#define NVGPU_SUBMIT_FLAGS_HW_FORMAT (1 << 2)
@@ -150,6 +150,7 @@ enum nvgpu_unit;
enum nvgpu_flush_op;

struct _resmgr_context;
struct nvgpu_gpfifo_entry;

struct nvgpu_gpfifo_userdata {
	struct nvgpu_gpfifo_entry __user *entries;
drivers/gpu/nvgpu/include/nvgpu/channel.h (new file, 52 lines)
@@ -0,0 +1,52 @@
/*
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/types.h>

#include "gk20a/gk20a.h"

struct nvgpu_channel_fence;
struct gk20a_fence;
struct fifo_profile_gk20a;

int nvgpu_submit_channel_gpfifo_user(struct channel_gk20a *c,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile);

int nvgpu_submit_channel_gpfifo_kernel(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out);

#ifdef CONFIG_DEBUG_FS
void trace_write_pushbuffers(struct channel_gk20a *c, int count);
#else
static inline void trace_write_pushbuffers(struct channel_gk20a *c, int count)
{
}
#endif
@@ -32,6 +32,7 @@
#include <nvgpu/bug.h>
#include <nvgpu/firmware.h>
#include <nvgpu/os_sched.h>
#include <nvgpu/channel.h>

#include <nvgpu/linux/vm.h>
@@ -783,7 +784,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
		return -ENOSYS;
	}

	return gk20a_submit_channel_gpfifo_kernel(cde_ctx->ch, gpfifo,
	return nvgpu_submit_channel_gpfifo_kernel(cde_ctx->ch, gpfifo,
			num_entries, flags, fence, fence_out);
}
@@ -15,6 +15,7 @@
 */

#include <nvgpu/types.h>
#include <nvgpu/channel.h>

#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
@@ -130,7 +131,7 @@ int gk20a_ce_execute_ops(struct gk20a *g,

	nvgpu_smp_wmb();

	ret = gk20a_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo,
	ret = nvgpu_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo,
			1, submit_flags, &fence, &ce_cmd_buf_fence_out);

	if (!ret) {
@@ -16,7 +16,6 @@

#include <nvgpu/enabled.h>
#include <nvgpu/debug.h>
#include <nvgpu/ltc.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/os_sched.h>
@@ -489,11 +488,9 @@ static void trace_write_pushbuffer(struct channel_gk20a *c,
		dma_buf_vunmap(dmabuf, mem);
	}
}
#endif

static void trace_write_pushbuffers(struct channel_gk20a *c, u32 count)
void trace_write_pushbuffers(struct channel_gk20a *c, u32 count)
{
#ifdef CONFIG_DEBUG_FS
	struct nvgpu_gpfifo_entry *gp = c->gpfifo.mem.cpu_va;
	u32 n = c->gpfifo.entry_num;
	u32 start = c->gpfifo.put;
@@ -507,549 +504,5 @@ static void trace_write_pushbuffers(struct channel_gk20a *c, u32 count)

	for (i = 0; i < count; i++)
		trace_write_pushbuffer(c, &gp[(start + i) % n]);
}
#endif
}
[The remaining removed lines of this hunk delete the old in-place submit implementation: gk20a_submit_prepare_syncs(), gk20a_submit_append_priv_cmdbuf(), nvgpu_submit_append_gpfifo_user_direct(), nvgpu_submit_append_gpfifo_common(), nvgpu_submit_append_gpfifo(), gk20a_submit_channel_gpfifo(), gk20a_submit_channel_gpfifo_user() and gk20a_submit_channel_gpfifo_kernel(). Apart from the gk20a_ prefixes on the submit functions, this is the same code that now appears in full above as drivers/gpu/nvgpu/common/fifo/submit.c.]
@@ -84,19 +84,4 @@ struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
		int runlist_id,
		bool is_privileged_channel);

int gk20a_submit_channel_gpfifo_user(struct channel_gk20a *c,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile);

int gk20a_submit_channel_gpfifo_kernel(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out);

#endif /* __NVGPU_CHANNEL_H__ */
@@ -34,6 +34,7 @@
#include <nvgpu/barrier.h>
#include <nvgpu/nvhost.h>
#include <nvgpu/os_sched.h>
#include <nvgpu/channel.h>

#include "gk20a/gk20a.h"
#include "gk20a/dbg_gpu_gk20a.h"
@@ -803,7 +804,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(
			(uintptr_t)args->gpfifo;
	userdata.context = NULL;

	ret = gk20a_submit_channel_gpfifo_user(ch,
	ret = nvgpu_submit_channel_gpfifo_user(ch,
			userdata, args->num_entries,
			submit_flags, &fence, &fence_out, profile);
@@ -646,4 +646,8 @@ DEFINE_EVENT(gk20a_cde, gk20a_cde_finished_ctx_cb,
#define trace_gk20a_ltc_cbc_ctrl_start(arg...) ((void)(NULL))
#define trace_gk20a_ltc_cbc_ctrl_done(arg...) ((void)(NULL))

#define trace_gk20a_channel_submit_gpfifo(arg...) ((void)(NULL))
#define trace_gk20a_channel_submitted_gpfifo(arg...) ((void)(NULL))
#define trace_gk20a_push_cmdbuf(arg...) ((void)(NULL))

#endif