gpu: nvgpu: add support for pre-allocated resources

Add support for pre-allocation of job tracking resources
with a new (extended) ioctl. The goal is to avoid dynamic memory
allocation in the submit path. This patch does the following:

1) Introduces a new ioctl, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX,
which enables pre-allocation of tracking resources per job:
a) 2x priv_cmd_entry
b) 2x gk20a_fence

2) Implements a circular ring buffer for job
tracking to avoid lock contention between the producer
(submitter) and the consumer (clean-up)

Bug 1795076

Change-Id: I6b52e5c575871107ff380f9a5790f440a6969347
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1203300
(cherry picked from commit 9fd270c22b860935dffe244753dabd87454bef39)
Reviewed-on: http://git-master/r/1223934
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Authored by Sachit Kadle on 2016-08-15 14:32:39 -07:00
Committed by mobile promotions
parent 63e8592e06
commit 733fb79b39
7 changed files with 463 additions and 66 deletions
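
For reference, a minimal userspace sketch of the new ioctl follows. It is
illustrative only and not part of this change: the channel file descriptor
and the chosen sizes (1024 entries, 32 in-flight jobs) are assumptions,
while the struct fields and the ioctl number come from the uapi additions
in the diff below.

/*
 * Illustrative userspace sketch (not part of this change): allocate a
 * channel gpfifo with pre-allocated job-tracking resources. The channel
 * fd and the chosen sizes are assumptions for the example.
 */
#include <string.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int alloc_gpfifo_with_prealloc(int channel_fd)
{
	struct nvgpu_alloc_gpfifo_ex_args args;

	memset(&args, 0, sizeof(args));
	args.num_entries = 1024;      /* gpfifo entries */
	args.num_inflight_jobs = 32;  /* pre-allocate tracking for 32 jobs */
	args.flags = 0;               /* e.g. NVGPU_ALLOC_GPFIFO_FLAGS_VPR_ENABLED */

	if (ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX, &args) < 0) {
		perror("NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX");
		return -1;
	}
	return 0;
}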


@@ -1126,9 +1126,9 @@ __releases(&cde_app->mutex)
struct gk20a_cde_app *cde_app = &g->cde_app;
bool channel_idle;
spin_lock(&ch->jobs_lock);
channel_idle = list_empty(&ch->jobs);
spin_unlock(&ch->jobs_lock);
channel_gk20a_joblist_lock(ch);
channel_idle = channel_gk20a_joblist_is_empty(ch);
channel_gk20a_joblist_unlock(ch);
if (!channel_idle)
return;
@@ -1207,7 +1207,7 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
/* allocate gpfifo (1024 should be more than enough) */
err = gk20a_alloc_channel_gpfifo(ch,
&(struct nvgpu_alloc_gpfifo_args){1024, 0});
&(struct nvgpu_alloc_gpfifo_ex_args){1024, 0, 0, {}});
if (err) {
gk20a_warn(cde_ctx->dev, "cde: unable to allocate gpfifo");
goto err_alloc_gpfifo;


@@ -126,9 +126,9 @@ static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
bool channel_idle;
u32 event;
spin_lock(&ch->jobs_lock);
channel_idle = list_empty(&ch->jobs);
spin_unlock(&ch->jobs_lock);
channel_gk20a_joblist_lock(ch);
channel_idle = channel_gk20a_joblist_is_empty(ch);
channel_gk20a_joblist_unlock(ch);
if (!channel_idle)
return;
@@ -462,7 +462,7 @@ u32 gk20a_ce_create_context_with_cb(struct device *dev,
/* allocate gpfifo (1024 should be more than enough) */
err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
&(struct nvgpu_alloc_gpfifo_args){1024, 0});
&(struct nvgpu_alloc_gpfifo_ex_args){1024, 0, 0, {}});
if (err) {
gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
goto end;


@@ -26,6 +26,7 @@
#include <linux/anon_inodes.h>
#include <linux/dma-buf.h>
#include <linux/vmalloc.h>
#include <linux/circ_buf.h>
#include "debug_gk20a.h"
#include "ctxsw_trace_gk20a.h"
@@ -55,6 +56,15 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
static void channel_gk20a_free_prealloc_resources(struct channel_gk20a *c);
static void channel_gk20a_joblist_add(struct channel_gk20a *c,
struct channel_gk20a_job *job);
static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
struct channel_gk20a_job *job);
static struct channel_gk20a_job *channel_gk20a_joblist_peek(
struct channel_gk20a *c);
static int channel_gk20a_commit_userd(struct channel_gk20a *c);
static int channel_gk20a_setup_userd(struct channel_gk20a *c);
@@ -460,6 +470,7 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
{
struct channel_gk20a_job *job, *n;
bool released_job_semaphore = false;
bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(ch);
gk20a_channel_cancel_job_clean_up(ch, true);
@@ -471,14 +482,37 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
/* release all job semaphores (applies only to jobs that use
semaphore synchronization) */
spin_lock(&ch->jobs_lock);
list_for_each_entry_safe(job, n, &ch->jobs, list) {
channel_gk20a_joblist_lock(ch);
if (pre_alloc_enabled) {
int tmp_get = ch->joblist.pre_alloc.get;
int put = ch->joblist.pre_alloc.put;
/*
* ensure put is read before any subsequent reads.
* see corresponding wmb in gk20a_channel_add_job()
*/
rmb();
while (tmp_get != put) {
job = &ch->joblist.pre_alloc.jobs[tmp_get];
if (job->post_fence->semaphore) {
gk20a_semaphore_release(job->post_fence->semaphore);
gk20a_semaphore_release(
job->post_fence->semaphore);
released_job_semaphore = true;
}
tmp_get = (tmp_get + 1) % ch->joblist.pre_alloc.length;
}
} else {
list_for_each_entry_safe(job, n,
&ch->joblist.dynamic.jobs, list) {
if (job->post_fence->semaphore) {
gk20a_semaphore_release(
job->post_fence->semaphore);
released_job_semaphore = true;
}
}
spin_unlock(&ch->jobs_lock);
}
channel_gk20a_joblist_unlock(ch);
if (released_job_semaphore)
wake_up_interruptible_all(&ch->semaphore_wq);
@@ -511,9 +545,9 @@ int gk20a_wait_channel_idle(struct channel_gk20a *ch)
msecs_to_jiffies(gk20a_get_gr_idle_timeout(ch->g));
do {
spin_lock(&ch->jobs_lock);
channel_idle = list_empty(&ch->jobs);
spin_unlock(&ch->jobs_lock);
channel_gk20a_joblist_lock(ch);
channel_idle = channel_gk20a_joblist_is_empty(ch);
channel_gk20a_joblist_unlock(ch);
if (channel_idle)
break;
@@ -1016,6 +1050,10 @@ unbind:
mutex_unlock(&g->dbg_sessions_lock);
/* free pre-allocated resources, if applicable */
if (channel_gk20a_is_prealloc_enabled(ch))
channel_gk20a_free_prealloc_resources(ch);
/* make sure we catch accesses of unopened channels in case
* there's non-refcounted channel pointers hanging around */
ch->g = NULL;
@@ -1422,7 +1460,10 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
/* we already handled q->put + size > q->size so BUG_ON this */
BUG_ON(q->put > q->size);
/* commit the previous writes before making the entry valid */
/*
* commit the previous writes before making the entry valid.
* see the corresponding rmb() in gk20a_free_priv_cmdbuf().
*/
wmb();
e->valid = true;
@@ -1436,26 +1477,222 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
static void free_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *e)
{
if (channel_gk20a_is_prealloc_enabled(c))
memset(e, 0, sizeof(struct priv_cmd_entry));
else
kfree(e);
}
static struct channel_gk20a_job *channel_gk20a_alloc_job(
struct channel_gk20a *c)
static int channel_gk20a_alloc_job(struct channel_gk20a *c,
struct channel_gk20a_job **job_out)
{
struct channel_gk20a_job *job = NULL;
int err = 0;
job = kzalloc(sizeof(*job), GFP_KERNEL);
return job;
if (channel_gk20a_is_prealloc_enabled(c)) {
int put = c->joblist.pre_alloc.put;
int get = c->joblist.pre_alloc.get;
/*
* ensure all subsequent reads happen after reading get.
* see corresponding wmb in gk20a_channel_clean_up_jobs()
*/
rmb();
if (CIRC_SPACE(put, get, c->joblist.pre_alloc.length))
*job_out = &c->joblist.pre_alloc.jobs[put];
else {
gk20a_warn(dev_from_gk20a(c->g),
"out of job ringbuffer space\n");
err = -EAGAIN;
}
} else {
*job_out = kzalloc(sizeof(struct channel_gk20a_job),
GFP_KERNEL);
if (!*job_out)
err = -ENOMEM;
}
return err;
}
static void channel_gk20a_free_job(struct channel_gk20a *c,
struct channel_gk20a_job *job)
{
/*
* In case of pre_allocated jobs, we need to clean out
* the job but maintain the pointers to the priv_cmd_entry,
* since they're inherently tied to the job node.
*/
if (channel_gk20a_is_prealloc_enabled(c)) {
struct priv_cmd_entry *wait_cmd = job->wait_cmd;
struct priv_cmd_entry *incr_cmd = job->incr_cmd;
memset(job, 0, sizeof(*job));
job->wait_cmd = wait_cmd;
job->incr_cmd = incr_cmd;
} else
kfree(job);
}
void channel_gk20a_joblist_lock(struct channel_gk20a *c)
{
if (channel_gk20a_is_prealloc_enabled(c))
mutex_lock(&c->joblist.pre_alloc.read_lock);
else
spin_lock(&c->joblist.dynamic.lock);
}
void channel_gk20a_joblist_unlock(struct channel_gk20a *c)
{
if (channel_gk20a_is_prealloc_enabled(c))
mutex_unlock(&c->joblist.pre_alloc.read_lock);
else
spin_unlock(&c->joblist.dynamic.lock);
}
static struct channel_gk20a_job *channel_gk20a_joblist_peek(
struct channel_gk20a *c)
{
int get;
struct channel_gk20a_job *job = NULL;
if (channel_gk20a_is_prealloc_enabled(c)) {
if (!channel_gk20a_joblist_is_empty(c)) {
get = c->joblist.pre_alloc.get;
job = &c->joblist.pre_alloc.jobs[get];
}
} else {
if (!list_empty(&c->joblist.dynamic.jobs))
job = list_first_entry(&c->joblist.dynamic.jobs,
struct channel_gk20a_job, list);
}
return job;
}
static void channel_gk20a_joblist_add(struct channel_gk20a *c,
struct channel_gk20a_job *job)
{
if (channel_gk20a_is_prealloc_enabled(c)) {
c->joblist.pre_alloc.put = (c->joblist.pre_alloc.put + 1) %
(c->joblist.pre_alloc.length);
} else {
list_add_tail(&job->list, &c->joblist.dynamic.jobs);
}
}
static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
struct channel_gk20a_job *job)
{
if (channel_gk20a_is_prealloc_enabled(c)) {
c->joblist.pre_alloc.get = (c->joblist.pre_alloc.get + 1) %
(c->joblist.pre_alloc.length);
} else {
list_del_init(&job->list);
}
}
bool channel_gk20a_joblist_is_empty(struct channel_gk20a *c)
{
if (channel_gk20a_is_prealloc_enabled(c)) {
int get = c->joblist.pre_alloc.get;
int put = c->joblist.pre_alloc.put;
return !(CIRC_CNT(put, get, c->joblist.pre_alloc.length));
}
return list_empty(&c->joblist.dynamic.jobs);
}
bool channel_gk20a_is_prealloc_enabled(struct channel_gk20a *c)
{
bool pre_alloc_enabled = c->joblist.pre_alloc.enabled;
rmb();
return pre_alloc_enabled;
}
static int channel_gk20a_prealloc_resources(struct channel_gk20a *c,
unsigned int num_jobs)
{
int i, err;
size_t size;
struct priv_cmd_entry *entries = NULL;
if (channel_gk20a_is_prealloc_enabled(c) || !num_jobs)
return -EINVAL;
/*
* pre-allocate the job list.
* since vmalloc takes an unsigned long, we need
* to make sure we don't hit an overflow condition
*/
size = sizeof(struct channel_gk20a_job);
if (num_jobs <= ULONG_MAX / size)
c->joblist.pre_alloc.jobs = vzalloc(num_jobs * size);
if (!c->joblist.pre_alloc.jobs) {
err = -ENOMEM;
goto clean_up;
}
/*
* pre-allocate 2x priv_cmd_entry for each job up front.
* since vmalloc takes an unsigned long, we need
* to make sure we don't hit an overflow condition
*/
size = sizeof(struct priv_cmd_entry);
if (num_jobs <= ULONG_MAX / (size << 1))
entries = vzalloc((num_jobs << 1) * size);
if (!entries) {
err = -ENOMEM;
goto clean_up_joblist;
}
for (i = 0; i < num_jobs; i++) {
c->joblist.pre_alloc.jobs[i].wait_cmd = &entries[i];
c->joblist.pre_alloc.jobs[i].incr_cmd =
&entries[i + num_jobs];
}
/* pre-allocate a fence pool */
err = gk20a_alloc_fence_pool(c, num_jobs);
if (err)
goto clean_up_priv_cmd;
c->joblist.pre_alloc.length = num_jobs;
/*
* commit the previous writes before setting the flag.
* see corresponding rmb in channel_gk20a_is_prealloc_enabled()
*/
wmb();
c->joblist.pre_alloc.enabled = true;
return 0;
clean_up_priv_cmd:
vfree(entries);
clean_up_joblist:
vfree(c->joblist.pre_alloc.jobs);
clean_up:
memset(&c->joblist.pre_alloc, 0, sizeof(c->joblist.pre_alloc));
return err;
}
static void channel_gk20a_free_prealloc_resources(struct channel_gk20a *c)
{
vfree(c->joblist.pre_alloc.jobs[0].wait_cmd);
vfree(c->joblist.pre_alloc.jobs);
gk20a_free_fence_pool(c);
/*
* commit the previous writes before disabling the flag.
* see corresponding rmb in channel_gk20a_is_prealloc_enabled()
*/
wmb();
c->joblist.pre_alloc.enabled = false;
}
int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_alloc_gpfifo_args *args)
struct nvgpu_alloc_gpfifo_ex_args *args)
{
struct gk20a *g = c->g;
struct device *d = dev_from_gk20a(g);
@@ -1539,19 +1776,30 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
/* TBD: setup engine contexts */
err = channel_gk20a_alloc_priv_cmdbuf(c);
if (args->num_inflight_jobs) {
err = channel_gk20a_prealloc_resources(c,
args->num_inflight_jobs);
if (err)
goto clean_up_sync;
}
err = channel_gk20a_alloc_priv_cmdbuf(c);
if (err)
goto clean_up_prealloc;
err = channel_gk20a_update_runlist(c, true);
if (err)
goto clean_up_sync;
goto clean_up_priv_cmd;
g->ops.fifo.bind_channel(c);
gk20a_dbg_fn("done");
return 0;
clean_up_priv_cmd:
channel_gk20a_free_priv_cmdbuf(c);
clean_up_prealloc:
channel_gk20a_free_prealloc_resources(c);
clean_up_sync:
gk20a_channel_sync_destroy(c->sync);
c->sync = NULL;
@@ -1878,6 +2126,7 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
struct vm_gk20a *vm = c->vm;
struct mapped_buffer_node **mapped_buffers = NULL;
int err = 0, num_mapped_buffers = 0;
bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
/* job needs reference to this vm (released in channel_update) */
gk20a_vm_get(vm);
@@ -1898,9 +2147,19 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
gk20a_channel_timeout_start(c, job);
spin_lock(&c->jobs_lock);
list_add_tail(&job->list, &c->jobs);
spin_unlock(&c->jobs_lock);
if (!pre_alloc_enabled)
channel_gk20a_joblist_lock(c);
/*
* ensure all pending write complete before adding to the list.
* see corresponding rmb in gk20a_channel_clean_up_jobs() &
* gk20a_channel_abort_clean_up()
*/
wmb();
channel_gk20a_joblist_add(c, job);
if (!pre_alloc_enabled)
channel_gk20a_joblist_unlock(c);
} else {
err = -ETIMEDOUT;
goto err_put_buffers;
@@ -1945,14 +2204,20 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
while (1) {
bool completed;
spin_lock(&c->jobs_lock);
if (list_empty(&c->jobs)) {
spin_unlock(&c->jobs_lock);
channel_gk20a_joblist_lock(c);
if (channel_gk20a_joblist_is_empty(c)) {
channel_gk20a_joblist_unlock(c);
break;
}
job = list_first_entry(&c->jobs,
struct channel_gk20a_job, list);
spin_unlock(&c->jobs_lock);
/*
* ensure that all subsequent reads occur after checking
* that we have a valid node. see corresponding wmb in
* gk20a_channel_add_job().
*/
rmb();
job = channel_gk20a_joblist_peek(c);
channel_gk20a_joblist_unlock(c);
completed = gk20a_fence_is_expired(job->post_fence);
if (!completed) {
@@ -1998,9 +2263,14 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
* so this wouldn't get freed here. */
gk20a_channel_put(c);
spin_lock(&c->jobs_lock);
list_del_init(&job->list);
spin_unlock(&c->jobs_lock);
/*
* ensure all pending writes complete before deleting the node.
* see corresponding rmb in channel_gk20a_alloc_job().
*/
wmb();
channel_gk20a_joblist_lock(c);
channel_gk20a_joblist_delete(c, job);
channel_gk20a_joblist_unlock(c);
channel_gk20a_free_job(c, job);
job_finished = 1;
@@ -2160,6 +2430,7 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
int wait_fence_fd = -1;
int err = 0;
bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
/*
* If user wants to always allocate sync_fence_fds then respect that;
@@ -2197,9 +2468,10 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
* this condition.
*/
if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
job->pre_fence = gk20a_alloc_fence(c);
if (!pre_alloc_enabled)
job->wait_cmd = kzalloc(sizeof(struct priv_cmd_entry),
GFP_KERNEL);
job->pre_fence = gk20a_alloc_fence(c);
if (!job->wait_cmd || !job->pre_fence) {
err = -ENOMEM;
@@ -2233,8 +2505,10 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
* is used to keep track of method completion for idle railgating. The
* sync_pt/semaphore PB is added to the GPFIFO later on in submit.
*/
job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
job->post_fence = gk20a_alloc_fence(c);
if (!pre_alloc_enabled)
job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry),
GFP_KERNEL);
if (!job->incr_cmd || !job->post_fence) {
err = -ENOMEM;
@@ -2256,15 +2530,17 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
return 0;
clean_up_post_fence:
gk20a_free_priv_cmdbuf(c, job->incr_cmd);
gk20a_fence_put(job->post_fence);
job->incr_cmd = NULL;
job->post_fence = NULL;
free_priv_cmdbuf(c, job->incr_cmd);
if (!pre_alloc_enabled)
job->incr_cmd = NULL;
clean_up_pre_fence:
gk20a_free_priv_cmdbuf(c, job->wait_cmd);
gk20a_fence_put(job->pre_fence);
job->wait_cmd = NULL;
job->pre_fence = NULL;
free_priv_cmdbuf(c, job->wait_cmd);
if (!pre_alloc_enabled)
job->wait_cmd = NULL;
*wait_cmd = NULL;
*pre_fence = NULL;
fail:
@@ -2388,11 +2664,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
}
if (need_job_tracking) {
job = channel_gk20a_alloc_job(c);
if (!job) {
err = -ENOMEM;
err = channel_gk20a_alloc_job(c, &job);
if (err)
goto clean_up;
}
err = gk20a_submit_prepare_syncs(c, fence, job,
&wait_cmd, &incr_cmd,
@@ -2463,13 +2737,14 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
init_waitqueue_head(&c->ref_count_dec_wq);
mutex_init(&c->ioctl_lock);
mutex_init(&c->error_notifier_mutex);
spin_lock_init(&c->jobs_lock);
spin_lock_init(&c->joblist.dynamic.lock);
mutex_init(&c->joblist.pre_alloc.read_lock);
raw_spin_lock_init(&c->timeout.lock);
mutex_init(&c->sync_lock);
INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_jobs);
mutex_init(&c->clean_up.lock);
INIT_LIST_HEAD(&c->jobs);
INIT_LIST_HEAD(&c->joblist.dynamic.jobs);
#if defined(CONFIG_GK20A_CYCLE_STATS)
mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
mutex_init(&c->cs_client_mutex);
@@ -3119,7 +3394,7 @@ long gk20a_channel_ioctl(struct file *filp,
(struct nvgpu_free_obj_ctx_args *)buf);
gk20a_idle(dev);
break;
case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO:
case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX:
err = gk20a_busy(dev);
if (err) {
dev_err(dev,
@@ -3128,9 +3403,34 @@ long gk20a_channel_ioctl(struct file *filp,
break;
}
err = gk20a_alloc_channel_gpfifo(ch,
(struct nvgpu_alloc_gpfifo_args *)buf);
(struct nvgpu_alloc_gpfifo_ex_args *)buf);
gk20a_idle(dev);
break;
case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO:
{
struct nvgpu_alloc_gpfifo_ex_args alloc_gpfifo_ex_args;
struct nvgpu_alloc_gpfifo_args *alloc_gpfifo_args =
(struct nvgpu_alloc_gpfifo_args *)buf;
err = gk20a_busy(dev);
if (err) {
dev_err(dev,
"%s: failed to host gk20a for ioctl cmd: 0x%x",
__func__, cmd);
break;
}
/* prepare new args structure */
memset(&alloc_gpfifo_ex_args, 0,
sizeof(struct nvgpu_alloc_gpfifo_ex_args));
alloc_gpfifo_ex_args.num_entries =
alloc_gpfifo_args->num_entries;
alloc_gpfifo_ex_args.flags = alloc_gpfifo_args->flags;
err = gk20a_alloc_channel_gpfifo(ch, &alloc_gpfifo_ex_args);
gk20a_idle(dev);
break;
}
case NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO:
err = gk20a_ioctl_channel_submit_gpfifo(ch,
(struct nvgpu_submit_gpfifo_args *)buf);
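
To recap the scheme above: in the pre-allocated case the submitter only ever
advances put and the clean-up worker only ever advances get, so the two sides
do not contend on a lock; a wmb() on the writing side pairs with an rmb() on
the reading side. Below is a condensed, hypothetical sketch of that
single-producer/single-consumer pattern (names such as demo_ring are not from
the driver; note that CIRC_SPACE()/CIRC_CNT() from <linux/circ_buf.h> assume
a power-of-two length).

/*
 * Condensed sketch of the single-producer/single-consumer ring used for
 * pre-allocated job tracking. Hypothetical names; the real logic lives in
 * channel_gk20a_alloc_job(), channel_gk20a_joblist_add()/delete() and
 * gk20a_channel_clean_up_jobs() above. CIRC_SPACE()/CIRC_CNT() assume the
 * length is a power of two.
 */
#include <linux/circ_buf.h>
#include <linux/errno.h>
#include <asm/barrier.h>

struct demo_ring {
	unsigned int put;	/* advanced only by the submitter (producer) */
	unsigned int get;	/* advanced only by the clean-up worker (consumer) */
	unsigned int length;	/* number of slots, power of two */
};

static int demo_ring_produce(struct demo_ring *r)
{
	unsigned int put = r->put;
	unsigned int get = r->get;

	/* read the slot state only after get; pairs with the consumer's wmb() */
	rmb();
	if (!CIRC_SPACE(put, get, r->length))
		return -EAGAIN;

	/* ... fill the slot at index put ... */

	/* publish the slot contents before advancing put */
	wmb();
	r->put = (put + 1) % r->length;
	return 0;
}

static int demo_ring_consume(struct demo_ring *r)
{
	unsigned int put = r->put;
	unsigned int get = r->get;

	/* read the slot contents only after put; pairs with the producer's wmb() */
	rmb();
	if (!CIRC_CNT(put, get, r->length))
		return -EAGAIN;

	/* ... process and recycle the slot at index get ... */

	/* complete any writes that recycle the slot before advancing get */
	wmb();
	r->get = (get + 1) % r->length;
	return 0;
}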


@@ -70,6 +70,22 @@ struct channel_gk20a_job {
struct list_head list;
};
struct channel_gk20a_joblist {
struct {
bool enabled;
unsigned int length;
unsigned int put;
unsigned int get;
struct channel_gk20a_job *jobs;
struct mutex read_lock;
} pre_alloc;
struct {
struct list_head jobs;
spinlock_t lock;
} dynamic;
};
struct channel_gk20a_timeout {
struct delayed_work wq;
raw_spinlock_t lock;
@@ -115,6 +131,7 @@ struct channel_gk20a {
bool bound;
bool first_init;
bool vpr;
bool no_block;
bool cde;
pid_t pid;
pid_t tgid;
@@ -123,8 +140,8 @@ struct channel_gk20a {
int tsgid;
struct list_head ch_entry; /* channel's entry in TSG */
struct list_head jobs;
spinlock_t jobs_lock;
struct channel_gk20a_joblist joblist;
struct gk20a_allocator fence_allocator;
struct vm_gk20a *vm;
@@ -272,7 +289,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
bool force_need_sync_fence);
int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
struct nvgpu_alloc_gpfifo_args *args);
struct nvgpu_alloc_gpfifo_ex_args *args);
void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
void channel_gk20a_disable(struct channel_gk20a *ch);
@@ -284,6 +301,11 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
void channel_gk20a_enable(struct channel_gk20a *ch);
void gk20a_channel_timeout_restart_all_channels(struct gk20a *g);
bool channel_gk20a_is_prealloc_enabled(struct channel_gk20a *c);
void channel_gk20a_joblist_lock(struct channel_gk20a *c);
void channel_gk20a_joblist_unlock(struct channel_gk20a *c);
bool channel_gk20a_joblist_is_empty(struct channel_gk20a *c);
int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
int timeslice_period,
int *__timeslice_timeout, int *__timeslice_scale);


@@ -47,6 +47,11 @@ static void gk20a_fence_free(struct kref *ref)
#endif
if (f->semaphore)
gk20a_semaphore_put(f->semaphore);
if (f->allocator) {
if (gk20a_alloc_initialized(f->allocator))
gk20a_free(f->allocator, (u64)f);
} else
kfree(f);
}
@@ -109,15 +114,66 @@ int gk20a_fence_install_fd(struct gk20a_fence *f)
#endif
}
int gk20a_alloc_fence_pool(struct channel_gk20a *c, int count)
{
int err;
size_t size;
struct gk20a_fence *fence_pool = NULL;
size = sizeof(struct gk20a_fence);
if (count <= ULONG_MAX / size) {
size = count * size;
fence_pool = vzalloc(size);
}
if (!fence_pool)
return -ENOMEM;
err = gk20a_lockless_allocator_init(&c->fence_allocator,
"fence_pool", (u64)fence_pool, size,
sizeof(struct gk20a_fence), 0);
if (err)
goto fail;
return 0;
fail:
vfree(fence_pool);
return err;
}
void gk20a_free_fence_pool(struct channel_gk20a *c)
{
if (gk20a_alloc_initialized(&c->fence_allocator)) {
void *base = (void *)gk20a_alloc_base(&c->fence_allocator);
gk20a_alloc_destroy(&c->fence_allocator);
vfree(base);
}
}
struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c)
{
struct gk20a_fence *fence;
struct gk20a_fence *fence = NULL;
if (channel_gk20a_is_prealloc_enabled(c)) {
if (gk20a_alloc_initialized(&c->fence_allocator)) {
fence = (struct gk20a_fence *)
gk20a_alloc(&c->fence_allocator,
sizeof(struct gk20a_fence));
/* clear the node and reset the allocator pointer */
if (fence) {
memset(fence, 0, sizeof(*fence));
fence->allocator = &c->fence_allocator;
}
}
} else
fence = kzalloc(sizeof(struct gk20a_fence), GFP_KERNEL);
if (!fence)
return NULL;
if (fence)
kref_init(&fence->ref);
return fence;
}
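
The fence pool above follows a common pooled-object pattern: carve all fences
for a channel out of one vzalloc'd block, hand them out through an allocator,
and record the owning allocator in each fence so the release path knows
whether to return the fence to the pool or kfree() it. A simplified,
hypothetical sketch of that pattern follows; a bitmap stands in for the
driver's lockless allocator, and all names are illustrative.

/*
 * Simplified sketch of the pooled-object pattern used by the fence pool.
 * Hypothetical names; a bitmap stands in for gk20a's lockless allocator.
 */
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/string.h>

struct demo_obj;

struct demo_pool {
	struct demo_obj *slots;		/* one pre-allocated block of objects */
	unsigned long bitmap;		/* 1 bit per slot; assumes <= BITS_PER_LONG slots */
	unsigned int count;
};

struct demo_obj {
	struct demo_pool *pool;		/* NULL when individually kzalloc'd */
	/* ... payload ... */
};

static struct demo_obj *demo_obj_get(struct demo_pool *p)
{
	unsigned int i;

	if (!p)		/* no pool: fall back to dynamic allocation */
		return kzalloc(sizeof(struct demo_obj), GFP_KERNEL);

	for (i = 0; i < p->count; i++) {
		if (!test_and_set_bit(i, &p->bitmap)) {
			memset(&p->slots[i], 0, sizeof(p->slots[i]));
			p->slots[i].pool = p;	/* remember where to return it */
			return &p->slots[i];
		}
	}
	return NULL;	/* pool exhausted */
}

static void demo_obj_put(struct demo_obj *obj)
{
	if (obj->pool)
		clear_bit(obj - obj->pool->slots, &obj->pool->bitmap);
	else
		kfree(obj);
}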


@@ -3,7 +3,7 @@
*
* GK20A Fences
*
* Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -45,6 +45,9 @@ struct gk20a_fence {
struct platform_device *host1x_pdev;
u32 syncpt_id;
u32 syncpt_value;
/* Valid for fences part of a pre-allocated fence pool */
struct gk20a_allocator *allocator;
};
/* Fences can be created from semaphores or syncpoint (id, value) pairs */
@@ -62,7 +65,15 @@ int gk20a_fence_from_syncpt(
u32 id, u32 value, bool wfi,
bool need_sync_fence);
struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c);
int gk20a_alloc_fence_pool(
struct channel_gk20a *c,
int size);
void gk20a_free_fence_pool(
struct channel_gk20a *c);
struct gk20a_fence *gk20a_alloc_fence(
struct channel_gk20a *c);
void gk20a_init_fence(struct gk20a_fence *f,
const struct gk20a_fence_ops *ops,


@@ -963,7 +963,13 @@ struct nvgpu_alloc_gpfifo_args {
__u32 num_entries;
#define NVGPU_ALLOC_GPFIFO_FLAGS_VPR_ENABLED (1 << 0) /* set owner channel of this gpfifo as a vpr channel */
__u32 flags;
};
struct nvgpu_alloc_gpfifo_ex_args {
__u32 num_entries;
__u32 num_inflight_jobs;
__u32 flags;
__u32 reserved[5];
};
struct gk20a_sync_pt_info {
@@ -1182,6 +1188,8 @@ struct nvgpu_preemption_mode_args {
_IOWR(NVGPU_IOCTL_MAGIC, 18, struct nvgpu_set_timeout_ex_args)
#define NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO \
_IOW(NVGPU_IOCTL_MAGIC, 100, struct nvgpu_alloc_gpfifo_args)
#define NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX \
_IOW(NVGPU_IOCTL_MAGIC, 101, struct nvgpu_alloc_gpfifo_ex_args)
#define NVGPU_IOCTL_CHANNEL_WAIT \
_IOWR(NVGPU_IOCTL_MAGIC, 102, struct nvgpu_wait_args)
#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS \