diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index 4a3076b5f..b4fdfb44e 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c @@ -1,7 +1,7 @@ /* * Color decompression engine support * - * Copyright (c) 2014, NVIDIA Corporation. All rights reserved. + * Copyright (c) 2014-2015, NVIDIA Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -74,7 +74,7 @@ __must_hold(&cde_app->mutex) trace_gk20a_cde_remove_ctx(cde_ctx); /* free the channel */ - gk20a_free_channel(cde_ctx->ch, true); + gk20a_channel_close(ch); /* ..then release mapped memory */ gk20a_deinit_cde_img(cde_ctx); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index c12f196d0..5a71e8746 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -42,8 +42,8 @@ #define NVMAP_HANDLE_PARAM_SIZE 1 -static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f); -static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c); +static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f); +static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c); static void free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e); @@ -61,29 +61,33 @@ static int channel_gk20a_update_runlist(struct channel_gk20a *c, bool add); static void gk20a_free_error_notifiers(struct channel_gk20a *ch); -static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f) +/* allocate GPU channel */ +static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) { struct channel_gk20a *ch = NULL; - int chid; - mutex_lock(&f->ch_inuse_mutex); - for (chid = 0; chid < f->num_channels; chid++) { - if (!f->channel[chid].in_use) { - f->channel[chid].in_use = true; - ch = &f->channel[chid]; - break; - } + mutex_lock(&f->free_chs_mutex); + if (!list_empty(&f->free_chs)) { + ch = list_first_entry(&f->free_chs, struct channel_gk20a, + free_chs); + list_del(&ch->free_chs); + WARN_ON(atomic_read(&ch->ref_count)); + WARN_ON(ch->referenceable); } - mutex_unlock(&f->ch_inuse_mutex); + mutex_unlock(&f->free_chs_mutex); return ch; } -static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c) +static void free_channel(struct fifo_gk20a *f, + struct channel_gk20a *ch) { - mutex_lock(&f->ch_inuse_mutex); - f->channel[c->hw_chid].in_use = false; - mutex_unlock(&f->ch_inuse_mutex); + trace_gk20a_release_used_channel(ch->hw_chid); + /* refcount is zero here and channel is in a freed/dead state */ + mutex_lock(&f->free_chs_mutex); + /* add to head to increase visibility of timing-related bugs */ + list_add(&ch->free_chs, &f->free_chs); + mutex_unlock(&f->free_chs_mutex); } int channel_gk20a_commit_va(struct channel_gk20a *c) @@ -361,6 +365,11 @@ void gk20a_channel_abort(struct channel_gk20a *ch) struct channel_gk20a_job *job, *n; bool released_job_semaphore = false; + gk20a_dbg_fn(""); + + /* make sure new kickoffs are prevented */ + ch->has_timedout = true; + /* ensure no fences are pending */ mutex_lock(&ch->submit_lock); if (ch->sync) @@ -416,6 +425,8 @@ void gk20a_disable_channel(struct channel_gk20a *ch, bool finish, unsigned long finish_timeout) { + gk20a_dbg_fn(""); + if (finish) { int err = gk20a_channel_finish(ch, finish_timeout); WARN_ON(err); @@ -627,8 +638,9 @@ void gk20a_set_error_notifier(struct 
channel_gk20a *ch, __u32 error) (u32)(nsec >> 32); ch->error_notifier->info32 = error; ch->error_notifier->status = 0xffff; + gk20a_err(dev_from_gk20a(ch->g), - "error notifier set to %d for ch %d\n", error, ch->hw_chid); + "error notifier set to %d for ch %d", error, ch->hw_chid); } } @@ -643,7 +655,53 @@ static void gk20a_free_error_notifiers(struct channel_gk20a *ch) } } -void gk20a_free_channel(struct channel_gk20a *ch, bool finish) +/* Returns delta of cyclic integers a and b. If a is ahead of b, delta + * is positive */ +static int cyclic_delta(int a, int b) +{ + return a - b; +} + +static void gk20a_wait_for_deferred_interrupts(struct gk20a *g) +{ + int stall_irq_threshold = atomic_read(&g->hw_irq_stall_count); + int nonstall_irq_threshold = atomic_read(&g->hw_irq_nonstall_count); + + /* wait until all stalling irqs are handled */ + wait_event(g->sw_irq_stall_last_handled_wq, + cyclic_delta(stall_irq_threshold, + atomic_read(&g->sw_irq_stall_last_handled)) + <= 0); + + /* wait until all non-stalling irqs are handled */ + wait_event(g->sw_irq_nonstall_last_handled_wq, + cyclic_delta(nonstall_irq_threshold, + atomic_read(&g->sw_irq_nonstall_last_handled)) + <= 0); +} + +static void gk20a_wait_until_counter_is_N( + struct channel_gk20a *ch, atomic_t *counter, int wait_value, + wait_queue_head_t *wq, const char *caller, const char *counter_name) +{ + while (true) { + if (wait_event_timeout( + *wq, + atomic_read(counter) == wait_value, + msecs_to_jiffies(5000)) > 0) + break; + + gk20a_warn(dev_from_gk20a(ch->g), + "%s: channel %d, still waiting, %s left: %d, waiting for: %d", + caller, ch->hw_chid, counter_name, + atomic_read(counter), wait_value); + } +} + + + +/* call ONLY when no references to the channel exist: after the last put */ +static void gk20a_free_channel(struct channel_gk20a *ch) { struct gk20a *g = ch->g; struct fifo_gk20a *f = &g->fifo; @@ -654,13 +712,50 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish) gk20a_dbg_fn(""); + WARN_ON(ch->g == NULL); + + trace_gk20a_free_channel(ch->hw_chid); + + /* prevent new kickoffs */ + ch->has_timedout = true; + wmb(); + + /* wait until there's only our ref to the channel */ + gk20a_wait_until_counter_is_N( + ch, &ch->ref_count, 1, &ch->ref_count_dec_wq, + __func__, "references"); + + /* wait until all pending interrupts for recently completed + * jobs are handled */ + gk20a_wait_for_deferred_interrupts(g); + + /* prevent new refs */ + spin_lock(&ch->ref_obtain_lock); + if (!ch->referenceable) { + spin_unlock(&ch->ref_obtain_lock); + gk20a_err(dev_from_gk20a(ch->g), + "Extra %s() called to channel %u", + __func__, ch->hw_chid); + return; + } + ch->referenceable = false; + spin_unlock(&ch->ref_obtain_lock); + + /* matches with the initial reference in gk20a_open_new_channel() */ + atomic_dec(&ch->ref_count); + + /* wait until no more refs to the channel */ + gk20a_wait_until_counter_is_N( + ch, &ch->ref_count, 0, &ch->ref_count_dec_wq, + __func__, "references"); + /* if engine reset was deferred, perform it now */ mutex_lock(&f->deferred_reset_mutex); if (g->fifo.deferred_reset_pending) { gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was" " deferred, running now"); - gk20a_fifo_reset_engine(g, g->fifo.mmu_fault_engines); - g->fifo.mmu_fault_engines = 0; + gk20a_fifo_reset_engine(g, g->fifo.deferred_fault_engines); + g->fifo.deferred_fault_engines = 0; g->fifo.deferred_reset_pending = false; } mutex_unlock(&f->deferred_reset_mutex); @@ -674,7 +769,7 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool 
finish) gk20a_dbg_info("freeing bound channel context, timeout=%ld", timeout); - gk20a_disable_channel(ch, finish && !ch->has_timedout, timeout); + gk20a_disable_channel(ch, !ch->has_timedout, timeout); gk20a_free_error_notifiers(ch); @@ -714,6 +809,10 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish) spin_unlock(&ch->update_fn_lock); cancel_work_sync(&ch->update_fn_work); + /* make sure we don't have deferred interrupts pending that + * could still touch the channel */ + gk20a_wait_for_deferred_interrupts(g); + unbind: if (gk20a_is_channel_marked_as_tsg(ch)) gk20a_tsg_unbind_channel(ch); @@ -743,8 +842,66 @@ unbind: mutex_unlock(&ch->dbg_s_lock); release: + /* make sure we catch accesses of unopened channels in case + * there's non-refcounted channel pointers hanging around */ + ch->g = NULL; + wmb(); + /* ALWAYS last */ - release_used_channel(f, ch); + free_channel(f, ch); +} + +/* Try to get a reference to the channel. Return nonzero on success. If fails, + * the channel is dead or being freed elsewhere and you must not touch it. + * + * Always when a channel_gk20a pointer is seen and about to be used, a + * reference must be held to it - either by you or the caller, which should be + * documented well or otherwise clearly seen. This usually boils down to the + * file from ioctls directly, or an explicit get in exception handlers when the + * channel is found by a hw_chid. + * + * Most global functions in this file require a reference to be held by the + * caller. + */ +struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch, + const char *caller) { + struct channel_gk20a *ret; + + spin_lock(&ch->ref_obtain_lock); + + if (likely(ch->referenceable)) { + atomic_inc(&ch->ref_count); + ret = ch; + } else + ret = NULL; + + spin_unlock(&ch->ref_obtain_lock); + + if (ret) + trace_gk20a_channel_get(ch->hw_chid, caller); + + return ret; +} + +void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller) +{ + trace_gk20a_channel_put(ch->hw_chid, caller); + atomic_dec(&ch->ref_count); + wake_up_all(&ch->ref_count_dec_wq); + + /* More puts than gets. Channel is probably going to get + * stuck. */ + WARN_ON(atomic_read(&ch->ref_count) < 0); + + /* Also, more puts than gets. ref_count can go to 0 only if + * the channel is closing. Channel is probably going to get + * stuck. 
*/ + WARN_ON(atomic_read(&ch->ref_count) == 0 && ch->referenceable); +} + +void gk20a_channel_close(struct channel_gk20a *ch) +{ + gk20a_free_channel(ch); } int gk20a_channel_release(struct inode *inode, struct file *filp) @@ -758,14 +915,14 @@ int gk20a_channel_release(struct inode *inode, struct file *filp) trace_gk20a_channel_release(dev_name(&g->dev->dev)); - err = gk20a_busy(ch->g->dev); + err = gk20a_busy(g->dev); if (err) { gk20a_err(dev_from_gk20a(g), "failed to release channel %d", ch->hw_chid); return err; } - gk20a_free_channel(ch, true); - gk20a_idle(ch->g->dev); + gk20a_channel_close(ch); + gk20a_idle(g->dev); filp->private_data = NULL; return 0; @@ -808,22 +965,31 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g) struct fifo_gk20a *f = &g->fifo; struct channel_gk20a *ch; - ch = acquire_unused_channel(f); + gk20a_dbg_fn(""); + + ch = allocate_channel(f); if (ch == NULL) { /* TBD: we want to make this virtualizable */ gk20a_err(dev_from_gk20a(g), "out of hw chids"); return NULL; } + trace_gk20a_open_new_channel(ch->hw_chid); + + BUG_ON(ch->g); ch->g = g; if (g->ops.fifo.alloc_inst(g, ch)) { - ch->in_use = false; + ch->g = NULL; + free_channel(f, ch); gk20a_err(dev_from_gk20a(g), "failed to open gk20a channel, out of inst mem"); - return NULL; } + + /* now the channel is in a limbo out of the free list but not marked as + * alive and used (i.e. get-able) yet */ + ch->pid = current->pid; /* By default, channel is regular (non-TSG) channel */ @@ -854,6 +1020,13 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g) spin_lock_init(&ch->update_fn_lock); INIT_WORK(&ch->update_fn_work, gk20a_channel_update_runcb_fn); + /* Mark the channel alive, get-able, with 1 initial use + * references. The initial reference will be decreased in + * gk20a_free_channel() */ + ch->referenceable = true; + atomic_set(&ch->ref_count, 1); + wmb(); + return ch; } @@ -1379,7 +1552,7 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, struct mapped_buffer_node **mapped_buffers = NULL; int err = 0, num_mapped_buffers; - /* job needs reference to this vm */ + /* job needs reference to this vm (released in channel_update) */ gk20a_vm_get(vm); err = gk20a_vm_get_buffers(vm, &mapped_buffers, &num_mapped_buffers); @@ -1395,14 +1568,21 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, return -ENOMEM; } - job->num_mapped_buffers = num_mapped_buffers; - job->mapped_buffers = mapped_buffers; - job->pre_fence = gk20a_fence_get(pre_fence); - job->post_fence = gk20a_fence_get(post_fence); + /* put() is done in gk20a_channel_update() when the job is done */ + c = gk20a_channel_get(c); - mutex_lock(&c->jobs_lock); - list_add_tail(&job->list, &c->jobs); - mutex_unlock(&c->jobs_lock); + if (c) { + job->num_mapped_buffers = num_mapped_buffers; + job->mapped_buffers = mapped_buffers; + job->pre_fence = gk20a_fence_get(pre_fence); + job->post_fence = gk20a_fence_get(post_fence); + + mutex_lock(&c->jobs_lock); + list_add_tail(&job->list, &c->jobs); + mutex_unlock(&c->jobs_lock); + } else { + return -ETIMEDOUT; + } return 0; } @@ -1412,13 +1592,15 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) struct vm_gk20a *vm = c->vm; struct channel_gk20a_job *job, *n; - trace_gk20a_channel_update(c); + trace_gk20a_channel_update(c->hw_chid); wake_up(&c->submit_wq); mutex_lock(&c->submit_lock); mutex_lock(&c->jobs_lock); list_for_each_entry_safe(job, n, &c->jobs, list) { + struct gk20a *g = c->g; + bool completed = gk20a_fence_is_expired(job->post_fence); if (!completed) 
break; @@ -1434,12 +1616,15 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) gk20a_fence_put(job->pre_fence); gk20a_fence_put(job->post_fence); - /* job is done. release its reference to vm */ + /* job is done. release its vm reference (taken in add_job) */ gk20a_vm_put(vm); + /* another bookkeeping taken in add_job. caller must hold a ref + * so this wouldn't get freed here. */ + gk20a_channel_put(c); list_del_init(&job->list); kfree(job); - gk20a_idle(c->g->dev); + gk20a_idle(g->dev); } /* @@ -1719,10 +1904,13 @@ clean_up: int gk20a_init_channel_support(struct gk20a *g, u32 chid) { struct channel_gk20a *c = g->fifo.channel+chid; - c->g = g; - c->in_use = false; + c->g = NULL; c->hw_chid = chid; c->bound = false; + spin_lock_init(&c->ref_obtain_lock); + atomic_set(&c->ref_count, 0); + c->referenceable = false; + init_waitqueue_head(&c->ref_count_dec_wq); mutex_init(&c->ioctl_lock); mutex_init(&c->jobs_lock); mutex_init(&c->submit_lock); @@ -1733,6 +1921,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) #endif INIT_LIST_HEAD(&c->dbg_s_list); mutex_init(&c->dbg_s_lock); + list_add(&c->free_chs, &g->fifo.free_chs); return 0; } @@ -2066,8 +2255,7 @@ int gk20a_channel_suspend(struct gk20a *g) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *ch = &f->channel[chid]; - if (ch->in_use) { - + if (gk20a_channel_get(ch)) { gk20a_dbg_info("suspend channel %d", chid); /* disable channel */ g->ops.fifo.disable_channel(ch); @@ -2079,6 +2267,8 @@ int gk20a_channel_suspend(struct gk20a *g) flush_work(&ch->update_fn_work); channels_in_use = true; + + gk20a_channel_put(ch); } } @@ -2086,8 +2276,10 @@ int gk20a_channel_suspend(struct gk20a *g) g->ops.fifo.update_runlist(g, 0, ~0, false, true); for (chid = 0; chid < f->num_channels; chid++) { - if (f->channel[chid].in_use) + if (gk20a_channel_get(&f->channel[chid])) { g->ops.fifo.unbind_channel(&f->channel[chid]); + gk20a_channel_put(&f->channel[chid]); + } } } @@ -2095,8 +2287,6 @@ int gk20a_channel_suspend(struct gk20a *g) return 0; } -/* in this context the "channel" is the host1x channel which - * maps to *all* gk20a channels */ int gk20a_channel_resume(struct gk20a *g) { struct fifo_gk20a *f = &g->fifo; @@ -2106,10 +2296,11 @@ int gk20a_channel_resume(struct gk20a *g) gk20a_dbg_fn(""); for (chid = 0; chid < f->num_channels; chid++) { - if (f->channel[chid].in_use) { + if (gk20a_channel_get(&f->channel[chid])) { gk20a_dbg_info("resume channel %d", chid); g->ops.fifo.bind_channel(&f->channel[chid]); channels_in_use = true; + gk20a_channel_put(&f->channel[chid]); } } @@ -2129,10 +2320,11 @@ void gk20a_channel_semaphore_wakeup(struct gk20a *g) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *c = g->fifo.channel+chid; - if (c->in_use) { + if (gk20a_channel_get(c)) { gk20a_channel_event(c); wake_up_interruptible_all(&c->semaphore_wq); gk20a_channel_update(c, 0); + gk20a_channel_put(c); } } } @@ -2225,10 +2417,18 @@ long gk20a_channel_ioctl(struct file *filp, return -EFAULT; } + /* take a ref or return timeout if channel refs can't be taken */ + ch = gk20a_channel_get(ch); + if (!ch) + return -ETIMEDOUT; + /* protect our sanity for threaded userspace - most of the channel is * not thread safe */ mutex_lock(&ch->ioctl_lock); + /* this ioctl call keeps a ref to the file which keeps a ref to the + * channel */ + switch (cmd) { case NVGPU_IOCTL_CHANNEL_OPEN: err = gk20a_channel_open_ioctl(ch->g, @@ -2449,9 +2649,11 @@ long gk20a_channel_ioctl(struct file *filp, if ((err == 0) && 
(_IOC_DIR(cmd) & _IOC_READ)) err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd)); - gk20a_dbg_fn("end"); - mutex_unlock(&ch->ioctl_lock); + gk20a_channel_put(ch); + + gk20a_dbg_fn("end"); + return err; } diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index f022fe36d..2ea5b4beb 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -19,12 +19,13 @@ #define CHANNEL_GK20A_H #include -#include -#include #include -#include #include +#include +#include #include +#include +#include struct gk20a; struct gr_gk20a; @@ -77,8 +78,15 @@ struct channel_gk20a_poll_events { /* this is the priv element of struct nvhost_channel */ struct channel_gk20a { - struct gk20a *g; - bool in_use; + struct gk20a *g; /* set only when channel is active */ + + struct list_head free_chs; + + spinlock_t ref_obtain_lock; + bool referenceable; + atomic_t ref_count; + wait_queue_head_t ref_count_dec_wq; + int hw_chid; bool bound; bool first_init; @@ -171,7 +179,10 @@ static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch) } int channel_gk20a_commit_va(struct channel_gk20a *c); int gk20a_init_channel_support(struct gk20a *, u32 chid); -void gk20a_free_channel(struct channel_gk20a *ch, bool finish); + +/* must be inside gk20a_busy()..gk20a_idle() */ +void gk20a_channel_close(struct channel_gk20a *ch); + bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, u32 timeout_delta_ms); void gk20a_disable_channel(struct channel_gk20a *ch, @@ -202,6 +213,15 @@ void gk20a_channel_event(struct channel_gk20a *ch); void gk20a_init_channel(struct gpu_ops *gops); +/* returns ch if reference was obtained */ +struct channel_gk20a *__must_check _gk20a_channel_get(struct channel_gk20a *ch, + const char *caller); +#define gk20a_channel_get(ch) _gk20a_channel_get(ch, __func__) + + +void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller); +#define gk20a_channel_put(ch) _gk20a_channel_put(ch, __func__) + int gk20a_wait_channel_idle(struct channel_gk20a *ch); struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g); struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index 8cc852c71..7a707fbdd 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c @@ -154,8 +154,23 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd, static void gk20a_channel_syncpt_update(void *priv, int nr_completed) { - struct channel_gk20a *ch20a = priv; - gk20a_channel_update(ch20a, nr_completed); + struct channel_gk20a *ch = priv; + struct gk20a *g = ch->g; + + /* need busy for possible channel deletion */ + if (gk20a_busy(ch->g->dev)) { + gk20a_err(dev_from_gk20a(ch->g), + "failed to busy while syncpt update"); + /* Last gk20a_idle()s are in channel_update, so we shouldn't + * get here. 
If we do, the channel is badly broken now */ + return; + } + + /* note: channel_get() is in __gk20a_channel_syncpt_incr() */ + gk20a_channel_update(ch, nr_completed); + gk20a_channel_put(ch); + + gk20a_idle(g->dev); } static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s, @@ -209,14 +224,37 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s, thresh = nvhost_syncpt_incr_max_ext(sp->host1x_pdev, sp->id, 2); if (register_irq) { - err = nvhost_intr_register_notifier(sp->host1x_pdev, - sp->id, thresh, - gk20a_channel_syncpt_update, c); + err = gk20a_busy(c->g->dev); + if (err) + gk20a_err(dev_from_gk20a(c->g), + "failed to add syncpt interrupt notifier for channel %d", + c->hw_chid); + else { + struct channel_gk20a *referenced = gk20a_channel_get(c); - /* Adding interrupt action should never fail. A proper error - * handling here would require us to decrement the syncpt max - * back to its original value. */ - WARN(err, "failed to set submit complete interrupt"); + WARN_ON(!referenced); + gk20a_idle(c->g->dev); + + if (referenced) { + /* note: channel_put() is in + * gk20a_channel_syncpt_update() */ + + err = nvhost_intr_register_notifier( + sp->host1x_pdev, + sp->id, thresh, + gk20a_channel_syncpt_update, c); + if (err) + gk20a_channel_put(referenced); + + /* Adding interrupt action should + * never fail. A proper error handling + * here would require us to decrement + * the syncpt max back to its original + * value. */ + WARN(err, + "failed to set submit complete interrupt"); + } + } } *fence = gk20a_fence_from_syncpt(sp->host1x_pdev, sp->id, thresh, diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index 0f1c31ddf..bda0dab03 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -36,6 +36,7 @@ static struct platform_device *gk20a_device; struct ch_state { int pid; + int refs; u8 inst_block[0]; }; @@ -118,9 +119,10 @@ static void gk20a_debug_show_channel(struct gk20a *g, syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w()); syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w()); - gk20a_debug_output(o, "%d-%s, pid %d: ", hw_chid, + gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, g->dev->name, - ch_state->pid); + ch_state->pid, + ch_state->refs); gk20a_debug_output(o, "%s in use %s %s\n", ccsr_channel_enable_v(channel) ? 
"" : "not", ccsr_chan_status_str[status], @@ -231,16 +233,30 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) } for (chid = 0; chid < f->num_channels; chid++) { - if (f->channel[chid].in_use) - ch_state[chid] = kmalloc(sizeof(struct ch_state) + ram_in_alloc_size_v(), GFP_KERNEL); + struct channel_gk20a *ch = &f->channel[chid]; + if (gk20a_channel_get(ch)) { + ch_state[chid] = + kmalloc(sizeof(struct ch_state) + + ram_in_alloc_size_v(), GFP_KERNEL); + /* ref taken stays to below loop with + * successful allocs */ + if (!ch_state[chid]) + gk20a_channel_put(ch); + } } for (chid = 0; chid < f->num_channels; chid++) { - if (ch_state[chid] && f->channel[chid].inst_block.cpu_va) { - ch_state[chid]->pid = f->channel[chid].pid; - memcpy(&ch_state[chid]->inst_block[0], - f->channel[chid].inst_block.cpu_va, - ram_in_alloc_size_v()); + struct channel_gk20a *ch = &f->channel[chid]; + if (ch_state[chid]) { + if (ch->inst_block.cpu_va) { + ch_state[chid]->pid = ch->pid; + ch_state[chid]->refs = + atomic_read(&ch->ref_count); + memcpy(&ch_state[chid]->inst_block[0], + ch->inst_block.cpu_va, + ram_in_alloc_size_v()); + } + gk20a_channel_put(ch); } } for (chid = 0; chid < f->num_channels; chid++) { diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 56b954a98..4ef310b23 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -515,6 +515,9 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) init_runlist(g, f); + INIT_LIST_HEAD(&f->free_chs); + mutex_init(&f->free_chs_mutex); + for (chid = 0; chid < f->num_channels; chid++) { f->channel[chid].userd_cpu_va = f->userd.cpu_va + chid * f->userd_entry_size; @@ -527,7 +530,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) gk20a_init_channel_support(g, chid); gk20a_init_tsg_support(g, chid); } - mutex_init(&f->ch_inuse_mutex); mutex_init(&f->tsg_inuse_mutex); f->remove_support = gk20a_remove_fifo_support; @@ -637,6 +639,7 @@ int gk20a_init_fifo_support(struct gk20a *g) return err; } +/* return with a reference to the channel, caller must put it back */ static struct channel_gk20a * channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr) { @@ -644,10 +647,16 @@ channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr) if (unlikely(!f->channel)) return NULL; for (ci = 0; ci < f->num_channels; ci++) { - struct channel_gk20a *c = f->channel+ci; - if (c->inst_block.cpu_va && - (inst_ptr == gk20a_mem_phys(&c->inst_block))) - return f->channel+ci; + struct channel_gk20a *ch = gk20a_channel_get(&f->channel[ci]); + /* only alive channels are searched */ + if (!ch) + continue; + + if (ch->inst_block.cpu_va && + (inst_ptr == gk20a_mem_phys(&ch->inst_block))) + return ch; + + gk20a_channel_put(ch); } return NULL; } @@ -803,6 +812,7 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id, return true; } +/* caller must hold a channel reference */ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, struct channel_gk20a *ch) { @@ -854,14 +864,38 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, "TSG %d generated a mmu fault", tsg->tsgid); mutex_lock(&tsg->ch_list_lock); - list_for_each_entry(ch, &tsg->ch_list, ch_entry) - ret = gk20a_fifo_set_ctx_mmu_error(g, ch); + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + if (!gk20a_fifo_set_ctx_mmu_error(g, ch)) + ret = false; + gk20a_channel_put(ch); + } + } mutex_unlock(&tsg->ch_list_lock); return ret; } -static bool 
gk20a_fifo_handle_mmu_fault(struct gk20a *g) +static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid) +{ + struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; + struct channel_gk20a *ch; + + mutex_lock(&tsg->ch_list_lock); + list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (gk20a_channel_get(ch)) { + gk20a_channel_abort(ch); + gk20a_channel_put(ch); + } + } + mutex_unlock(&tsg->ch_list_lock); +} + +static bool gk20a_fifo_handle_mmu_fault( + struct gk20a *g, + u32 mmu_fault_engines, /* queried from HW if 0 */ + u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/ + bool id_is_tsg) { bool fake_fault; unsigned long fault_id; @@ -894,10 +928,8 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) grfifo_ctl | gr_gpfifo_ctl_access_f(0) | gr_gpfifo_ctl_semaphore_access_f(0)); - /* If we have recovery in progress, MMU fault id is invalid */ - if (g->fifo.mmu_fault_engines) { - fault_id = g->fifo.mmu_fault_engines; - g->fifo.mmu_fault_engines = 0; + if (mmu_fault_engines) { + fault_id = mmu_fault_engines; fake_fault = true; } else { fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r()); @@ -914,6 +946,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) struct fifo_mmu_fault_info_gk20a f; struct channel_gk20a *ch = NULL; struct tsg_gk20a *tsg = NULL; + struct channel_gk20a *referenced_channel = 0; /* read and parse engine status */ u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id)); u32 ctx_status = fifo_engine_status_ctx_status_v(status); @@ -953,22 +986,34 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) /* get the channel/TSG */ if (fake_fault) { /* use next_id if context load is failing */ - u32 id = (ctx_status == - fifo_engine_status_ctx_status_ctxsw_load_v()) ? - fifo_engine_status_next_id_v(status) : - fifo_engine_status_id_v(status); - u32 type = (ctx_status == - fifo_engine_status_ctx_status_ctxsw_load_v()) ? - fifo_engine_status_next_id_type_v(status) : - fifo_engine_status_id_type_v(status); + u32 id, type; + + if (hw_id == ~(u32)0) { + id = (ctx_status == + fifo_engine_status_ctx_status_ctxsw_load_v()) ? + fifo_engine_status_next_id_v(status) : + fifo_engine_status_id_v(status); + type = (ctx_status == + fifo_engine_status_ctx_status_ctxsw_load_v()) ? + fifo_engine_status_next_id_type_v(status) : + fifo_engine_status_id_type_v(status); + } else { + id = hw_id; + type = id_is_tsg ? 
+ fifo_engine_status_id_type_tsgid_v() : + fifo_engine_status_id_type_chid_v(); + } if (type == fifo_engine_status_id_type_tsgid_v()) tsg = &g->fifo.tsg[id]; - else if (type == fifo_engine_status_id_type_chid_v()) + else if (type == fifo_engine_status_id_type_chid_v()) { ch = &g->fifo.channel[id]; + referenced_channel = gk20a_channel_get(ch); + } } else { /* read channel based on instruction pointer */ ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr); + referenced_channel = ch; } if (ch && gk20a_is_channel_marked_as_tsg(ch)) @@ -977,7 +1022,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) /* check if engine reset should be deferred */ if ((ch || tsg) && gk20a_fifo_should_defer_engine_reset(g, engine_id, &f, fake_fault)) { - g->fifo.mmu_fault_engines = fault_id; + g->fifo.deferred_fault_engines = fault_id; /* handled during channel free */ g->fifo.deferred_reset_pending = true; @@ -988,19 +1033,31 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) * syncpoints */ if (tsg) { - struct channel_gk20a *ch = NULL; if (!g->fifo.deferred_reset_pending) verbose = gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); - mutex_lock(&tsg->ch_list_lock); - list_for_each_entry(ch, &tsg->ch_list, ch_entry) - gk20a_channel_abort(ch); - mutex_unlock(&tsg->ch_list_lock); + + gk20a_fifo_abort_tsg(g, ch->tsgid); + + /* put back the ref taken early above */ + if (referenced_channel) { + gk20a_channel_put(ch); + } else { + gk20a_err(dev_from_gk20a(g), + "mmu error in freed tsg channel %d on tsgid %d", + ch->hw_chid, ch->tsgid); + } } else if (ch) { - if (!g->fifo.deferred_reset_pending) - verbose = - gk20a_fifo_set_ctx_mmu_error_ch(g, ch); - gk20a_channel_abort(ch); + if (referenced_channel) { + if (!g->fifo.deferred_reset_pending) + verbose = gk20a_fifo_set_ctx_mmu_error_ch(g, ch); + gk20a_channel_abort(ch); + gk20a_channel_put(ch); + } else { + gk20a_err(dev_from_gk20a(g), + "mmu error in freed channel %d", + ch->hw_chid); + } } else if (f.inst_ptr == gk20a_mem_phys(&g->mm.bar1.inst_block)) { gk20a_err(dev_from_gk20a(g), "mmu fault from bar1"); @@ -1133,46 +1190,69 @@ static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg) void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose) { - u32 engines = gk20a_fifo_engines_on_id(g, hw_chid, false); + u32 engines; + + /* stop context switching to prevent engine assignments from + changing until channel is recovered */ + mutex_lock(&g->dbg_sessions_lock); + gr_gk20a_disable_ctxsw(g); + + engines = gk20a_fifo_engines_on_id(g, hw_chid, false); + if (engines) - gk20a_fifo_recover(g, engines, verbose); + gk20a_fifo_recover(g, engines, hw_chid, false, verbose); else { - struct channel_gk20a *ch = - g->fifo.channel + hw_chid; + struct channel_gk20a *ch = &g->fifo.channel[hw_chid]; - gk20a_channel_abort(ch); + if (gk20a_channel_get(ch)) { + gk20a_channel_abort(ch); - if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch)) - gk20a_debug_dump(g->dev); + if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch)) + gk20a_debug_dump(g->dev); + + gk20a_channel_put(ch); + } } + + gr_gk20a_enable_ctxsw(g); + mutex_unlock(&g->dbg_sessions_lock); } void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose) { - u32 engines = gk20a_fifo_engines_on_id(g, tsgid, true); + u32 engines; + + /* stop context switching to prevent engine assignments from + changing until TSG is recovered */ + mutex_lock(&g->dbg_sessions_lock); + gr_gk20a_disable_ctxsw(g); + + engines = gk20a_fifo_engines_on_id(g, tsgid, true); + if (engines) - gk20a_fifo_recover(g, engines, verbose); + 
gk20a_fifo_recover(g, engines, tsgid, true, verbose); else { struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; - struct channel_gk20a *ch; if (gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg)) gk20a_debug_dump(g->dev); - mutex_lock(&tsg->ch_list_lock); - list_for_each_entry(ch, &tsg->ch_list, ch_entry) - gk20a_channel_abort(ch); - mutex_unlock(&tsg->ch_list_lock); + gk20a_fifo_abort_tsg(g, tsgid); } + + gr_gk20a_enable_ctxsw(g); + mutex_unlock(&g->dbg_sessions_lock); } void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, - bool verbose) + u32 hw_id, bool id_is_tsg, + bool verbose) { unsigned long engine_id, i; unsigned long _engine_ids = __engine_ids; unsigned long engine_ids = 0; u32 val; + u32 mmu_fault_engines = 0; if (verbose) gk20a_debug_dump(g->dev); @@ -1181,7 +1261,6 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, g->ops.ltc.flush(g); /* store faulted engines in advance */ - g->fifo.mmu_fault_engines = 0; for_each_set_bit(engine_id, &_engine_ids, 32) { u32 ref_type; u32 ref_id; @@ -1196,11 +1275,10 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, gk20a_fifo_get_faulty_id_type(g, i, &id, &type); if (ref_type == type && ref_id == id) { engine_ids |= BIT(i); - g->fifo.mmu_fault_engines |= + mmu_fault_engines |= BIT(gk20a_engine_id_to_mmu_id(i)); } } - } /* @@ -1214,7 +1292,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, fifo_intr_0_sched_error_reset_f()); g->ops.fifo.trigger_mmu_fault(g, engine_ids); - gk20a_fifo_handle_mmu_fault(g); + gk20a_fifo_handle_mmu_fault(g, engine_ids, hw_id, id_is_tsg); val = gk20a_readl(g, fifo_intr_en_0_r()); val |= fifo_intr_en_0_mmu_fault_f(1) @@ -1222,25 +1300,32 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, gk20a_writel(g, fifo_intr_en_0_r(), val); } +/* force reset channel and tsg (if it's part of one) */ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) { struct tsg_gk20a *tsg = NULL; struct channel_gk20a *ch_tsg = NULL; + struct gk20a *g = ch->g; if (gk20a_is_channel_marked_as_tsg(ch)) { - tsg = &ch->g->fifo.tsg[ch->hw_chid]; + tsg = &g->fifo.tsg[ch->hw_chid]; mutex_lock(&tsg->ch_list_lock); + list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { - gk20a_set_error_notifier(ch_tsg, - NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); + if (gk20a_channel_get(ch_tsg)) { + gk20a_set_error_notifier(ch_tsg, + NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); + gk20a_channel_put(ch_tsg); + } } + mutex_unlock(&tsg->ch_list_lock); - gk20a_fifo_recover_tsg(ch->g, ch->tsgid, verbose); + gk20a_fifo_recover_tsg(g, ch->tsgid, verbose); } else { gk20a_set_error_notifier(ch, NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR); - gk20a_fifo_recover_ch(ch->g, ch->hw_chid, verbose); + gk20a_fifo_recover_ch(g, ch->hw_chid, verbose); } return 0; @@ -1300,11 +1385,14 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) struct channel_gk20a *ch = &f->channel[id]; if (non_chid) { - gk20a_fifo_recover(g, BIT(engine_id), true); + gk20a_fifo_recover(g, BIT(engine_id), id, true, true); ret = true; goto err; } + if (!gk20a_channel_get(ch)) + goto err; + if (gk20a_channel_update_and_check_timeout(ch, GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { gk20a_set_error_notifier(ch, @@ -1313,7 +1401,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) "fifo sched ctxsw timeout error:" "engine = %u, ch = %d", engine_id, id); gk20a_gr_debug_dump(g->dev); - gk20a_fifo_recover(g, BIT(engine_id), + gk20a_fifo_recover(g, BIT(engine_id), id, false, ch->timeout_debug_dump); ret = true; } else { @@ -1324,6 +1412,7 @@ static bool 
gk20a_fifo_handle_sched_error(struct gk20a *g) id); ret = false; } + gk20a_channel_put(ch); return ret; } @@ -1336,7 +1425,7 @@ err: static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) { - bool print_channel_reset_log = false, reset_engine = false; + bool print_channel_reset_log = false; struct device *dev = dev_from_gk20a(g); u32 handled = 0; @@ -1367,8 +1456,8 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) } if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) { - print_channel_reset_log = gk20a_fifo_handle_mmu_fault(g); - reset_engine = true; + print_channel_reset_log = + gk20a_fifo_handle_mmu_fault(g, 0, ~(u32)0, false); handled |= fifo_intr_0_mmu_fault_pending_f(); } @@ -1452,9 +1541,12 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev, == fifo_pbdma_status_id_type_chid_v()) { struct channel_gk20a *ch = &f->channel[id]; - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_PBDMA_ERROR); - gk20a_fifo_recover_ch(g, id, true); + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_PBDMA_ERROR); + gk20a_fifo_recover_ch(g, id, true); + gk20a_channel_put(ch); + } } else if (fifo_pbdma_status_id_type_v(status) == fifo_pbdma_status_id_type_tsgid_v()) { struct tsg_gk20a *tsg = &f->tsg[id]; @@ -1462,8 +1554,11 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev, mutex_lock(&tsg->ch_list_lock); list_for_each_entry(ch, &tsg->ch_list, ch_entry) { - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_PBDMA_ERROR); + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_PBDMA_ERROR); + gk20a_channel_put(ch); + } } mutex_unlock(&tsg->ch_list_lock); gk20a_fifo_recover_tsg(g, id, true); @@ -1559,6 +1654,8 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)); u32 ret = 0; + gk20a_dbg_fn("%d", id); + /* issue preempt */ if (is_tsg) gk20a_writel(g, fifo_preempt_r(), @@ -1569,6 +1666,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) fifo_preempt_chid_f(id) | fifo_preempt_type_channel_f()); + gk20a_dbg_fn("%d", id); /* wait for preempt */ ret = -EBUSY; do { @@ -1583,6 +1681,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) } while (time_before(jiffies, end_jiffies) || !tegra_platform_is_silicon()); + gk20a_dbg_fn("%d", id); if (ret) { if (is_tsg) { struct tsg_gk20a *tsg = &g->fifo.tsg[id]; @@ -1593,8 +1692,11 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) mutex_lock(&tsg->ch_list_lock); list_for_each_entry(ch, &tsg->ch_list, ch_entry) { + if (!gk20a_channel_get(ch)) + continue; gk20a_set_error_notifier(ch, NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + gk20a_channel_put(ch); } mutex_unlock(&tsg->ch_list_lock); gk20a_fifo_recover_tsg(g, id, true); @@ -1604,9 +1706,12 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg) gk20a_err(dev_from_gk20a(g), "preempt channel %d timeout\n", id); - gk20a_set_error_notifier(ch, - NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); - gk20a_fifo_recover_ch(g, id, true); + if (gk20a_channel_get(ch)) { + gk20a_set_error_notifier(ch, + NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); + gk20a_fifo_recover_ch(g, id, true); + gk20a_channel_put(ch); + } } } @@ -1790,7 +1895,9 @@ static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id) (f->engine_info[i].runlist_id == runlist_id)) engines |= BIT(i); } - gk20a_fifo_recover(g, engines, true); + + if (engines) + gk20a_fifo_recover(g, engines, ~(u32)0, false, true); } static int 
gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id) @@ -1994,6 +2101,8 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 hw_chid, u32 mutex_ret; u32 ret = 0; + gk20a_dbg_fn(""); + runlist = &f->runlist_info[runlist_id]; mutex_lock(&runlist->mutex); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index dd320ae1e..fdf843d2f 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -3,7 +3,7 @@ * * GK20A graphics fifo (gr host) * - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -106,7 +106,9 @@ struct fifo_gk20a { u32 userd_entry_size; struct channel_gk20a *channel; - struct mutex ch_inuse_mutex; /* protect unused chid look up */ + /* zero-kref'd channels here */ + struct list_head free_chs; + struct mutex free_chs_mutex; struct tsg_gk20a *tsg; struct mutex tsg_inuse_mutex; @@ -130,7 +132,7 @@ struct fifo_gk20a { } intr; - u32 mmu_fault_engines; + u32 deferred_fault_engines; bool deferred_reset_pending; struct mutex deferred_reset_mutex; }; @@ -157,7 +159,12 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 hw_chid, int gk20a_fifo_suspend(struct gk20a *g); bool gk20a_fifo_mmu_fault_pending(struct gk20a *g); -void gk20a_fifo_recover(struct gk20a *g, u32 engine_ids, bool verbose); + +void gk20a_fifo_recover(struct gk20a *g, + u32 engine_ids, /* if zero, will be queried from HW */ + u32 hw_id, /* if ~0, will be queried from HW */ + bool hw_id_is_tsg, /* ignored if hw_id == ~0 */ + bool verbose); void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 9c201f32a..498de7e78 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -1388,6 +1388,9 @@ static int gk20a_probe(struct platform_device *dev) return -ENOMEM; } + init_waitqueue_head(&gk20a->sw_irq_stall_last_handled_wq); + init_waitqueue_head(&gk20a->sw_irq_nonstall_last_handled_wq); + #ifdef CONFIG_PM_GENERIC_DOMAINS_OF gk20a_domain = container_of(dev_to_genpd(&dev->dev), struct gk20a_domain_data, gpd); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index a52d97f36..d8e3586f4 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -538,6 +538,15 @@ struct gk20a { u32 max_ltc_count; u32 ltc_count; + atomic_t hw_irq_stall_count; + atomic_t hw_irq_nonstall_count; + + atomic_t sw_irq_stall_last_handled; + wait_queue_head_t sw_irq_stall_last_handled_wq; + + atomic_t sw_irq_nonstall_last_handled; + wait_queue_head_t sw_irq_nonstall_last_handled_wq; + struct devfreq *devfreq; struct gk20a_scale_profile *scale_profile; diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index b2fea5b82..edd4c6c86 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -5138,22 +5138,25 @@ static int gk20a_gr_handle_notify_pending(struct gk20a *g, * Also used by regops to translate current ctx to chid and tsgid. * For performance, we don't want to go through 128 channels every time. 
* curr_ctx should be the value read from gr_fecs_current_ctx_r(). - * A small tlb is used here to cache translation */ -static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx, - int *curr_tsgid) + * A small tlb is used here to cache translation. + * + * Returned channel must be freed with gk20a_channel_put() */ +static struct channel_gk20a *gk20a_gr_get_channel_from_ctx( + struct gk20a *g, u32 curr_ctx, int *curr_tsgid) { struct fifo_gk20a *f = &g->fifo; struct gr_gk20a *gr = &g->gr; u32 chid = -1; int tsgid = NVGPU_INVALID_TSG_ID; u32 i; + struct channel_gk20a *ret = NULL; /* when contexts are unloaded from GR, the valid bit is reset * but the instance pointer information remains intact. So the * valid bit must be checked to be absolutely certain that a * valid context is currently resident. */ if (!gr_fecs_current_ctx_valid_v(curr_ctx)) - return -1; + return NULL; spin_lock(&gr->ch_tlb_lock); @@ -5162,25 +5165,30 @@ static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx, if (gr->chid_tlb[i].curr_ctx == curr_ctx) { chid = gr->chid_tlb[i].hw_chid; tsgid = gr->chid_tlb[i].tsgid; + ret = gk20a_channel_get(&f->channel[chid]); goto unlock; } } /* slow path */ - for (chid = 0; chid < f->num_channels; chid++) - if (f->channel[chid].in_use) { - if ((u32)(gk20a_mem_phys(&f->channel[chid].inst_block) >> - ram_in_base_shift_v()) == + for (chid = 0; chid < f->num_channels; chid++) { + struct channel_gk20a *ch = &f->channel[chid]; + if (!gk20a_channel_get(ch)) + continue; + + if ((u32)(gk20a_mem_phys(&ch->inst_block) >> + ram_in_base_shift_v()) == gr_fecs_current_ctx_ptr_v(curr_ctx)) { - tsgid = f->channel[chid].tsgid; - break; - } + tsgid = ch->tsgid; + /* found it */ + ret = ch; + break; + } + gk20a_channel_put(ch); } - if (chid >= f->num_channels) { - chid = -1; + if (!ret) goto unlock; - } /* add to free tlb entry */ for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) { @@ -5205,7 +5213,7 @@ unlock: spin_unlock(&gr->ch_tlb_lock); if (curr_tsgid) *curr_tsgid = tsgid; - return chid; + return ret; } int gk20a_gr_lock_down_sm(struct gk20a *g, @@ -5399,6 +5407,7 @@ int gk20a_gr_isr(struct gk20a *g) u32 obj_table; int need_reset = 0; u32 gr_intr = gk20a_readl(g, gr_intr_r()); + struct channel_gk20a *ch = NULL; gk20a_dbg_fn(""); gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr); @@ -5424,13 +5433,13 @@ int gk20a_gr_isr(struct gk20a *g) gr_fe_object_table_r(isr_data.sub_chan)) : 0; isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table); - isr_data.chid = - gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx, NULL); - if (isr_data.chid == -1) { + ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, NULL); + if (!ch) { gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x", isr_data.curr_ctx); goto clean_up; } + isr_data.chid = ch->hw_chid; gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "channel %d: addr 0x%08x, " @@ -5512,8 +5521,6 @@ int gk20a_gr_isr(struct gk20a *g) if (gr_intr & gr_intr_exception_pending_f()) { u32 exception = gk20a_readl(g, gr_exception_r()); - struct fifo_gk20a *f = &g->fifo; - struct channel_gk20a *ch = &f->channel[isr_data.chid]; gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception); @@ -5572,9 +5579,20 @@ int gk20a_gr_isr(struct gk20a *g) } if (need_reset) - gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true); + gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), + ~(u32)0, false, true); clean_up: + if (gr_intr && !ch) { + /* Clear interrupts for unused channel. 
This is + probably an interrupt during gk20a_free_channel() */ + gk20a_err(dev_from_gk20a(g), + "unhandled gr interrupt 0x%08x for unreferenceable channel, clearing", + gr_intr); + gk20a_writel(g, gr_intr_r(), gr_intr); + gr_intr = 0; + } + gk20a_writel(g, gr_gpfifo_ctl_r(), grfifo_ctl | gr_gpfifo_ctl_access_f(1) | gr_gpfifo_ctl_semaphore_access_f(1)); @@ -5583,6 +5601,9 @@ clean_up: gk20a_err(dev_from_gk20a(g), "unhandled gr interrupt 0x%08x", gr_intr); + if (ch) + gk20a_channel_put(ch); + return 0; } @@ -6670,28 +6691,34 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) { - int curr_gr_chid, curr_gr_ctx, curr_gr_tsgid; + int curr_gr_ctx, curr_gr_tsgid; struct gk20a *g = ch->g; + struct channel_gk20a *curr_ch; + bool ret = false; curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); - curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx, - &curr_gr_tsgid); + curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx, + &curr_gr_tsgid); gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, - "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" - " ch->hw_chid=%d", curr_gr_chid, - curr_gr_tsgid, ch->tsgid, ch->hw_chid); + "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" + " ch->hw_chid=%d", + curr_ch ? curr_ch->hw_chid : -1, + curr_gr_tsgid, + ch->tsgid, + ch->hw_chid); - if (curr_gr_chid == -1) + if (!curr_ch) return false; - if (ch->hw_chid == curr_gr_chid) - return true; + if (ch->hw_chid == curr_ch->hw_chid) + ret = true; if (gk20a_is_channel_marked_as_tsg(ch) && (ch->tsgid == curr_gr_tsgid)) - return true; + ret = true; - return false; + gk20a_channel_put(curr_ch); + return ret; } int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, diff --git a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c index 06b00a25c..0a773d10d 100644 --- a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c @@ -40,6 +40,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g) /* flush previous write */ gk20a_readl(g, mc_intr_en_0_r()); + atomic_inc(&g->hw_irq_stall_count); + trace_mc_gk20a_intr_stall_done(g->dev->name); return IRQ_WAKE_THREAD; @@ -63,18 +65,22 @@ irqreturn_t mc_gk20a_isr_nonstall(struct gk20a *g) /* flush previous write */ gk20a_readl(g, mc_intr_en_1_r()); + atomic_inc(&g->hw_irq_nonstall_count); + return IRQ_WAKE_THREAD; } irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) { u32 mc_intr_0; + int hw_irq_count; gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); trace_mc_gk20a_intr_thread_stall(g->dev->name); mc_intr_0 = gk20a_readl(g, mc_intr_0_r()); + hw_irq_count = atomic_read(&g->hw_irq_stall_count); gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0); @@ -94,12 +100,17 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) if (mc_intr_0 & mc_intr_0_pbus_pending_f()) gk20a_pbus_isr(g); + /* sync handled irq counter before re-enabling interrupts */ + atomic_set(&g->sw_irq_stall_last_handled, hw_irq_count); + gk20a_writel(g, mc_intr_en_0_r(), mc_intr_en_0_inta_hardware_f()); /* flush previous write */ gk20a_readl(g, mc_intr_en_0_r()); + wake_up_all(&g->sw_irq_stall_last_handled_wq); + trace_mc_gk20a_intr_thread_stall_done(g->dev->name); return IRQ_HANDLED; @@ -108,10 +119,12 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) { u32 mc_intr_1; + int hw_irq_count; gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); mc_intr_1 = gk20a_readl(g, mc_intr_1_r()); + hw_irq_count = atomic_read(&g->hw_irq_nonstall_count); 
gk20a_dbg(gpu_dbg_intr, "non-stall intr %08x\n", mc_intr_1); @@ -125,12 +138,17 @@ irqreturn_t mc_gk20a_intr_thread_nonstall(struct gk20a *g) && g->ops.ce2.isr_nonstall) g->ops.ce2.isr_nonstall(g); + /* sync handled irq counter before re-enabling interrupts */ + atomic_set(&g->sw_irq_nonstall_last_handled, hw_irq_count); + gk20a_writel(g, mc_intr_en_1_r(), mc_intr_en_1_inta_hardware_f()); /* flush previous write */ gk20a_readl(g, mc_intr_en_1_r()); + wake_up_all(&g->sw_irq_stall_last_handled_wq); + return IRQ_HANDLED; } diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c index 68a31ecac..23ff86778 100644 --- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c @@ -283,6 +283,9 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) init_runlist(g, f); + INIT_LIST_HEAD(&f->free_chs); + mutex_init(&f->free_chs_mutex); + for (chid = 0; chid < f->num_channels; chid++) { f->channel[chid].userd_cpu_va = f->userd.cpu_va + chid * f->userd_entry_size; @@ -294,7 +297,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) gk20a_init_channel_support(g, chid); } - mutex_init(&f->ch_inuse_mutex); f->deferred_reset_pending = false; mutex_init(&f->deferred_reset_mutex); diff --git a/include/trace/events/gk20a.h b/include/trace/events/gk20a.h index ad738f437..461ff6e8c 100644 --- a/include/trace/events/gk20a.h +++ b/include/trace/events/gk20a.h @@ -140,12 +140,54 @@ DEFINE_EVENT(gk20a, gk20a_mm_g_elpg_flush_locked_done, TP_ARGS(name) ); -TRACE_EVENT(gk20a_channel_update, - TP_PROTO(const void *channel), +DECLARE_EVENT_CLASS(gk20a_channel, + TP_PROTO(int channel), TP_ARGS(channel), - TP_STRUCT__entry(__field(const void *, channel)), + TP_STRUCT__entry(__field(int, channel)), TP_fast_assign(__entry->channel = channel;), - TP_printk("channel=%p", __entry->channel) + TP_printk("ch id %d", __entry->channel) +); +DEFINE_EVENT(gk20a_channel, gk20a_channel_update, + TP_PROTO(int channel), + TP_ARGS(channel) +); +DEFINE_EVENT(gk20a_channel, gk20a_free_channel, + TP_PROTO(int channel), + TP_ARGS(channel) +); +DEFINE_EVENT(gk20a_channel, gk20a_open_new_channel, + TP_PROTO(int channel), + TP_ARGS(channel) +); +DEFINE_EVENT(gk20a_channel, gk20a_release_used_channel, + TP_PROTO(int channel), + TP_ARGS(channel) +); + +DECLARE_EVENT_CLASS(gk20a_channel_getput, + TP_PROTO(int channel, const char *caller), + TP_ARGS(channel, caller), + TP_STRUCT__entry( + __field(int, channel) + __field(const char *, caller) + ), + TP_fast_assign( + __entry->channel = channel; + __entry->caller = caller; + ), + TP_printk("channel %d caller %s", __entry->channel, __entry->caller) +); +DEFINE_EVENT(gk20a_channel_getput, gk20a_channel_get, + TP_PROTO(int channel, const char *caller), + TP_ARGS(channel, caller) +); +DEFINE_EVENT(gk20a_channel_getput, gk20a_channel_put, + TP_PROTO(int channel, const char *caller), + TP_ARGS(channel, caller) +); +DEFINE_EVENT(gk20a_channel_getput, gk20a_channel_put_nofree, + TP_PROTO(int channel, const char *caller), + TP_ARGS(channel, caller) ); TRACE_EVENT(gk20a_push_cmdbuf,
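Reviewer note, not part of the patch: the core change in this series is replacing the per-channel in_use flag with a free list plus a reference count (referenceable, ref_count, ref_obtain_lock). Any code that picks a struct channel_gk20a out of f->channel[] is now expected to take a reference before touching it and to drop it when done; gk20a_free_channel() waits for the count to drain before tearing the channel down. The sketch below is only an illustration of that access pattern — the helper name example_poke_channels() is hypothetical, everything else is taken from the diff above — and mirrors what gk20a_channel_suspend() and gk20a_channel_semaphore_wakeup() do after this change.

/* Illustrative sketch only -- hypothetical helper, not part of the patch. */
static void example_poke_channels(struct gk20a *g)
{
	struct fifo_gk20a *f = &g->fifo;
	struct channel_gk20a *ch;
	u32 chid;

	for (chid = 0; chid < f->num_channels; chid++) {
		/* returns NULL if the channel is dead or being freed */
		ch = gk20a_channel_get(&f->channel[chid]);
		if (!ch)
			continue;

		/* the reference keeps gk20a_free_channel() from completing,
		 * so dereferencing ch is safe until the matching put */
		gk20a_dbg_info("channel %d holds %d references",
			       ch->hw_chid, atomic_read(&ch->ref_count));

		/* every successful get is paired with exactly one put */
		gk20a_channel_put(ch);
	}
}

Note the related design choice visible in free_channel(): freed channels are pushed onto the head of f->free_chs rather than the tail, and ch->g is cleared, so (per the comments in the patch) a stale, non-refcounted channel pointer is more likely to hit a reused or obviously dead channel and make timing-related lifetime bugs visible sooner.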