diff --git a/drivers/gpu/nvgpu/common/nvgpu_common.c b/drivers/gpu/nvgpu/common/nvgpu_common.c
index 521ccd9d6..16640aa6d 100644
--- a/drivers/gpu/nvgpu/common/nvgpu_common.c
+++ b/drivers/gpu/nvgpu/common/nvgpu_common.c
@@ -39,7 +39,6 @@ static void nvgpu_init_vars(struct gk20a *g)
         nvgpu_mutex_init(&platform->railgate_lock);
         nvgpu_mutex_init(&g->dbg_sessions_lock);
         nvgpu_mutex_init(&g->client_lock);
-        nvgpu_mutex_init(&g->ch_wdt_lock);
         nvgpu_mutex_init(&g->poweroff_lock);

         g->regs_saved = g->regs;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index ef8a3e7d9..6eb1cb068 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include /* need for nvmap.h*/
+#include
 #include
 #include
 #include
@@ -91,8 +92,6 @@ static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
 static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
                                         bool clean_all);
-static void gk20a_channel_cancel_job_clean_up(struct channel_gk20a *c,
-                                        bool wait_for_completion);

 /* allocate GPU channel */
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
@@ -491,7 +490,8 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
         bool released_job_semaphore = false;
         bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(ch);

-        gk20a_channel_cancel_job_clean_up(ch, true);
+        /* synchronize with actual job cleanup */
+        nvgpu_mutex_acquire(&ch->joblist.cleanup_lock);

         /* ensure no fences are pending */
         nvgpu_mutex_acquire(&ch->sync_lock);
@@ -533,10 +533,16 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
         }
         channel_gk20a_joblist_unlock(ch);

+        nvgpu_mutex_release(&ch->joblist.cleanup_lock);
+
         if (released_job_semaphore)
                 wake_up_interruptible_all(&ch->semaphore_wq);

-        gk20a_channel_update(ch, 0);
+        /*
+         * When closing the channel, this scheduled update holds one ref which
+         * is waited for before the free can proceed.
+         */
+        gk20a_channel_update(ch);
 }

 void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt)
@@ -1016,8 +1022,6 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
         ch->update_fn_data = NULL;
         nvgpu_spinlock_release(&ch->update_fn_lock);
         cancel_work_sync(&ch->update_fn_work);
-        cancel_delayed_work_sync(&ch->clean_up.wq);
-        cancel_delayed_work_sync(&ch->timeout.wq);

         /* make sure we don't have deferred interrupts pending that
          * could still touch the channel */
@@ -1345,7 +1349,6 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
         ch->has_timedout = false;
         ch->wdt_enabled = true;
         ch->obj_class = 0;
-        ch->clean_up.scheduled = false;
         ch->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
         ch->timeslice_us = g->timeslice_low_priority_us;

@@ -2075,6 +2078,30 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
         nvgpu_kfree(g);
 }

+static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
+{
+        ch->timeout.gp_get = gk20a_userd_gp_get(ch->g, ch);
+        ch->timeout.running = true;
+        nvgpu_timeout_init(ch->g, &ch->timeout.timer,
+                        gk20a_get_channel_watchdog_timeout(ch),
+                        NVGPU_TIMER_CPU_TIMER);
+}
+
+/**
+ * Start a timeout counter (watchdog) on this channel.
+ *
+ * Trigger a watchdog to recover the channel after the per-platform timeout
+ * duration (but strictly no earlier) if the channel hasn't advanced within
+ * that time.
+ *
+ * If the timeout is already running, do nothing. This should be called when
+ * new jobs are submitted. The timeout will stop when the last tracked job
+ * finishes, making the channel idle.
+ *
+ * The channel's gpfifo read pointer will be used to determine whether the
+ * job is actually stuck at that point. After the timeout duration has
+ * expired, a worker thread will consider the channel stuck and recover it if
+ * no progress is detected.
+ */
 static void gk20a_channel_timeout_start(struct channel_gk20a *ch)
 {
         struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
@@ -2087,94 +2114,108 @@ static void gk20a_channel_timeout_start(struct channel_gk20a *ch)

         nvgpu_raw_spinlock_acquire(&ch->timeout.lock);

-        if (ch->timeout.initialized) {
+        if (ch->timeout.running) {
                 nvgpu_raw_spinlock_release(&ch->timeout.lock);
                 return;
         }
-
-        ch->timeout.gp_get = gk20a_userd_gp_get(ch->g, ch);
-        ch->timeout.initialized = true;
+        __gk20a_channel_timeout_start(ch);
         nvgpu_raw_spinlock_release(&ch->timeout.lock);
-
-        schedule_delayed_work(&ch->timeout.wq,
-                msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch)));
 }

-static void gk20a_channel_timeout_stop(struct channel_gk20a *ch)
+/**
+ * Stop a running timeout counter (watchdog) on this channel.
+ *
+ * Make the watchdog consider the channel not running, so that it won't get
+ * recovered even if no progress is detected. Progress is not tracked if the
+ * watchdog is turned off.
+ *
+ * No guarantees are made about concurrent execution of the timeout handler.
+ * (This should be called from an update handler running in the same thread
+ * as the watchdog.)
+ */
+static bool gk20a_channel_timeout_stop(struct channel_gk20a *ch)
+{
+        bool was_running;
+
+        nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
+        was_running = ch->timeout.running;
+        ch->timeout.running = false;
+        nvgpu_raw_spinlock_release(&ch->timeout.lock);
+        return was_running;
+}
+
+/**
+ * Continue a previously stopped timeout.
+ *
+ * Enable the timeout again but don't reinitialize its timer.
+ *
+ * No guarantees are made about concurrent execution of the timeout handler.
+ * (This should be called from an update handler running in the same thread
+ * as the watchdog.)
+ */
+static void gk20a_channel_timeout_continue(struct channel_gk20a *ch)
 {
         nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
-        if (!ch->timeout.initialized) {
-                nvgpu_raw_spinlock_release(&ch->timeout.lock);
-                return;
-        }
-        nvgpu_raw_spinlock_release(&ch->timeout.lock);
-
-        cancel_delayed_work_sync(&ch->timeout.wq);
-
-        nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
-        ch->timeout.initialized = false;
+        ch->timeout.running = true;
         nvgpu_raw_spinlock_release(&ch->timeout.lock);
 }
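The stop/continue pair above deliberately reports the previous state so that job cleanup can restore exactly what it found. The pattern is easy to exercise outside the kernel; a minimal userspace C sketch, with a pthread mutex standing in for nvgpu's raw spinlock and all names invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct watchdog {
        pthread_mutex_t lock;
        bool running;
        unsigned long deadline; /* untouched by stop/continue */
};

static bool watchdog_stop(struct watchdog *w)
{
        bool was_running;

        pthread_mutex_lock(&w->lock);
        was_running = w->running;
        w->running = false;
        pthread_mutex_unlock(&w->lock);
        return was_running;
}

static void watchdog_continue(struct watchdog *w)
{
        pthread_mutex_lock(&w->lock);
        w->running = true; /* re-arm without resetting the deadline */
        pthread_mutex_unlock(&w->lock);
}

int main(void)
{
        struct watchdog w = { PTHREAD_MUTEX_INITIALIZER, true, 1000 };
        bool was_on = watchdog_stop(&w);

        /* ... process completed jobs without racing the watchdog ... */

        if (was_on) /* only re-arm if we were the ones to pause it */
                watchdog_continue(&w);
        printf("watchdog running again: %d\n", w.running);
        return 0;
}

Returning the old state from stop() is what lets a later "continue" be conditional, as the cleanup path below does with watchdog_on.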

+/**
+ * Rewind the timeout on each non-dormant channel.
+ *
+ * Reschedule the timeout of each active channel for which timeouts are
+ * running, as if something had just happened on each channel. This should be
+ * called when a global hang is detected that could cause a false positive on
+ * other innocent channels.
+ */
 void gk20a_channel_timeout_restart_all_channels(struct gk20a *g)
 {
-        u32 chid;
         struct fifo_gk20a *f = &g->fifo;
+        u32 chid;

         for (chid = 0; chid < f->num_channels; chid++) {
                 struct channel_gk20a *ch = &f->channel[chid];

-                if (gk20a_channel_get(ch)) {
-                        nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
-                        if (!ch->timeout.initialized) {
-                                nvgpu_raw_spinlock_release(&ch->timeout.lock);
-                                gk20a_channel_put(ch);
-                                continue;
-                        }
-                        nvgpu_raw_spinlock_release(&ch->timeout.lock);
+                if (!gk20a_channel_get(ch))
+                        continue;

-                        cancel_delayed_work_sync(&ch->timeout.wq);
-                        if (!ch->has_timedout)
-                                schedule_delayed_work(&ch->timeout.wq,
-                                        msecs_to_jiffies(
-                                        gk20a_get_channel_watchdog_timeout(ch)));
+                nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
+                if (ch->timeout.running)
+                        __gk20a_channel_timeout_start(ch);
+                nvgpu_raw_spinlock_release(&ch->timeout.lock);

-                        gk20a_channel_put(ch);
-                }
+                gk20a_channel_put(ch);
         }
 }

-static void gk20a_channel_timeout_handler(struct work_struct *work)
+/**
+ * Check if a timed out channel has hung and recover it if it has.
+ *
+ * Test whether this channel has really gotten stuck at this point (this
+ * should be called when the watchdog timer has expired) by checking whether
+ * its gp_get has advanced. If gp_get has not moved since the watchdog was
+ * started, force-reset the channel.
+ *
+ * The GPU is implicitly powered on at this point, because the watchdog can
+ * only run on channels that have submitted jobs pending for cleanup.
+ */
+static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 {
+        struct gk20a *g = ch->g;
         u32 gp_get;
-        struct gk20a *g;
-        struct channel_gk20a *ch;

-        ch = container_of(to_delayed_work(work), struct channel_gk20a,
-                        timeout.wq);
-        ch = gk20a_channel_get(ch);
-        if (!ch)
-                return;
+        gk20a_dbg_fn("");

-        g = ch->g;
-
-        if (gk20a_busy(dev_from_gk20a(g))) {
-                gk20a_channel_put(ch);
-                return;
-        }
-
-        /* Need global lock since multiple channels can timeout at a time */
-        nvgpu_mutex_acquire(&g->ch_wdt_lock);
-
-        /* Get timed out job and reset the timer */
+        /* Get status and clear the timer */
         nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
         gp_get = ch->timeout.gp_get;
-        ch->timeout.initialized = false;
+        ch->timeout.running = false;
         nvgpu_raw_spinlock_release(&ch->timeout.lock);

         if (gk20a_userd_gp_get(ch->g, ch) != gp_get) {
+                /* Channel has advanced, reschedule */
                 gk20a_channel_timeout_start(ch);
-                goto fail_unlock;
+                return;
         }

         gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out",
@@ -2185,11 +2226,262 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)

         g->ops.fifo.force_reset_ch(ch, NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT, true);
+}

-fail_unlock:
-        nvgpu_mutex_release(&g->ch_wdt_lock);
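The handler's hung-channel test reduces to comparing a snapshot of the gpfifo read pointer against its live value: if the pointer moved, the channel made progress and the watchdog simply re-arms. A self-contained C sketch of that progress check, using a made-up channel_sim type rather than the driver's real structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct channel_sim {
        uint32_t gp_get;       /* live read pointer */
        uint32_t wdt_snapshot; /* value when the watchdog was armed */
};

static void wdt_arm(struct channel_sim *ch)
{
        ch->wdt_snapshot = ch->gp_get;
}

/* Returns true if the channel must be recovered. */
static bool wdt_expired_check(struct channel_sim *ch)
{
        if (ch->gp_get != ch->wdt_snapshot) {
                wdt_arm(ch); /* progress was made: re-arm, don't recover */
                return false;
        }
        return true; /* no progress for a full timeout period: hung */
}

int main(void)
{
        struct channel_sim ch = { .gp_get = 10 };

        wdt_arm(&ch);
        ch.gp_get = 12; /* hardware consumed two entries */
        printf("recover? %d\n", wdt_expired_check(&ch)); /* 0 */
        printf("recover? %d\n", wdt_expired_check(&ch)); /* 1: stalled */
        return 0;
}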
+/**
+ * Test if the per-channel timeout has expired and handle it in that case.
+ *
+ * Each channel has an expiration-time-based watchdog. The timer is
+ * (re)initialized in two situations: when a new job is submitted on an idle
+ * channel and when the timeout is checked but progress is detected.
+ *
+ * A watchdog timeout does not yet necessarily mean a stuck channel, so this
+ * may or may not cause recovery.
+ *
+ * The timeout is stopped (disabled) after the last job in a row finishes,
+ * making the channel idle.
+ */
+static void gk20a_channel_timeout_check(struct channel_gk20a *ch)
+{
+        bool timed_out;
+
+        nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
+        timed_out = ch->timeout.running &&
+                nvgpu_timeout_expired(&ch->timeout.timer);
+        nvgpu_raw_spinlock_release(&ch->timeout.lock);
+
+        if (timed_out)
+                gk20a_channel_timeout_handler(ch);
+}
+
+/**
+ * Loop over every living channel, check timeouts and handle stuck channels.
+ */
+static void gk20a_channel_poll_timeouts(struct gk20a *g)
+{
+        unsigned int chid;
+
+        gk20a_dbg_fn("");
+
+        for (chid = 0; chid < g->fifo.num_channels; chid++) {
+                struct channel_gk20a *ch = &g->fifo.channel[chid];
+
+                if (gk20a_channel_get(ch)) {
+                        gk20a_channel_timeout_check(ch);
+                        gk20a_channel_put(ch);
+                }
+        }
+}
+
+/*
+ * Process one scheduled work item for this channel. Currently, the only thing
+ * the worker does is job cleanup handling.
+ */
+static void gk20a_channel_worker_process_ch(struct channel_gk20a *ch)
+{
+        gk20a_dbg_fn("");
+
+        gk20a_channel_clean_up_jobs(ch, true);
+
+        /* ref taken when enqueued */
         gk20a_channel_put(ch);
-        gk20a_idle(dev_from_gk20a(g));
+}
+
+/**
+ * Tell the worker that one more work item needs to be done.
+ *
+ * Increase the work counter to synchronize the worker with the new work. Wake
+ * up the worker. If the worker was already running, it will handle this work
+ * before going to sleep.
+ */
+static int __gk20a_channel_worker_wakeup(struct gk20a *g)
+{
+        int put;
+
+        gk20a_dbg_fn("");
+
+        /*
+         * Currently, the only work type is associated with a lock, which deals
+         * with any necessary barriers. If a work type with no locking were
+         * added, a wmb() would be needed here. See ..worker_pending() for a
+         * pair.
+         */
+
+        put = atomic_inc_return(&g->channel_worker.put);
+        wake_up(&g->channel_worker.wq);
+
+        return put;
+}
+
+/**
+ * Test if there is some work pending.
+ *
+ * This is the pair of __gk20a_channel_worker_wakeup; it is called from the
+ * worker. The worker has an internal work counter which is incremented once
+ * per finished work item. This is compared with the number of queued jobs,
+ * which may be channels on the items list or any other types of work.
+ */
+static bool __gk20a_channel_worker_pending(struct gk20a *g, int get)
+{
+        bool pending = atomic_read(&g->channel_worker.put) != get;
+
+        /*
+         * This would be the place for a rmb() pairing a wmb() for a wakeup
+         * if we had any work with no implicit barriers caused by locking.
+         */
+
+        return pending;
+}
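The put/get counter described in the two comments above is a small event count: the producer side only increments an atomic "put", and the consumer keeps a private "get" of how much it has acknowledged; work is pending while the two differ. A runnable userspace C11 analogue, with a pthread condvar in place of the kernel wait queue (illustrative, not the driver's actual code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int put;
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void worker_wakeup(void)
{
        atomic_fetch_add(&put, 1); /* publish one more unit of work */
        pthread_mutex_lock(&mtx);
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&mtx);
}

static void *worker(void *arg)
{
        int get = 0;

        pthread_mutex_lock(&mtx);
        while (get < 3) { /* stop condition for the demo */
                while (atomic_load(&put) == get)
                        pthread_cond_wait(&cond, &mtx);
                ++get; /* one unit of work acknowledged */
                printf("processed item %d\n", get);
        }
        pthread_mutex_unlock(&mtx);
        return arg;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        for (int i = 0; i < 3; i++)
                worker_wakeup();
        pthread_join(t, NULL);
        return 0;
}

Because the worker re-checks put against get before sleeping, a wakeup that arrives while it is busy is never lost - the same property wait_event_timeout() gives the kernel version.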

+/**
+ * Process the queued work items for the worker thread serially.
+ *
+ * Flush all the work items in the queue one by one. This may block timeout
+ * handling for a short while, as these are serialized.
+ */
+static void gk20a_channel_worker_process(struct gk20a *g, int *get)
+{
+        gk20a_dbg_fn("");
+
+        while (__gk20a_channel_worker_pending(g, *get)) {
+                struct channel_gk20a *ch;
+
+                /*
+                 * If a channel is on the list, it's guaranteed to be handled
+                 * eventually just once. However, the opposite is not true: a
+                 * channel may be under processing whether or not it is on the
+                 * list.
+                 *
+                 * With this, code that processes channel work items should be
+                 * conservative as follows: it's always safe to look at a
+                 * channel found in the list, and if someone enqueues the
+                 * channel, it will be handled eventually, even if it's being
+                 * handled at the same time. A channel is on the list only
+                 * once; multiple calls to enqueue are harmless.
+                 */
+                nvgpu_spinlock_acquire(&g->channel_worker.items_lock);
+                ch = list_first_entry_or_null(&g->channel_worker.items,
+                                struct channel_gk20a,
+                                worker_item);
+                if (ch)
+                        list_del_init(&ch->worker_item);
+                nvgpu_spinlock_release(&g->channel_worker.items_lock);
+
+                if (!ch) {
+                        /*
+                         * Woke up for some other reason, but there are no
+                         * other reasons than a channel added in the items list
+                         * currently, so warn and ack the message.
+                         */
+                        gk20a_warn(g->dev, "Spurious worker event!");
+                        ++*get;
+                        break;
+                }
+
+                gk20a_channel_worker_process_ch(ch);
+                ++*get;
+        }
+}
+
+/*
+ * Look at channel states periodically, until canceled. Abort timed-out
+ * channels serially. Process all work items found in the queue.
+ */
+static int gk20a_channel_poll_worker(void *arg)
+{
+        struct gk20a *g = (struct gk20a *)arg;
+        struct gk20a_channel_worker *worker = &g->channel_worker;
+        unsigned long start_wait;
+        /* event timeout for also polling the watchdog */
+        unsigned long timeout = msecs_to_jiffies(100);
+        int get = 0;
+
+        gk20a_dbg_fn("");
+
+        start_wait = jiffies;
+        while (!kthread_should_stop()) {
+                bool got_events;
+
+                got_events = wait_event_timeout(
+                                worker->wq,
+                                __gk20a_channel_worker_pending(g, get),
+                                timeout) > 0;
+
+                if (got_events)
+                        gk20a_channel_worker_process(g, &get);
+
+                if (jiffies - start_wait >= timeout) {
+                        gk20a_channel_poll_timeouts(g);
+                        start_wait = jiffies;
+                }
+        }
+        return 0;
+}
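Note the "jiffies - start_wait >= timeout" form in the poll loop: unsigned subtraction keeps the comparison correct across tick-counter wraparound, where the naive "jiffies >= start_wait + timeout" would not. A standalone demonstration with a 32-bit counter (illustrative only; the kernel's jiffies is unsigned long):

#include <stdint.h>
#include <stdio.h>

static int period_elapsed(uint32_t now, uint32_t start, uint32_t period)
{
        /* Wrap-safe: unsigned arithmetic is well defined modulo 2^32. */
        return now - start >= period;
}

int main(void)
{
        uint32_t start = UINT32_MAX - 5; /* armed just before wraparound */

        printf("%d\n", period_elapsed(start + 3, start, 10));  /* 0 */
        printf("%d\n", period_elapsed(start + 15, start, 10)); /* 1, despite wrap */
        return 0;
}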

+/**
+ * Initialize the channel worker's metadata and start the background thread.
+ */
+int nvgpu_channel_worker_init(struct gk20a *g)
+{
+        struct task_struct *task;
+
+        atomic_set(&g->channel_worker.put, 0);
+        init_waitqueue_head(&g->channel_worker.wq);
+        INIT_LIST_HEAD(&g->channel_worker.items);
+        nvgpu_spinlock_init(&g->channel_worker.items_lock);
+        task = kthread_run(gk20a_channel_poll_worker, g,
+                        "nvgpu_channel_poll_%s", dev_name(g->dev));
+        if (IS_ERR(task)) {
+                gk20a_err(g->dev, "failed to start channel poller thread");
+                return PTR_ERR(task);
+        }
+        g->channel_worker.poll_task = task;
+
+        return 0;
+}
+
+void nvgpu_channel_worker_deinit(struct gk20a *g)
+{
+        kthread_stop(g->channel_worker.poll_task);
+}
+
+/**
+ * Append a channel to the worker's list, if not there already.
+ *
+ * The worker thread processes work items (channels in its work list) and
+ * periodically polls the channel watchdogs. This adds @ch to the end of the
+ * list and wakes the worker up immediately. If the channel is already on the
+ * list, it's not added again, because in that case it has been scheduled
+ * already but has not yet been processed.
+ */
+void gk20a_channel_worker_enqueue(struct channel_gk20a *ch)
+{
+        struct gk20a *g = ch->g;
+
+        gk20a_dbg_fn("");
+
+        /*
+         * Ref released when this item gets processed. The caller should hold
+         * one ref already, so this can't fail.
+         */
+        if (WARN_ON(!gk20a_channel_get(ch))) {
+                gk20a_warn(g->dev, "cannot get ch ref for worker!");
+                return;
+        }
+
+        nvgpu_spinlock_acquire(&g->channel_worker.items_lock);
+        if (!list_empty(&ch->worker_item)) {
+                /*
+                 * Already queued, so will get processed eventually.
+                 * The worker is probably awake already.
+                 */
+                nvgpu_spinlock_release(&g->channel_worker.items_lock);
+                gk20a_channel_put(ch);
+                return;
+        }
+        list_add_tail(&ch->worker_item, &g->channel_worker.items);
+        nvgpu_spinlock_release(&g->channel_worker.items_lock);
+
+        __gk20a_channel_worker_wakeup(g);
 }

 int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e)
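The enqueue-once test above works because list_del_init() leaves a removed node pointing at itself, so list_empty() on the node itself answers "am I queued?" in O(1). A single-threaded sketch of the same idiom with a hand-rolled intrusive list (the kernel's list.h provides the real one):

#include <stdbool.h>
#include <stdio.h>

struct list_node {
        struct list_node *prev, *next;
};

static void list_init(struct list_node *n) { n->prev = n->next = n; }
static bool list_unqueued(struct list_node *n) { return n->next == n; }

static void list_add_tail_(struct list_node *n, struct list_node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

static void list_del_init_(struct list_node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n); /* key detail: the node points at itself again */
}

static bool enqueue_once(struct list_node *item, struct list_node *queue)
{
        if (!list_unqueued(item))
                return false; /* already scheduled, nothing to do */
        list_add_tail_(item, queue);
        return true;
}

int main(void)
{
        struct list_node queue, ch;

        list_init(&queue);
        list_init(&ch);
        printf("first enqueue:  %d\n", enqueue_once(&ch, &queue)); /* 1 */
        printf("second enqueue: %d\n", enqueue_once(&ch, &queue)); /* 0 */
        list_del_init_(&ch); /* dequeued: eligible to be queued again */
        printf("after dequeue:  %d\n", enqueue_once(&ch, &queue)); /* 1 */
        return 0;
}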
@@ -2214,32 +2506,6 @@ int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e)
         return 0;
 }

-static void gk20a_channel_schedule_job_clean_up(struct channel_gk20a *c)
-{
-        nvgpu_mutex_acquire(&c->clean_up.lock);
-
-        if (c->clean_up.scheduled) {
-                nvgpu_mutex_release(&c->clean_up.lock);
-                return;
-        }
-
-        c->clean_up.scheduled = true;
-        schedule_delayed_work(&c->clean_up.wq, 1);
-
-        nvgpu_mutex_release(&c->clean_up.lock);
-}
-
-static void gk20a_channel_cancel_job_clean_up(struct channel_gk20a *c,
-                                bool wait_for_completion)
-{
-        if (wait_for_completion)
-                cancel_delayed_work_sync(&c->clean_up.wq);
-
-        nvgpu_mutex_acquire(&c->clean_up.lock);
-        c->clean_up.scheduled = false;
-        nvgpu_mutex_release(&c->clean_up.lock);
-}
-
 static int gk20a_channel_add_job(struct channel_gk20a *c,
                                  struct channel_gk20a_job *job,
                                  bool skip_buffer_refcounting)
@@ -2256,7 +2522,10 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
                 return err;
         }

-        /* put() is done in gk20a_channel_update() when the job is done */
+        /*
+         * Ref to hold the channel open during the job lifetime. This is
+         * released by job cleanup launched via syncpt or sema interrupt.
+         */
         c = gk20a_channel_get(c);

         if (c) {
@@ -2291,14 +2560,16 @@ err_put_buffers:
         return err;
 }

-static void gk20a_channel_clean_up_runcb_fn(struct work_struct *work)
-{
-        struct channel_gk20a *c = container_of(to_delayed_work(work),
-                        struct channel_gk20a, clean_up.wq);
-
-        gk20a_channel_clean_up_jobs(c, true);
-}
-
+/**
+ * Clean up job resources for further jobs to use.
+ * @clean_all: If true, process as many jobs as possible, otherwise just one.
+ *
+ * Loop over all jobs from the joblist until a pending job is found, or just
+ * one if clean_all is not set. Pending jobs are detected from the job's post
+ * fence, so this is only done for jobs that have job tracking resources. Free
+ * all per-job memory for completed jobs; in case of preallocated resources,
+ * this opens up slots for new jobs to be submitted.
+ */
 static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
                                         bool clean_all)
 {
@@ -2307,6 +2578,7 @@ static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
         struct gk20a_platform *platform;
         struct gk20a *g;
         int job_finished = 0;
+        bool watchdog_on = false;

         c = gk20a_channel_get(c);
         if (!c)
@@ -2321,13 +2593,25 @@ static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
         g = c->g;
         platform = gk20a_get_platform(g->dev);

-        gk20a_channel_cancel_job_clean_up(c, false);
+        /*
+         * If !clean_all, we're in a condition where the watchdog isn't
+         * supported anyway (this would be a no-op).
+         */
+        if (clean_all)
+                watchdog_on = gk20a_channel_timeout_stop(c);
+
+        /* Synchronize with abort cleanup that needs the jobs. */
+        nvgpu_mutex_acquire(&c->joblist.cleanup_lock);

         while (1) {
                 bool completed;

                 channel_gk20a_joblist_lock(c);
                 if (channel_gk20a_joblist_is_empty(c)) {
+                        /*
+                         * No jobs in flight, timeout will remain stopped until
+                         * new jobs are submitted.
+                         */
                         channel_gk20a_joblist_unlock(c);
                         break;
                 }
@@ -2343,7 +2627,15 @@ static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
                 completed = gk20a_fence_is_expired(job->post_fence);
                 if (!completed) {
-                        gk20a_channel_timeout_start(c);
+                        /*
+                         * The watchdog eventually sees an updated gp_get if
+                         * something happened in this loop. A new job can have
+                         * been submitted between the above call to stop and
+                         * this point - in that case, this is a no-op and the
+                         * newer timeout is still used.
+                         */
+                        if (clean_all && watchdog_on)
+                                gk20a_channel_timeout_continue(c);
                         break;
                 }

@@ -2394,32 +2686,38 @@ static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
                 job_finished = 1;
                 gk20a_idle(g->dev);

-                if (!clean_all)
+                if (!clean_all) {
+                        /* Timeout isn't supported here so don't touch it. */
                         break;
+                }
         }

+        nvgpu_mutex_release(&c->joblist.cleanup_lock);
+
         if (job_finished && c->update_fn)
                 schedule_work(&c->update_fn_work);

         gk20a_channel_put(c);
 }

-void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
+/**
+ * Schedule a job cleanup work on this channel to free resources and to signal
+ * about completion.
+ *
+ * Call this when there has been an interrupt about finished jobs, or when job
+ * cleanup needs to be performed, e.g., when closing a channel. This is always
+ * safe to call even if there is nothing to clean up. Any visible actions on
+ * jobs just before calling this are guaranteed to be processed.
+ */
+void gk20a_channel_update(struct channel_gk20a *c)
 {
-        c = gk20a_channel_get(c);
-        if (!c)
-                return;
-
         if (!c->g->power_on) { /* shutdown case */
-                gk20a_channel_put(c);
                 return;
         }

         trace_gk20a_channel_update(c->hw_chid);
-        gk20a_channel_timeout_stop(c);
-        gk20a_channel_schedule_job_clean_up(c);
-
-        gk20a_channel_put(c);
+        /* A queued channel is always checked for job cleanup. */
+        gk20a_channel_worker_enqueue(c);
 }

 static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
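The reference discipline running through add_job, update and cleanup is: whoever queues asynchronous work takes a channel ref, and the code that eventually runs the work drops it. A userspace C11 sketch of that lifetime rule, with invented names and a simplified get that never fails (the driver's channel_get also refuses dying channels):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
        atomic_int refs;
};

static bool object_get(struct object *o)
{
        atomic_fetch_add(&o->refs, 1);
        return true;
}

static void object_put(struct object *o)
{
        if (atomic_fetch_sub(&o->refs, 1) == 1) {
                printf("last ref dropped, freeing\n");
                free(o);
        }
}

static void process_work(struct object *o)
{
        printf("processing\n");
        object_put(o); /* pairs with the get taken at enqueue time */
}

int main(void)
{
        struct object *o = malloc(sizeof(*o));

        atomic_init(&o->refs, 1); /* caller's own reference */
        object_get(o);            /* enqueue: work holds the object alive */
        object_put(o);            /* caller closes its handle */
        process_work(o);          /* deferred work runs later, frees last ref */
        return 0;
}

This is why gk20a_channel_update() no longer takes its own ref: the worker enqueue takes one, and gk20a_channel_worker_process_ch() drops it when the cleanup has run.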
@@ -2809,7 +3107,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
         if (c->deterministic && need_deferred_cleanup)
                 return -EINVAL;

-        /* gk20a_channel_update releases this ref. */
+        /* released by job cleanup via syncpt or sema interrupt */
         err = gk20a_busy(g->dev);
         if (err) {
                 gk20a_err(d, "failed to host gk20a to submit gpfifo, process %s",
@@ -2929,13 +3227,12 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 #endif
         nvgpu_mutex_init(&c->ioctl_lock);
         nvgpu_mutex_init(&c->error_notifier_mutex);
+        nvgpu_mutex_init(&c->joblist.cleanup_lock);
         nvgpu_spinlock_init(&c->joblist.dynamic.lock);
         nvgpu_mutex_init(&c->joblist.pre_alloc.read_lock);
         nvgpu_raw_spinlock_init(&c->timeout.lock);
         nvgpu_mutex_init(&c->sync_lock);
-        INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
-        INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_runcb_fn);
-        nvgpu_mutex_init(&c->clean_up.lock);
+        INIT_LIST_HEAD(&c->joblist.dynamic.jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
         nvgpu_mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
@@ -2947,6 +3244,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
         nvgpu_mutex_init(&c->dbg_s_lock);
         list_add(&c->free_chs, &g->fifo.free_chs);

+        INIT_LIST_HEAD(&c->worker_item);
+
         return 0;
 }

@@ -3384,8 +3683,6 @@ int gk20a_channel_suspend(struct gk20a *g)
                         gk20a_disable_channel_tsg(g, ch);
                         /* preempt the channel */
                         gk20a_fifo_preempt(g, ch);
-                        gk20a_channel_timeout_stop(ch);
-                        gk20a_channel_cancel_job_clean_up(ch, true);
                         /* wait for channel update notifiers */
                         if (ch->update_fn)
                                 cancel_work_sync(&ch->update_fn_work);
@@ -3481,7 +3778,7 @@ void gk20a_channel_semaphore_wakeup(struct gk20a *g, bool post_events)
                          * semaphore.
                          */
                         if (!c->deterministic)
-                                gk20a_channel_update(c, 0);
+                                gk20a_channel_update(c);
                 }
                 gk20a_channel_put(c);
         }
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 14ee9f69e..d9913cd7b 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -27,6 +27,7 @@

 #include
 #include
+#include

 struct gk20a;
 struct gr_gk20a;
@@ -87,12 +88,19 @@ struct channel_gk20a_joblist {
                 struct list_head jobs;
                 struct nvgpu_spinlock lock;
         } dynamic;
+
+        /*
+         * Synchronize abort cleanup (when closing a channel) and job cleanup
+         * (asynchronously from the worker) - protect from concurrent access
+         * when job resources are being freed.
+         */
+        struct nvgpu_mutex cleanup_lock;
 };

 struct channel_gk20a_timeout {
-        struct delayed_work wq;
         struct nvgpu_raw_spinlock lock;
-        bool initialized;
+        struct nvgpu_timeout timer;
+        bool running;
         u32 gp_get;
 };

@@ -110,12 +118,6 @@ struct gk20a_event_id_data {
         struct list_head event_id_node;
 };

-struct channel_gk20a_clean_up {
-        struct nvgpu_mutex lock;
-        bool scheduled;
-        struct delayed_work wq;
-};
-
 /*
  * Track refcount actions, saving their stack traces. This number specifies how
  * many most recent actions are stored in a buffer. Set to 0 to disable. 128
@@ -214,7 +216,8 @@ struct channel_gk20a {
         u32 timeout_gpfifo_get;

         struct channel_gk20a_timeout timeout;
-        struct channel_gk20a_clean_up clean_up;
+        /* for job cleanup handling in the background worker */
+        struct list_head worker_item;

 #if defined(CONFIG_GK20A_CYCLE_STATS)
         struct {
@@ -250,8 +253,11 @@ struct channel_gk20a {
         u64 virt_ctx;
 #endif

-        /* signal channel owner via a callback, if set, in gk20a_channel_update
-         * via schedule_work */
+        /*
+         * Signal channel owner via a callback, if set, in job cleanup with
+         * schedule_work. Means that something finished on the channel (perhaps
+         * more than one job).
+         */
         void (*update_fn)(struct channel_gk20a *, void *);
         void *update_fn_data;
         struct nvgpu_spinlock update_fn_lock; /* make access to the two above atomic */
@@ -293,6 +299,9 @@ int gk20a_disable_channel_tsg(struct gk20a *g, struct channel_gk20a *ch);
 int gk20a_channel_suspend(struct gk20a *g);
 int gk20a_channel_resume(struct gk20a *g);

+int nvgpu_channel_worker_init(struct gk20a *g);
+void nvgpu_channel_worker_deinit(struct gk20a *g);
+
 /* Channel file operations */
 int gk20a_channel_open(struct inode *inode, struct file *filp);
 int gk20a_channel_open_ioctl(struct gk20a *g,
@@ -302,7 +311,7 @@ long gk20a_channel_ioctl(struct file *filp,
                          unsigned long arg);
 int gk20a_channel_release(struct inode *inode, struct file *filp);
 struct channel_gk20a *gk20a_get_channel_from_file(int fd);
-void gk20a_channel_update(struct channel_gk20a *c, int nr_completed);
+void gk20a_channel_update(struct channel_gk20a *c);

 void gk20a_init_channel(struct gpu_ops *gops);

diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 097635a7a..0aa202c5e 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -179,7 +179,7 @@ static void gk20a_channel_syncpt_update(void *priv, int nr_completed)
 {
         struct channel_gk20a *ch = priv;

-        gk20a_channel_update(ch, nr_completed);
+        gk20a_channel_update(ch);

         /* note: channel_get() is in __gk20a_channel_syncpt_incr() */
         gk20a_channel_put(ch);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 95351a43e..e9eab5513 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -502,6 +502,7 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)

         gk20a_dbg_fn("");

+        nvgpu_channel_worker_deinit(g);
         /*
          * Make sure all channels are closed before deleting them.
          */
@@ -900,6 +901,9 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
         }
         nvgpu_mutex_init(&f->tsg_inuse_mutex);

+        err = nvgpu_channel_worker_init(g);
+        if (err)
+                goto clean_up;
         f->remove_support = gk20a_remove_fifo_support;

         f->deferred_reset_pending = false;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index c30a8eaf6..c79cc6c87 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -874,8 +874,6 @@ struct gk20a {
         bool timeouts_enabled;
 #endif

-        struct nvgpu_mutex ch_wdt_lock;
-
         struct nvgpu_mutex poweroff_lock;

         /* Channel priorities */
@@ -1008,6 +1006,14 @@ struct gk20a {
         atomic_t sw_irq_nonstall_last_handled;
         wait_queue_head_t sw_irq_nonstall_last_handled_wq;

+        struct gk20a_channel_worker {
+                struct task_struct *poll_task;
+                atomic_t put;
+                wait_queue_head_t wq;
+                struct list_head items;
+                struct nvgpu_spinlock items_lock;
+        } channel_worker;
+
         struct devfreq *devfreq;
         struct gk20a_scale_profile *scale_profile;

diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index 7b6ed3226..027a92fca 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -308,6 +308,10 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
         }
         nvgpu_mutex_init(&f->tsg_inuse_mutex);

+        err = nvgpu_channel_worker_init(g);
+        if (err)
+                goto clean_up;
+
         f->deferred_reset_pending = false;
         nvgpu_mutex_init(&f->deferred_reset_mutex);

diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index 4c88ab967..c8ab23f1a 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -252,7 +252,6 @@ static int vgpu_init_support(struct platform_device *pdev)

         nvgpu_mutex_init(&g->dbg_sessions_lock);
         nvgpu_mutex_init(&g->client_lock);
-        nvgpu_mutex_init(&g->ch_wdt_lock);

         g->dbg_regops_tmp_buf = kzalloc(SZ_4K, GFP_KERNEL);
         if (!g->dbg_regops_tmp_buf) {
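The error handling in nvgpu_channel_worker_init() follows the kernel convention that kthread_run() reports failure through a pointer-encoded errno (it never returns NULL), checked with IS_ERR() and decoded with PTR_ERR(). A freestanding imitation of that convention, for reference only - the real macros live in the kernel's err.h:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
        /* Error values live in the top page of the address space. */
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

static void *start_thread(int fail)
{
        static int dummy_task;

        return fail ? ERR_PTR(-ENOMEM) : &dummy_task;
}

int main(void)
{
        void *task = start_thread(1);

        if (IS_ERR(task)) {
                printf("failed to start thread: %ld\n", PTR_ERR(task));
                return (int)-PTR_ERR(task);
        }
        return 0;
}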