diff --git a/drivers/gpu/nvgpu/common/linux/dma.c b/drivers/gpu/nvgpu/common/linux/dma.c
index b62c4593f..9e9d10073 100644
--- a/drivers/gpu/nvgpu/common/linux/dma.c
+++ b/drivers/gpu/nvgpu/common/linux/dma.c
@@ -514,7 +514,6 @@ static void nvgpu_dma_free_sys(struct gk20a *g, struct nvgpu_mem *mem)
 static void nvgpu_dma_free_vid(struct gk20a *g, struct nvgpu_mem *mem)
 {
 #if defined(CONFIG_GK20A_VIDMEM)
-	bool was_empty;
 	size_t mem_size = mem->size;
 
 	dma_dbg_free(g, mem->size, mem->priv.flags, "vidmem");
@@ -523,18 +522,19 @@ static void nvgpu_dma_free_vid(struct gk20a *g, struct nvgpu_mem *mem)
 	WARN_ON(mem->priv.flags != NVGPU_DMA_NO_KERNEL_MAPPING);
 
 	if (mem->mem_flags & NVGPU_MEM_FLAG_USER_MEM) {
-		nvgpu_mutex_acquire(&g->mm.vidmem.clear_list_mutex);
-		was_empty = nvgpu_list_empty(&g->mm.vidmem.clear_list_head);
-		nvgpu_list_add_tail(&mem->clear_list_entry,
-				&g->mm.vidmem.clear_list_head);
-		atomic64_add(mem->aligned_size,
-				&g->mm.vidmem.bytes_pending.atomic_var);
-		nvgpu_mutex_release(&g->mm.vidmem.clear_list_mutex);
+		int err = nvgpu_vidmem_clear_list_enqueue(g, mem);
 
-		if (was_empty) {
-			cancel_work_sync(&g->mm.vidmem.clear_mem_worker);
-			schedule_work(&g->mm.vidmem.clear_mem_worker);
-		}
+		/*
+		 * If there's an error here then that means we can't clear the
+		 * vidmem. That's too bad; however, we still own the nvgpu_mem
+		 * buf so we have to free that.
+		 *
+		 * We don't need to worry about the vidmem allocator itself
+		 * since when that gets cleaned up in the driver shutdown path
+		 * all the outstanding allocs are force freed.
+		 */
+		if (err)
+			nvgpu_kfree(g, mem);
 	} else {
 		nvgpu_memset(g, mem, 0, 0, mem->aligned_size);
 		nvgpu_free(mem->allocator,
diff --git a/drivers/gpu/nvgpu/common/linux/vidmem.c b/drivers/gpu/nvgpu/common/linux/vidmem.c
index ea8e552f7..92e7e5043 100644
--- a/drivers/gpu/nvgpu/common/linux/vidmem.c
+++ b/drivers/gpu/nvgpu/common/linux/vidmem.c
@@ -84,6 +84,8 @@ static void gk20a_vidbuf_release(struct dma_buf *dmabuf)
 
 	nvgpu_kfree(g, linux_buf);
 	nvgpu_vidmem_buf_free(g, buf);
+
+	gk20a_put(g);
 }
 
 static void *gk20a_vidbuf_kmap(struct dma_buf *dmabuf, unsigned long page_num)
@@ -160,13 +162,21 @@ struct gk20a *nvgpu_vidmem_buf_owner(struct dma_buf *dmabuf)
 
 int nvgpu_vidmem_export_linux(struct gk20a *g, size_t bytes)
 {
-	struct nvgpu_vidmem_buf *buf;
+	struct nvgpu_vidmem_buf *buf = NULL;
 	struct nvgpu_vidmem_linux *priv;
 	int err, fd;
 
+	/*
+	 * This ref is released when the dma_buf is closed.
+	 */
+	if (!gk20a_get(g))
+		return -ENODEV;
+
 	priv = nvgpu_kzalloc(g, sizeof(*priv));
-	if (!priv)
-		return -ENOMEM;
+	if (!priv) {
+		err = -ENOMEM;
+		goto fail;
+	}
 
 	buf = nvgpu_vidmem_user_alloc(g, bytes);
 	if (!buf) {
@@ -195,8 +205,10 @@ int nvgpu_vidmem_export_linux(struct gk20a *g, size_t bytes)
 
 	return fd;
 
 fail:
-	nvgpu_kfree(g, priv);
 	nvgpu_vidmem_buf_free(g, buf);
+	nvgpu_kfree(g, priv);
+	gk20a_put(g);
+
 	return err;
 }
@@ -229,24 +241,9 @@ int nvgpu_vidmem_buf_access_memory(struct gk20a *g, struct dma_buf *dmabuf,
 	return err;
 }
 
-void nvgpu_vidmem_clear_mem_worker(struct work_struct *work)
+void __nvgpu_mem_free_vidmem_alloc(struct gk20a *g, struct nvgpu_mem *vidmem)
 {
-	struct mm_gk20a *mm = container_of(work, struct mm_gk20a,
-					vidmem.clear_mem_worker);
-	struct gk20a *g = mm->g;
-	struct nvgpu_mem *mem;
-
-	while ((mem = nvgpu_vidmem_get_pending_alloc(mm)) != NULL) {
-		nvgpu_vidmem_clear(g, mem);
-		nvgpu_free(mem->allocator,
-			(u64)nvgpu_vidmem_get_page_alloc(mem->priv.sgt->sgl));
-		nvgpu_free_sgtable(g, &mem->priv.sgt);
-
-		WARN_ON(nvgpu_atomic64_sub_return(mem->aligned_size,
-					&g->mm.vidmem.bytes_pending) < 0);
-		mem->size = 0;
-		mem->aperture = APERTURE_INVALID;
-
-		nvgpu_kfree(g, mem);
-	}
+	nvgpu_free(vidmem->allocator,
+		(u64)nvgpu_vidmem_get_page_alloc(vidmem->priv.sgt->sgl));
+	nvgpu_free_sgtable(g, &vidmem->priv.sgt);
 }
diff --git a/drivers/gpu/nvgpu/common/mm/vidmem.c b/drivers/gpu/nvgpu/common/mm/vidmem.c
index d1c5a2e80..60b819d78 100644
--- a/drivers/gpu/nvgpu/common/mm/vidmem.c
+++ b/drivers/gpu/nvgpu/common/mm/vidmem.c
@@ -22,15 +22,55 @@
 
 #include
 
+#include
 #include
 #include
 #include
+#include
 
 #include "gk20a/gk20a.h"
 #include "gk20a/mm_gk20a.h"
 
+/*
+ * This is expected to be called from the shutdown path (or the error path in
+ * the vidmem init code). As such we do not expect new vidmem frees to be
+ * enqueued.
+ */
 void nvgpu_vidmem_destroy(struct gk20a *g)
 {
+	struct nvgpu_timeout timeout;
+
+	nvgpu_timeout_init(g, &timeout, 100, NVGPU_TIMER_RETRY_TIMER);
+
+	/*
+	 * Ensure that the thread runs one last time to flush anything in the
+	 * queue.
+	 */
+	nvgpu_cond_signal_interruptible(&g->mm.vidmem.clearing_thread_cond);
+
+	/*
+	 * Wait for at most 1 second before just continuing on. It doesn't make
+	 * sense to hang the system over some potential memory leaks.
+	 */
+	do {
+		bool empty;
+
+		nvgpu_mutex_acquire(&g->mm.vidmem.clear_list_mutex);
+		empty = nvgpu_list_empty(&g->mm.vidmem.clear_list_head);
+		nvgpu_mutex_release(&g->mm.vidmem.clear_list_mutex);
+
+		if (empty)
+			break;
+
+		nvgpu_msleep(10);
+	} while (!nvgpu_timeout_expired(&timeout));
+
+	/*
+	 * Kill the vidmem clearing thread now. This will wake the thread up
+	 * automatically and cause the wait_interruptible condition to trigger.
+	 */
+	nvgpu_thread_stop(&g->mm.vidmem.clearing_thread);
+
 	if (nvgpu_alloc_initialized(&g->mm.vidmem.allocator))
 		nvgpu_alloc_destroy(&g->mm.vidmem.allocator);
 }
@@ -107,6 +147,139 @@ static int __nvgpu_vidmem_do_clear_all(struct gk20a *g)
 	return 0;
 }
 
+void nvgpu_vidmem_thread_pause_sync(struct mm_gk20a *mm)
+{
+	/*
+	 * On the first increment of the pause_count (0 -> 1) take the pause
+	 * lock and prevent the vidmem clearing thread from processing work
+	 * items.
+	 *
+	 * Otherwise the increment is all that's needed - it's essentially a
+	 * ref-count for the number of pause() calls.
+	 *
+	 * The sync component is implemented by waiting for the lock to be
+	 * released by the clearing thread in case the thread is currently
+	 * processing work items.
+	 */
+	if (nvgpu_atomic_inc_return(&mm->vidmem.pause_count) == 1)
+		nvgpu_mutex_acquire(&mm->vidmem.clearing_thread_lock);
+}
+
+void nvgpu_vidmem_thread_unpause(struct mm_gk20a *mm)
+{
+	/*
+	 * And on the last decrement (1 -> 0) release the pause lock and let
+	 * the vidmem clearing thread continue.
+	 */
+	if (nvgpu_atomic_dec_return(&mm->vidmem.pause_count) == 0)
+		nvgpu_mutex_release(&mm->vidmem.clearing_thread_lock);
+}
+
+int nvgpu_vidmem_clear_list_enqueue(struct gk20a *g, struct nvgpu_mem *mem)
+{
+	struct mm_gk20a *mm = &g->mm;
+
+	/*
+	 * Crap. Can't enqueue new vidmem bufs! CE may be gone!
+	 *
+	 * However, an errant app can hold a vidmem dma_buf FD open past when
+	 * the nvgpu driver has exited. Thus when the FD does get closed
+	 * eventually the dma_buf release function will try to call the vidmem
+	 * free function which will attempt to enqueue the vidmem into the
+	 * vidmem clearing thread.
+	 */
+	if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
+		return -ENOSYS;
+
+	nvgpu_mutex_acquire(&mm->vidmem.clear_list_mutex);
+	nvgpu_list_add_tail(&mem->clear_list_entry,
+			&mm->vidmem.clear_list_head);
+	nvgpu_atomic64_add(mem->aligned_size, &mm->vidmem.bytes_pending);
+	nvgpu_mutex_release(&mm->vidmem.clear_list_mutex);
+
+	nvgpu_cond_signal_interruptible(&mm->vidmem.clearing_thread_cond);
+
+	return 0;
+}
+
+static struct nvgpu_mem *nvgpu_vidmem_clear_list_dequeue(struct mm_gk20a *mm)
+{
+	struct nvgpu_mem *mem = NULL;
+
+	nvgpu_mutex_acquire(&mm->vidmem.clear_list_mutex);
+	if (!nvgpu_list_empty(&mm->vidmem.clear_list_head)) {
+		mem = nvgpu_list_first_entry(&mm->vidmem.clear_list_head,
+				nvgpu_mem, clear_list_entry);
+		nvgpu_list_del(&mem->clear_list_entry);
+	}
+	nvgpu_mutex_release(&mm->vidmem.clear_list_mutex);
+
+	return mem;
+}
+
+static void nvgpu_vidmem_clear_pending_allocs(struct mm_gk20a *mm)
+{
+	struct gk20a *g = mm->g;
+	struct nvgpu_mem *mem;
+
+	while ((mem = nvgpu_vidmem_clear_list_dequeue(mm)) != NULL) {
+		nvgpu_vidmem_clear(g, mem);
+
+		WARN_ON(nvgpu_atomic64_sub_return(mem->aligned_size,
+					&g->mm.vidmem.bytes_pending) < 0);
+		mem->size = 0;
+		mem->aperture = APERTURE_INVALID;
+
+		__nvgpu_mem_free_vidmem_alloc(g, mem);
+		nvgpu_kfree(g, mem);
+	}
+}
+
+static int nvgpu_vidmem_clear_pending_allocs_thr(void *mm_ptr)
+{
+	struct mm_gk20a *mm = mm_ptr;
+
+	/*
+	 * Simple thread whose sole job is to periodically clear userspace
+	 * vidmem allocations that have been recently freed.
+	 *
+	 * Since it doesn't make sense to run unless there's pending work a
+	 * condition field is used to wait for work. When the DMA API frees a
+	 * userspace vidmem buf it enqueues it into the clear list and alerts us
+	 * that we have some work to do.
+	 */
+
+	while (!nvgpu_thread_should_stop(&mm->vidmem.clearing_thread)) {
+		int ret;
+
+		/*
+		 * Wait for work but also make sure we should not be paused.
+		 */
+		ret = NVGPU_COND_WAIT_INTERRUPTIBLE(
+			&mm->vidmem.clearing_thread_cond,
+			nvgpu_thread_should_stop(
+				&mm->vidmem.clearing_thread) ||
+			!nvgpu_list_empty(&mm->vidmem.clear_list_head),
+			0);
+		if (ret == -ERESTARTSYS)
+			continue;
+
+		/*
+		 * Use this lock to implement a pause mechanism. By taking this
+		 * lock some other code can prevent this thread from processing
+		 * work items.
+		 */
+		if (!nvgpu_mutex_tryacquire(&mm->vidmem.clearing_thread_lock))
+			continue;
+
+		nvgpu_vidmem_clear_pending_allocs(mm);
+
+		nvgpu_mutex_release(&mm->vidmem.clearing_thread_lock);
+	}
+
+	return 0;
+}
+
 int nvgpu_vidmem_init(struct mm_gk20a *mm)
 {
 	struct gk20a *g = mm->g;
@@ -156,16 +329,39 @@ int nvgpu_vidmem_init(struct mm_gk20a *mm)
 	mm->vidmem.bootstrap_base = bootstrap_base;
 	mm->vidmem.bootstrap_size = bootstrap_size;
 
-	nvgpu_mutex_init(&mm->vidmem.first_clear_mutex);
+	err = nvgpu_cond_init(&mm->vidmem.clearing_thread_cond);
+	if (err)
+		goto fail;
 
-	INIT_WORK(&mm->vidmem.clear_mem_worker, nvgpu_vidmem_clear_mem_worker);
 	nvgpu_atomic64_set(&mm->vidmem.bytes_pending, 0);
 	nvgpu_init_list_node(&mm->vidmem.clear_list_head);
 	nvgpu_mutex_init(&mm->vidmem.clear_list_mutex);
+	nvgpu_mutex_init(&mm->vidmem.clearing_thread_lock);
+	nvgpu_atomic_set(&mm->vidmem.pause_count, 0);
+
+	/*
+	 * Start the thread off in the paused state. The thread doesn't have to
+	 * be running for this to work. It will be woken up later on in
+	 * finalize_poweron(). We won't necessarily have a CE context yet
+	 * either, so hypothetically one could cause a race where we try to
+	 * clear a vidmem struct before we have a CE context to do so.
+	 */
+	nvgpu_vidmem_thread_pause_sync(mm);
+
+	err = nvgpu_thread_create(&mm->vidmem.clearing_thread, mm,
+			nvgpu_vidmem_clear_pending_allocs_thr,
+			"vidmem-clear");
+	if (err)
+		goto fail;
 
 	gk20a_dbg_info("registered vidmem: %zu MB", size / SZ_1M);
 
 	return 0;
+
+fail:
+	nvgpu_cond_destroy(&mm->vidmem.clearing_thread_cond);
+	nvgpu_vidmem_destroy(g);
+	return err;
 }
 
 int nvgpu_vidmem_get_space(struct gk20a *g, u64 *space)
@@ -244,21 +440,6 @@ int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem)
 	return err;
 }
 
-struct nvgpu_mem *nvgpu_vidmem_get_pending_alloc(struct mm_gk20a *mm)
-{
-	struct nvgpu_mem *mem = NULL;
-
-	nvgpu_mutex_acquire(&mm->vidmem.clear_list_mutex);
-	if (!nvgpu_list_empty(&mm->vidmem.clear_list_head)) {
-		mem = nvgpu_list_first_entry(&mm->vidmem.clear_list_head,
-				nvgpu_mem, clear_list_entry);
-		nvgpu_list_del(&mem->clear_list_entry);
-	}
-	nvgpu_mutex_release(&mm->vidmem.clear_list_mutex);
-
-	return mem;
-}
-
 static int nvgpu_vidmem_clear_all(struct gk20a *g)
 {
 	int err;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index e1bf2b4b6..02baf6838 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -97,8 +98,6 @@ int gk20a_prepare_poweroff(struct gk20a *g)
 	if (gk20a_fifo_is_engine_busy(g))
 		return -EBUSY;
 
-	gk20a_ce_suspend(g);
-
 	ret = gk20a_channel_suspend(g);
 	if (ret)
 		return ret;
@@ -111,6 +110,8 @@ int gk20a_prepare_poweroff(struct gk20a *g)
 	ret |= gk20a_mm_suspend(g);
 	ret |= gk20a_fifo_suspend(g);
 
+	gk20a_ce_suspend(g);
+
 	/* Disable GPCPLL */
 	if (g->ops.clk.suspend_clk_support)
 		ret |= g->ops.clk.suspend_clk_support(g);
@@ -323,6 +324,8 @@ int gk20a_finalize_poweron(struct gk20a *g)
 		}
 	}
 
+	nvgpu_vidmem_thread_unpause(&g->mm);
+
 #if defined(CONFIG_TEGRA_GK20A_NVHOST) && defined(CONFIG_TEGRA_19x_GPU)
 	if (gk20a_platform_has_syncpoints(g) && g->syncpt_unit_size) {
 		if (!nvgpu_mem_is_valid(&g->syncpt_mem)) {
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 687951a9b..67ab307f1 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -978,9 +978,7 @@ int gk20a_mm_suspend(struct gk20a *g)
 {
 	gk20a_dbg_fn("");
 
-#if defined(CONFIG_GK20A_VIDMEM)
-	cancel_work_sync(&g->mm.vidmem.clear_mem_worker);
-#endif
+	nvgpu_vidmem_thread_pause_sync(&g->mm);
 
 	g->ops.mm.cbc_clean(g);
 	g->ops.mm.l2_flush(g, false);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 556cb234e..13698cd7c 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -36,6 +36,8 @@
 #include
 #include
 #include
+#include
+#include
 
 struct nvgpu_pd_cache;
 
@@ -272,7 +274,11 @@ struct mm_gk20a {
 		struct nvgpu_list_node clear_list_head;
 		struct nvgpu_mutex clear_list_mutex;
 
-		struct work_struct clear_mem_worker;
+		struct nvgpu_cond clearing_thread_cond;
+		struct nvgpu_thread clearing_thread;
+		struct nvgpu_mutex clearing_thread_lock;
+		nvgpu_atomic_t pause_count;
+
 		nvgpu_atomic64_t bytes_pending;
 	} vidmem;
 };
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
index 537409a82..6feacff72 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
@@ -272,6 +272,16 @@ int nvgpu_mem_create_from_mem(struct gk20a *g,
 			      struct nvgpu_mem *dest, struct nvgpu_mem *src,
 			      int start_page, int nr_pages);
 
+/*
+ * Really free a vidmem buffer. There's a fair amount of work involved in
+ * freeing vidmem buffers in the DMA API. This handles none of that - it only
+ * frees the underlying vidmem-specific structures used in vidmem buffers.
+ *
+ * This is implemented in the OS-specific code. If it's not necessary it can
+ * be a noop. But the symbol must at least be present.
+ */
+void __nvgpu_mem_free_vidmem_alloc(struct gk20a *g, struct nvgpu_mem *vidmem);
+
 /*
  * Buffer accessors - wrap between begin() and end() if there is no permanent
  * kernel mapping for this buffer.
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vidmem.h b/drivers/gpu/nvgpu/include/nvgpu/vidmem.h
index 9e9f83013..690f81640 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vidmem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vidmem.h
@@ -73,17 +73,19 @@ struct nvgpu_vidmem_buf *nvgpu_vidmem_user_alloc(struct gk20a *g, size_t bytes);
 
 void nvgpu_vidmem_buf_free(struct gk20a *g, struct nvgpu_vidmem_buf *buf);
 
+int nvgpu_vidmem_clear_list_enqueue(struct gk20a *g, struct nvgpu_mem *mem);
+
 bool nvgpu_addr_is_vidmem_page_alloc(u64 addr);
 int nvgpu_vidmem_get_space(struct gk20a *g, u64 *space);
 
-struct nvgpu_mem *nvgpu_vidmem_get_pending_alloc(struct mm_gk20a *mm);
-
 void nvgpu_vidmem_destroy(struct gk20a *g);
 int nvgpu_vidmem_init(struct mm_gk20a *mm);
 
-void nvgpu_vidmem_clear_mem_worker(struct work_struct *work);
 int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem);
 
+void nvgpu_vidmem_thread_pause_sync(struct mm_gk20a *mm);
+void nvgpu_vidmem_thread_unpause(struct mm_gk20a *mm);
+
 #else /* !defined(CONFIG_GK20A_VIDMEM) */
 
 /*
@@ -110,11 +112,6 @@ static inline int nvgpu_vidmem_get_space(struct gk20a *g, u64 *space)
 	return -ENOSYS;
 }
 
-static inline struct nvgpu_mem *nvgpu_vidmem_get_pending_alloc(struct mm_gk20a *mm)
-{
-	return NULL;
-}
-
 static inline void nvgpu_vidmem_destroy(struct gk20a *g)
 {
 }
@@ -135,6 +132,14 @@ static inline int nvgpu_vidmem_clear(struct gk20a *g,
 	return -ENOSYS;
 }
 
+static inline void nvgpu_vidmem_thread_pause_sync(struct mm_gk20a *mm)
+{
+}
+
+static inline void nvgpu_vidmem_thread_unpause(struct mm_gk20a *mm)
+{
+}
+
 #endif /* !defined(CONFIG_GK20A_VIDMEM) */
 
 #endif /* __NVGPU_VIDMEM_H__ */
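
Note on the pause/unpause pair introduced above: it is a ref-counted freeze around the clearing thread. Only the 0 -> 1 transition takes the lock and only the 1 -> 0 transition releases it, while the thread uses a trylock before each batch of work. A minimal standalone sketch of that pattern is below; the pthread/stdatomic calls and the worker_* names are illustrative stand-ins for this note only, not the nvgpu_mutex/nvgpu_atomic/nvgpu_thread API used in the patch.

/*
 * Sketch of the pause ref-count used by the vidmem clearing thread.
 * Hypothetical names; nvgpu uses its own mutex/atomic/thread wrappers.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t worker_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int pause_count;

/* First pause (0 -> 1) takes the lock; nested calls only bump the count. */
static void worker_pause_sync(void)
{
	if (atomic_fetch_add(&pause_count, 1) == 0)
		pthread_mutex_lock(&worker_lock);
}

/* Last unpause (1 -> 0) drops the lock so the worker may run again. */
static void worker_unpause(void)
{
	if (atomic_fetch_sub(&pause_count, 1) == 1)
		pthread_mutex_unlock(&worker_lock);
}

/* The worker only processes items when it can take the lock. */
static bool worker_try_process(void)
{
	if (pthread_mutex_trylock(&worker_lock) != 0)
		return false;	/* paused - skip this round */

	/* ... clear pending buffers here ... */

	pthread_mutex_unlock(&worker_lock);
	return true;
}

int main(void)
{
	worker_pause_sync();		/* analogous to gk20a_mm_suspend() */
	printf("paused: %d\n", !worker_try_process());
	worker_unpause();		/* analogous to gk20a_finalize_poweron() */
	printf("paused: %d\n", !worker_try_process());
	return 0;
}

Because only the first pause and the last unpause touch the lock, suspend paths can nest (mm suspend, init-time pause, etc.) without deadlocking, which is the same property the patch relies on when nvgpu_vidmem_init() starts the thread in the paused state.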