diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ee62f02a9..9067aae55 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -34,7 +34,10 @@
 #include "hw_ccsr_gk20a.h"
 #include "hw_pbdma_gk20a.h"
 
-static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use);
+static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx);
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g);
+
+#define CTX_DELETE_TIME 1000
 
 static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 {
@@ -67,7 +70,7 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 	cde_ctx->init_cmd_executed = false;
 }
 
-static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx)
+static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
 {
 	struct gk20a *g = cde_ctx->g;
 	struct channel_gk20a *ch = cde_ctx->ch;
@@ -81,23 +84,90 @@ static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx)
 	gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr,
 			 g->gr.compbit_store.size, 1);
 
-	return 0;
+	/* housekeeping on app */
+	list_del(&cde_ctx->list);
+	cde_ctx->g->cde_app.lru_len--;
+	kfree(cde_ctx);
 }
 
-int gk20a_cde_destroy(struct gk20a *g)
+static void gk20a_cde_prepare_ctx_remove(struct gk20a_cde_ctx *cde_ctx)
+{
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+
+	/* permanent contexts do not have a deleter work */
+	if (!cde_ctx->is_temporary)
+		return;
+
+	/* safe to release the mutex here, since the app is deinitialised; a
+	 * deleter work can only be waiting for the mutex, or about to abort */
+	mutex_unlock(&cde_app->mutex);
+
+	/* the deleter can rearm itself */
+	do {
+		cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work);
+	} while (delayed_work_pending(&cde_ctx->ctx_deleter_work));
+
+	mutex_lock(&cde_app->mutex);
+}
+
+static void gk20a_cde_deallocate_contexts(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+	struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
+
+	list_for_each_entry_safe(cde_ctx, cde_ctx_save,
+			&cde_app->cde_ctx_lru, list) {
+		gk20a_cde_prepare_ctx_remove(cde_ctx);
+		gk20a_cde_remove_ctx(cde_ctx);
+	}
+}
+
+void gk20a_cde_stop(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+
+	/* prevent further conversions and delayed works from running */
+	cde_app->initialised = false;
+	/* free all data, empty the list */
+	gk20a_cde_deallocate_contexts(g);
+}
+
+void gk20a_cde_destroy(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int ret, i;
 
 	if (!cde_app->initialised)
-		return 0;
+		return;
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++)
-		ret = gk20a_cde_remove(cde_ctx);
+	mutex_lock(&cde_app->mutex);
+	gk20a_cde_stop(g);
+	mutex_unlock(&cde_app->mutex);
+}
 
-	cde_app->initialised = false;
-	return ret;
+static int gk20a_cde_allocate_contexts(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+	struct gk20a_cde_ctx *cde_ctx;
+	int err = 0;
+	int i;
+
+	for (i = 0; i < NUM_CDE_CONTEXTS; i++) {
+		cde_ctx = gk20a_cde_allocate_context(g);
+		if (IS_ERR(cde_ctx)) {
+			err = PTR_ERR(cde_ctx);
+			goto out;
+		}
+
+		list_add(&cde_ctx->list, &cde_app->cde_ctx_lru);
+		cde_app->lru_len++;
+		if (cde_app->lru_len > cde_app->lru_max_len)
+			cde_app->lru_max_len = cde_app->lru_len;
+	}
+
+	return 0;
+out:
+	gk20a_cde_deallocate_contexts(g);
+	return err;
 }
 
 static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
@@ -591,29 +661,117 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 				    num_entries, flags, fence, fence_out);
 }
 
+static void gk20a_ctx_release(struct gk20a_cde_ctx *cde_ctx)
+{
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+
+	gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx);
+
+	mutex_lock(&cde_app->mutex);
+
+	cde_ctx->in_use = false;
+	list_move(&cde_ctx->list, &cde_app->cde_ctx_lru);
+	cde_app->lru_used--;
+
+	mutex_unlock(&cde_app->mutex);
+}
+
+static void gk20a_cde_ctx_deleter_fn(struct work_struct *work)
+{
+	struct delayed_work *delay_work = to_delayed_work(work);
+	struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
+			struct gk20a_cde_ctx, ctx_deleter_work);
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+	struct platform_device *pdev = cde_ctx->pdev;
+	int err;
+
+	/* someone has just taken it? engine deletion started? */
+	if (cde_ctx->in_use || !cde_app->initialised)
+		return;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: attempting to delete temporary %p", cde_ctx);
+
+	/* this should fail only when shutting down the whole device */
+	err = gk20a_busy(pdev);
+	if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel yet."
+			" rescheduling...")) {
+		schedule_delayed_work(&cde_ctx->ctx_deleter_work,
+			msecs_to_jiffies(CTX_DELETE_TIME));
+		return;
+	}
+
+	/* mark so that nobody else assumes it's free to take */
+	mutex_lock(&cde_app->mutex);
+	if (cde_ctx->in_use || !cde_app->initialised) {
+		gk20a_dbg(gpu_dbg_cde_ctx,
+				"cde: context use raced, not deleting %p",
+				cde_ctx);
+		goto out;
+	}
+	cde_ctx->in_use = true;
+
+	gk20a_cde_remove_ctx(cde_ctx);
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: destroyed %p len=%d use=%d max=%d",
+			cde_ctx, cde_app->lru_len, cde_app->lru_used,
+			cde_app->lru_max_len);
+
+out:
+	mutex_unlock(&cde_app->mutex);
+	gk20a_idle(pdev);
+}
+
 static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int i, ret;
+	struct gk20a_cde_ctx *cde_ctx;
 
-	/* try to find a jobless context */
+	/* try to get a jobless context. list is in lru order */
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		struct channel_gk20a *ch = cde_ctx->ch;
-		bool empty;
+	cde_ctx = list_first_entry(&cde_app->cde_ctx_lru,
+			struct gk20a_cde_ctx, list);
 
-		mutex_lock(&ch->jobs_lock);
-		empty = list_empty(&ch->jobs);
-		mutex_unlock(&ch->jobs_lock);
-
-		if (empty)
-			return cde_ctx;
+	if (!cde_ctx->in_use) {
+		gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+				"cde: got free %p len=%d use=%d max=%d",
+				cde_ctx, cde_app->lru_len, cde_app->lru_used,
+				cde_app->lru_max_len);
+		/* deleter work may be scheduled, but in_use prevents it */
+		cde_ctx->in_use = true;
+		list_move_tail(&cde_ctx->list, &cde_app->cde_ctx_lru);
+		cde_app->lru_used++;
+		return cde_ctx;
 	}
 
-	/* could not find a free one, so allocate dynamically */
+	/* no free contexts, get a temporary one */
 
-	gk20a_warn(&g->dev->dev, "cde: no free contexts, allocating temporary");
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: no free contexts, list len=%d",
+			cde_app->lru_len);
+
+	cde_ctx = gk20a_cde_allocate_context(g);
+	if (IS_ERR(cde_ctx)) {
+		gk20a_warn(&g->dev->dev, "cde: cannot allocate context: %ld",
+				PTR_ERR(cde_ctx));
+		return cde_ctx;
+	}
+
+	cde_ctx->in_use = true;
+	cde_ctx->is_temporary = true;
+	list_add_tail(&cde_ctx->list, &cde_app->cde_ctx_lru);
+	cde_app->lru_used++;
+	cde_app->lru_len++;
+	if (cde_app->lru_len > cde_app->lru_max_len)
+		cde_app->lru_max_len = cde_app->lru_len;
+
+	return cde_ctx;
+}
+
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
+{
+	struct gk20a_cde_ctx *cde_ctx;
+	int ret;
 
 	cde_ctx = kzalloc(sizeof(*cde_ctx), GFP_KERNEL);
 	if (!cde_ctx)
@@ -622,12 +780,19 @@ static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g)
 
 	cde_ctx->g = g;
 	cde_ctx->pdev = g->dev;
-	ret = gk20a_cde_load(cde_ctx, true);
+	ret = gk20a_cde_load(cde_ctx);
 	if (ret) {
-		gk20a_err(&g->dev->dev, "cde: cde load failed on temporary");
+		kfree(cde_ctx);
 		return ERR_PTR(ret);
 	}
 
+	INIT_LIST_HEAD(&cde_ctx->list);
+	cde_ctx->is_temporary = false;
+	cde_ctx->in_use = false;
+	INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work,
+			gk20a_cde_ctx_deleter_fn);
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx);
 	return cde_ctx;
 }
 
@@ -653,8 +818,10 @@ int gk20a_cde_convert(struct gk20a *g,
 	mutex_lock(&cde_app->mutex);
 
 	cde_ctx = gk20a_cde_get_context(g);
-	if (IS_ERR(cde_ctx))
-		return PTR_ERR(cde_ctx);
+	if (IS_ERR(cde_ctx)) {
+		err = PTR_ERR(cde_ctx);
+		goto exit_unlock;
+	}
 
 	/* First, map the buffers to local va */
 
@@ -665,7 +832,7 @@ int gk20a_cde_convert(struct gk20a *g,
 
 	/* map the destination buffer */
 	get_dma_buf(dst); /* a ref for gk20a_vm_map */
-	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
+	dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0,
 				 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
@@ -757,18 +924,17 @@ exit_unlock:
 
 	/* unmap the buffers - channel holds references to them now */
 	if (dst_vaddr)
-		gk20a_vm_unmap(g->cde_app.vm, dst_vaddr);
+		gk20a_vm_unmap(cde_ctx->vm, dst_vaddr);
 
 	mutex_unlock(&cde_app->mutex);
 
 	return err;
 }
 
-static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data)
+static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data)
 {
 	struct gk20a_cde_ctx *cde_ctx = data;
 	bool empty;
-	int err;
 
 	mutex_lock(&ch->jobs_lock);
 	empty = list_empty(&ch->jobs);
@@ -777,19 +943,17 @@ static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data)
 	if (!empty)
 		return;
 
-	/* this should fail only when shutting down the whole device */
-	err = gk20a_busy(cde_ctx->pdev);
-	if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel"
-			", leaking memory"))
-		return;
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx);
 
-	gk20a_cde_remove(cde_ctx);
-	gk20a_idle(cde_ctx->pdev);
+	/* delete temporary contexts later */
+	if (cde_ctx->is_temporary)
+		schedule_delayed_work(&cde_ctx->ctx_deleter_work,
+			msecs_to_jiffies(CTX_DELETE_TIME));
 
-	kfree(cde_ctx);
+	gk20a_ctx_release(cde_ctx);
 }
 
-static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use)
+static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 {
 	struct gk20a *g = cde_ctx->g;
 	const struct firmware *img;
@@ -804,10 +968,8 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use)
 		return -ENOSYS;
 	}
 
-	if (free_after_use)
-		ch = gk20a_open_new_channel_with_cb(g, gk20a_free_ctx_cb, cde_ctx);
-	else
-		ch = gk20a_open_new_channel(g);
+	ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
+			cde_ctx);
 	if (!ch) {
 		gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available");
 		err = -ENOMEM;
@@ -876,8 +1038,7 @@ err_get_gk20a_channel:
 int gk20a_cde_reload(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int err, i;
+	int err;
 
 	if (!cde_app->initialised)
 		return -ENOSYS;
@@ -887,10 +1048,12 @@ int gk20a_cde_reload(struct gk20a *g)
 		return err;
 
 	mutex_lock(&cde_app->mutex);
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		gk20a_cde_remove(cde_ctx);
-		err = gk20a_cde_load(cde_ctx, false);
-	}
+
+	gk20a_cde_stop(g);
+
+	err = gk20a_cde_allocate_contexts(g);
+	if (!err)
+		cde_app->initialised = true;
 
 	mutex_unlock(&cde_app->mutex);
 
@@ -901,39 +1064,28 @@ int gk20a_cde_reload(struct gk20a *g)
 int gk20a_init_cde_support(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int ret, i;
+	int err;
 
 	if (cde_app->initialised)
 		return 0;
 
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init");
+
 	mutex_init(&cde_app->mutex);
 	mutex_lock(&cde_app->mutex);
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		cde_ctx->g = g;
-		cde_ctx->pdev = g->dev;
-		ret = gk20a_cde_load(cde_ctx, false);
-		if (ret)
-			goto err_init_instance;
-	}
+	INIT_LIST_HEAD(&cde_app->cde_ctx_lru);
+	cde_app->lru_len = 0;
+	cde_app->lru_max_len = 0;
+	cde_app->lru_used = 0;
 
-	/* take shadow to the vm for general usage */
-	cde_app->vm = cde_app->cde_ctx->vm;
+	err = gk20a_cde_allocate_contexts(g);
+	if (!err)
+		cde_app->initialised = true;
 
-	cde_app->initialised = true;
 	mutex_unlock(&cde_app->mutex);
-
-	return 0;
-
-err_init_instance:
-
-	/* deinitialise initialised channels */
-	while (i--) {
-		gk20a_cde_remove(cde_ctx);
-		cde_ctx--;
-	}
-	return ret;
+	gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err);
+	return err;
 }
 
 enum cde_launch_patch_offset {
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index e4d4659da..4120dc941 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -242,19 +242,26 @@ struct gk20a_cde_ctx {
 	struct kobj_attribute attr;
 
 	bool init_cmd_executed;
+
+	struct list_head list;
+	bool is_temporary;
+	bool in_use;
+	struct delayed_work ctx_deleter_work;
 };
 
 struct gk20a_cde_app {
 	bool initialised;
 	struct mutex mutex;
-	struct vm_gk20a *vm;
 
-	struct gk20a_cde_ctx cde_ctx[NUM_CDE_CONTEXTS];
+	struct list_head cde_ctx_lru;
+	int lru_len;
+	int lru_max_len;
+	int lru_used;
 
 	u32 shader_parameter;
 };
 
-int gk20a_cde_destroy(struct gk20a *g);
+void gk20a_cde_destroy(struct gk20a *g);
 int gk20a_init_cde_support(struct gk20a *g);
 int gk20a_cde_reload(struct gk20a *g);
 int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst,
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 309a1b083..825cb886d 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -535,6 +535,7 @@ enum gk20a_dbg_categories {
 	gpu_dbg_map = BIT(8), /* mem mappings */
 	gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
 	gpu_dbg_cde = BIT(10), /* cde info messages */
+	gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
 	gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
 };
 
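
Note on the LRU scheme (illustration only, not part of the patch): the patch keeps free
contexts at the front of cde_ctx_lru and busy ones at the tail, so gk20a_cde_get_context()
only has to look at the first entry, and gk20a_ctx_release() moves a finished context back
to the front. The standalone userspace sketch below models just that invariant; the ctx_*
list helpers are hypothetical stand-ins for the kernel's list_move()/list_move_tail(), and
all names are illustrative, not nvgpu APIs.

/* build: cc -std=c99 lru_sketch.c */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	bool in_use;
	bool is_temporary;
	struct ctx *prev, *next;	/* doubly linked LRU list node */
};

struct app {
	struct ctx head;		/* list sentinel */
	int lru_len, lru_used;
};

static void ctx_unlink(struct ctx *c)
{
	c->prev->next = c->next;
	c->next->prev = c->prev;
}

static void ctx_add_front(struct app *a, struct ctx *c)	/* ~list_move */
{
	c->prev = &a->head;
	c->next = a->head.next;
	a->head.next->prev = c;
	a->head.next = c;
}

static void ctx_add_tail(struct app *a, struct ctx *c)	/* ~list_move_tail */
{
	c->next = &a->head;
	c->prev = a->head.prev;
	a->head.prev->next = c;
	a->head.prev = c;
}

/* ~gk20a_cde_get_context(): free contexts are at the front, so checking
 * only the first entry suffices; otherwise fall back to a temporary one */
static struct ctx *get_context(struct app *a)
{
	struct ctx *c = a->head.next;

	if (c != &a->head && !c->in_use) {
		c->in_use = true;
		ctx_unlink(c);
		ctx_add_tail(a, c);	/* busy contexts sink to the tail */
		a->lru_used++;
		return c;
	}

	c = calloc(1, sizeof(*c));
	if (!c)
		return NULL;
	c->in_use = true;
	c->is_temporary = true;	/* the driver frees these via delayed work */
	ctx_add_tail(a, c);
	a->lru_len++;
	a->lru_used++;
	return c;
}

/* ~gk20a_ctx_release(): back to the front, first in line for reuse */
static void release_context(struct app *a, struct ctx *c)
{
	c->in_use = false;
	ctx_unlink(c);
	ctx_add_front(a, c);
	a->lru_used--;
}

int main(void)
{
	struct app a = { .head = { .next = &a.head, .prev = &a.head } };
	struct ctx fixed = { 0 };

	ctx_add_front(&a, &fixed);		/* one preallocated context */
	a.lru_len = 1;

	struct ctx *c1 = get_context(&a);	/* takes the fixed context */
	struct ctx *c2 = get_context(&a);	/* falls back to a temporary */
	release_context(&a, c2);
	release_context(&a, c1);

	printf("len=%d used=%d temp=%d\n",	/* prints: len=2 used=0 temp=1 */
	       a.lru_len, a.lru_used, c2->is_temporary);
	free(c2);				/* c2 is the calloc'd temporary */
	return 0;
}

Because releases always return contexts to the front, a single in_use test on the list
head is enough to answer "is anything free?", which is why the patch can drop the old
per-channel jobs_lock scan.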