gpu: nvgpu: move CE app logic under CONFIG_NVGPU_DGPU

CE app functionality from nvgpu is non-safe for igpu. CE engines init /reset/cg related functionality is required in safety. Hence move the CE app logic under CONFIG_NVGPU_DGPU flag and update the sources accordingly. JIRA NVGPU-3814 Change-Id: I37aa00b1184baccd5fe569ec315be60ac42dac9b Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2168956 GVS: Gerrit_Virtual_Submit Reviewed-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 09:12:24 +03:00 · 2019-08-06 10:07:21 +05:30
parent 6e2e4d0658
commit 2f95efd8d1
12 changed files with 682 additions and 647 deletions
--- a/arch/nvgpu-common.yaml
+++ b/arch/nvgpu-common.yaml
@@ -31,9 +31,16 @@ bios:

 ce:
  safe: yes
-  gpu: dgpu
  owner: Thomas F
  sources: [ common/ce/ce.c,
+             include/nvgpu/ce.h ]
+  deps:
+
+ce_app:
+  safe: yes
+  gpu: dgpu
+  owner: Thomas F
+  sources: [ common/ce/ce_app.c,
             common/ce/ce_priv.h,
             include/nvgpu/ce.h ]
  deps:
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -40,7 +40,6 @@ ccflags-y += -DCONFIG_NVGPU_GRAPHICS
 ccflags-y += -DCONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
 ccflags-y += -DCONFIG_NVGPU_CHANNEL_TSG_CONTROL
 ccflags-y += -DCONFIG_NVGPU_POWER_PG
-ccflags-y += -DCONFIG_NVGPU_CE
 ccflags-y += -DCONFIG_NVGPU_KERNEL_MODE_SUBMIT
 ccflags-y += -DCONFIG_NVGPU_COMPRESSION
 ccflags-y += -DCONFIG_NVGPU_SIM
@@ -515,6 +514,7 @@ nvgpu-y += \
 	common/fence/fence.o \
 	common/ecc.o \
 	common/ce/ce.o \
+	common/ce/ce_app.o \
 	common/debugger.o

 nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
--- a/drivers/gpu/nvgpu/Makefile.shared.configs
+++ b/drivers/gpu/nvgpu/Makefile.shared.configs
@@ -172,9 +172,6 @@ NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_SIM
 CONFIG_NVGPU_COMPRESSION        := 1
 NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_COMPRESSION

-CONFIG_NVGPU_CE                 := 1
-NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_CE
-
 # Enable non FUSA HALs for normal build
 CONFIG_NVGPU_HAL_NON_FUSA       := 1
 NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_HAL_NON_FUSA
--- a/drivers/gpu/nvgpu/Makefile.sources
+++ b/drivers/gpu/nvgpu/Makefile.sources
@@ -140,6 +140,7 @@ srcs += common/utils/assert.c \
 	common/fifo/pbdma_status.c \
 	common/mc/mc.c \
 	common/rc/rc.c \
+	common/ce/ce.c \
 	hal/init/hal_gv11b.c \
 	hal/init/hal_gv11b_litter.c \
 	hal/init/hal_init.c \
@@ -365,10 +366,6 @@ srcs += hal/regops/regops_gm20b.c \
 endif
 endif

-ifeq ($(CONFIG_NVGPU_CE),1)
-srcs += common/ce/ce.c
-endif
-
 ifeq ($(CONFIG_NVGPU_KERNEL_MODE_SUBMIT),1)
 srcs += common/fifo/submit.c
 endif
@@ -581,6 +578,7 @@ srcs += common/sec2/sec2.c \
 	common/mm/allocators/page_allocator.c \
 	common/mm/vidmem.c \
 	common/pramin.c \
+	common/ce/ce_app.c \
 	hal/mm/mm_gv100.c \
 	hal/mm/mm_tu104.c \
 	hal/mc/mc_gv100.c  \
--- a/drivers/gpu/nvgpu/common/ce/ce.c
+++ b/drivers/gpu/nvgpu/common/ce/ce.c
@@ -23,404 +23,17 @@
 #include <nvgpu/types.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/engines.h>
-#include <nvgpu/os_sched.h>
-#include <nvgpu/channel.h>
-#include <nvgpu/dma.h>
-#include <nvgpu/utils.h>
-#include <nvgpu/fence.h>
 #include <nvgpu/ce.h>
 #include <nvgpu/power_features/cg.h>

-#include "common/ce/ce_priv.h"
-
-static inline u32 nvgpu_ce_get_valid_launch_flags(struct gk20a *g,
-		u32 launch_flags)
-{
-#ifdef CONFIG_NVGPU_DGPU
-	/*
-	 * there is no local memory available,
-	 * don't allow local memory related CE flags
-	 */
-	if (g->mm.vidmem.size == 0ULL) {
-		launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
-			NVGPU_CE_DST_LOCATION_LOCAL_FB);
-	}
-#endif
-	return launch_flags;
-}
-
-int nvgpu_ce_execute_ops(struct gk20a *g,
-		u32 ce_ctx_id,
-		u64 src_buf,
-		u64 dst_buf,
-		u64 size,
-		unsigned int payload,
-		u32 launch_flags,
-		u32 request_operation,
-		u32 submit_flags,
-		struct nvgpu_fence_type **fence_out)
-{
-	int ret = -EPERM;
-	struct nvgpu_ce_app *ce_app = g->ce_app;
-	struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
-	bool found = false;
-	u32 *cmd_buf_cpu_va;
-	u64 cmd_buf_gpu_va = 0UL;
-	u32 method_size;
-	u32 cmd_buf_read_offset;
-	u32 dma_copy_class;
-	struct nvgpu_gpfifo_entry gpfifo;
-	struct nvgpu_channel_fence fence = {0U, 0U};
-	struct nvgpu_fence_type *ce_cmd_buf_fence_out = NULL;
-
-	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
-		goto end;
-	}
-
-	nvgpu_mutex_acquire(&ce_app->app_mutex);
-
-	nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
-			&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
-		if (ce_ctx->ctx_id == ce_ctx_id) {
-			found = true;
-			break;
-		}
-	}
-
-	nvgpu_mutex_release(&ce_app->app_mutex);
-
-	if (!found) {
-		ret = -EINVAL;
-		goto end;
-	}
-
-	if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
-		ret = -ENODEV;
-		goto end;
-	}
-
-	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
-
-	ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS;
-
-	cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
-			(NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF /
-			U32(sizeof(u32))));
-
-	cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
-
-	if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] != NULL) {
-		struct nvgpu_fence_type **prev_post_fence =
-			&ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset];
-
-		ret = nvgpu_fence_wait(g, *prev_post_fence,
-				       nvgpu_get_poll_timeout(g));
-
-		nvgpu_fence_put(*prev_post_fence);
-		*prev_post_fence = NULL;
-		if (ret != 0) {
-			goto noop;
-		}
-	}
-
-	cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va +
-			(u64)(cmd_buf_read_offset * sizeof(u32)));
-
-	dma_copy_class = g->ops.get_litter_value(g, GPU_LIT_DMA_COPY_CLASS);
-	method_size = nvgpu_ce_prepare_submit(src_buf,
-			dst_buf,
-			size,
-			&cmd_buf_cpu_va[cmd_buf_read_offset],
-			NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
-			payload,
-			nvgpu_ce_get_valid_launch_flags(g, launch_flags),
-			request_operation,
-			dma_copy_class);
-
-	if (method_size != 0U) {
-		/* store the element into gpfifo */
-		g->ops.pbdma.format_gpfifo_entry(g, &gpfifo,
-				cmd_buf_gpu_va, method_size);
-
-		/*
-		 * take always the postfence as it is needed for protecting the
-		 * ce context
-		 */
-		submit_flags |= NVGPU_SUBMIT_FLAGS_FENCE_GET;
-
-		nvgpu_smp_wmb();
-
-		ret = nvgpu_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo,
-				1, submit_flags, &fence, &ce_cmd_buf_fence_out);
-
-		if (ret == 0) {
-			ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] =
-				ce_cmd_buf_fence_out;
-			if (fence_out != NULL) {
-				nvgpu_fence_get(ce_cmd_buf_fence_out);
-				*fence_out = ce_cmd_buf_fence_out;
-			}
-
-			/* Next available command buffer queue Index */
-			++ce_ctx->cmd_buf_read_queue_offset;
-		}
-	} else {
-		ret = -ENOMEM;
-	}
-noop:
-	nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
-end:
-	return ret;
-}
-
-/* static CE app api */
-static void nvgpu_ce_put_fences(struct nvgpu_ce_gpu_ctx *ce_ctx)
-{
-	u32 i;
-
-	for (i = 0U; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) {
-		struct nvgpu_fence_type **fence = &ce_ctx->postfences[i];
-
-		if (*fence != NULL) {
-			nvgpu_fence_put(*fence);
-		}
-		*fence = NULL;
-	}
-}
-
-/* caller must hold ce_app->app_mutex */
-static void nvgpu_ce_delete_gpu_context_locked(struct nvgpu_ce_gpu_ctx *ce_ctx)
-{
-	struct nvgpu_list_node *list = &ce_ctx->list;
-
-	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
-	ce_ctx->tsg->abortable = true;
-
-	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
-
-	if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) {
-		nvgpu_ce_put_fences(ce_ctx);
-		nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
-	}
-
-	/*
-	 * free the channel
-	 * nvgpu_channel_close() will also unbind the channel from TSG
-	 */
-	nvgpu_channel_close(ce_ctx->ch);
-	nvgpu_ref_put(&ce_ctx->tsg->refcount, nvgpu_tsg_release);
-
-	/* housekeeping on app */
-	if ((list->prev != NULL) && (list->next != NULL)) {
-		nvgpu_list_del(list);
-	}
-
-	nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
-	nvgpu_mutex_destroy(&ce_ctx->gpu_ctx_mutex);
-
-	nvgpu_kfree(ce_ctx->g, ce_ctx);
-}
-
-static inline unsigned int nvgpu_ce_get_method_size(u32 request_operation,
-			u64 size)
-{
-	/* failure size */
-	unsigned int methodsize = UINT_MAX;
-	unsigned int iterations = 0;
-	u32 shift;
-	u64 chunk = size;
-	u32 height, width;
-
-	while (chunk != 0ULL) {
-		iterations++;
-
-		shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
-				(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
-				MAX_CE_SHIFT;
-		width = chunk >> shift;
-		height = BIT32(shift);
-		width = MAX_CE_ALIGN(width);
-
-		chunk -= (u64) height * width;
-	}
-
-	if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
-		methodsize = (2U + (16U * iterations)) *
-				(unsigned int)sizeof(u32);
-	} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
-		methodsize = (2U + (15U * iterations)) *
-				(unsigned int)sizeof(u32);
-	}
-
-	return methodsize;
-}
-
-u32 nvgpu_ce_prepare_submit(u64 src_buf,
-		u64 dst_buf,
-		u64 size,
-		u32 *cmd_buf_cpu_va,
-		u32 max_cmd_buf_size,
-		unsigned int payload,
-		u32 launch_flags,
-		u32 request_operation,
-		u32 dma_copy_class)
-{
-	u32 launch = 0;
-	u32 methodSize = 0;
-	u64 offset = 0;
-	u64 chunk_size = 0;
-	u64 chunk = size;
-
-	/* failure case handling */
-	if ((nvgpu_ce_get_method_size(request_operation, size) >
-		max_cmd_buf_size) || (size == 0ULL) ||
-		(request_operation > NVGPU_CE_MEMSET)) {
-		return 0;
-	}
-
-	/* set the channel object */
-	cmd_buf_cpu_va[methodSize++] = 0x20018000;
-	cmd_buf_cpu_va[methodSize++] = dma_copy_class;
-
-	/*
-	 * The purpose clear the memory in 2D rectangles. We get the ffs to
-	 * determine the number of lines to copy. The only constraint is that
-	 * maximum number of pixels per line is 4Gpix - 1, which is awkward for
-	 * calculation, so we settle to 2Gpix per line to make calculatione
-	 * more agreable
-	 */
-
-	/* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in
-	 * a single submit, we are going to try to clear a range of up to 2Gpix
-	 * multiple lines. Because we want to copy byte aligned we will be
-	 * setting 1 byte pixels */
-
-	/*
-	 * per iteration
-	 * <------------------------- 40 bits ------------------------------>
-	 *                                             1 <------ ffs ------->
-	 *        <-----------up to 30 bits----------->
-	 */
-	while (chunk != 0ULL) {
-		u32 width, height, shift;
-
-		/*
-		 * We will be aligning to bytes, making the maximum number of
-		 * pix per line 2Gb
-		 */
-
-		shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
-				(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
-				MAX_CE_SHIFT;
-		height = chunk >> shift;
-		width = BIT32(shift);
-		height = MAX_CE_ALIGN(height);
-
-		chunk_size = (u64) height * width;
-
-		/* reset launch flag */
-		launch = 0;
-
-		if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
-			/* setup the source */
-			cmd_buf_cpu_va[methodSize++] = 0x20028100;
-			cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
-				offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
-			cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
-				offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
-
-			cmd_buf_cpu_va[methodSize++] = 0x20018098;
-			if ((launch_flags &
-			     NVGPU_CE_SRC_LOCATION_LOCAL_FB) != 0U) {
-				cmd_buf_cpu_va[methodSize++] = 0x00000000;
-			} else if ((launch_flags &
-			     NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
-				cmd_buf_cpu_va[methodSize++] = 0x00000002;
-			} else {
-				cmd_buf_cpu_va[methodSize++] = 0x00000001;
-			}
-
-			launch |= 0x00001000U;
-		} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
-			/* Remap from component A on 1 byte wide pixels */
-			cmd_buf_cpu_va[methodSize++] = 0x200181c2;
-			cmd_buf_cpu_va[methodSize++] = 0x00000004;
-
-			cmd_buf_cpu_va[methodSize++] = 0x200181c0;
-			cmd_buf_cpu_va[methodSize++] = payload;
-
-			launch |= 0x00000400U;
-		} else {
-			/* Illegal size */
-			return 0;
-		}
-
-		/* setup the destination/output */
-		cmd_buf_cpu_va[methodSize++] = 0x20068102;
-		cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
-			offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
-		cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
-			offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
-		/* Pitch in/out */
-		cmd_buf_cpu_va[methodSize++] = width;
-		cmd_buf_cpu_va[methodSize++] = width;
-		/* width and line count */
-		cmd_buf_cpu_va[methodSize++] = width;
-		cmd_buf_cpu_va[methodSize++] = height;
-
-		cmd_buf_cpu_va[methodSize++] = 0x20018099;
-		if ((launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) != 0U) {
-			cmd_buf_cpu_va[methodSize++] = 0x00000000;
-		} else if ((launch_flags &
-			   NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
-			cmd_buf_cpu_va[methodSize++] = 0x00000002;
-		} else {
-			cmd_buf_cpu_va[methodSize++] = 0x00000001;
-		}
-
-		launch |= 0x00002005U;
-
-		if ((launch_flags &
-		     NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
-			launch |= 0x00000000U;
-		} else {
-			launch |= 0x00000080U;
-		}
-
-		if ((launch_flags &
-		     NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
-			launch |= 0x00000000U;
-		} else {
-			launch |= 0x00000100U;
-		}
-
-		cmd_buf_cpu_va[methodSize++] = 0x200180c0;
-		cmd_buf_cpu_va[methodSize++] = launch;
-		offset += chunk_size;
-		chunk -= chunk_size;
-	}
-
-	return methodSize;
-}
-
-/* global CE app related apis */
 int nvgpu_ce_init_support(struct gk20a *g)
 {
-	struct nvgpu_ce_app *ce_app = g->ce_app;
 	u32 ce_reset_mask;

 	if (g->ops.ce.set_pce2lce_mapping != NULL) {
 		g->ops.ce.set_pce2lce_mapping(g);
 	}

-	if (unlikely(ce_app == NULL)) {
-		ce_app = nvgpu_kzalloc(g, sizeof(*ce_app));
-		if (ce_app == NULL) {
-			return -ENOMEM;
-		}
-		g->ce_app = ce_app;
-	}
-
 	ce_reset_mask = nvgpu_engine_get_all_ce_reset_mask(g);

 	g->ops.mc.reset(g, ce_reset_mask);
@@ -429,226 +42,9 @@ int nvgpu_ce_init_support(struct gk20a *g)

 	nvgpu_cg_blcg_ce_load_enable(g);

-	if (ce_app->initialised) {
-		/* assume this happen during poweron/poweroff GPU sequence */
-		ce_app->app_state = NVGPU_CE_ACTIVE;
-		return 0;
-	}
-
-	nvgpu_log(g, gpu_dbg_fn, "ce: init");
-
-	nvgpu_mutex_init(&ce_app->app_mutex);
-
-	nvgpu_mutex_acquire(&ce_app->app_mutex);
-
-	nvgpu_init_list_node(&ce_app->allocated_contexts);
-	ce_app->ctx_count = 0;
-	ce_app->next_ctx_id = 0;
-	ce_app->initialised = true;
-	ce_app->app_state = NVGPU_CE_ACTIVE;
-
-	nvgpu_mutex_release(&ce_app->app_mutex);
-
 	if (g->ops.ce.init_prod_values != NULL) {
 		g->ops.ce.init_prod_values(g);
 	}

-	nvgpu_log(g, gpu_dbg_cde_ctx, "ce: init finished");
-
 	return 0;
 }
-
-void nvgpu_ce_destroy(struct gk20a *g)
-{
-	struct nvgpu_ce_app *ce_app = g->ce_app;
-	struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
-
-	if (ce_app == NULL) {
-		return;
-	}
-
-	if (ce_app->initialised == false) {
-		goto free;
-	}
-
-	ce_app->app_state = NVGPU_CE_SUSPEND;
-	ce_app->initialised = false;
-
-	nvgpu_mutex_acquire(&ce_app->app_mutex);
-
-	nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
-			&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
-		nvgpu_ce_delete_gpu_context_locked(ce_ctx);
-	}
-
-	nvgpu_init_list_node(&ce_app->allocated_contexts);
-	ce_app->ctx_count = 0;
-	ce_app->next_ctx_id = 0;
-
-	nvgpu_mutex_release(&ce_app->app_mutex);
-
-	nvgpu_mutex_destroy(&ce_app->app_mutex);
-free:
-	nvgpu_kfree(g, ce_app);
-	g->ce_app = NULL;
-}
-
-void nvgpu_ce_suspend(struct gk20a *g)
-{
-	struct nvgpu_ce_app *ce_app = g->ce_app;
-
-	if (ce_app == NULL || !ce_app->initialised) {
-		return;
-	}
-
-	ce_app->app_state = NVGPU_CE_SUSPEND;
-}
-
-/* CE app utility functions */
-u32 nvgpu_ce_create_context(struct gk20a *g,
-		u32 runlist_id,
-		int timeslice,
-		int runlist_level)
-{
-	struct nvgpu_ce_gpu_ctx *ce_ctx;
-	struct nvgpu_ce_app *ce_app = g->ce_app;
-	struct nvgpu_setup_bind_args setup_bind_args;
-	u32 ctx_id = NVGPU_CE_INVAL_CTX_ID;
-	int err = 0;
-
-	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
-		return ctx_id;
-	}
-
-	ce_ctx = nvgpu_kzalloc(g, sizeof(*ce_ctx));
-	if (ce_ctx == NULL) {
-		return ctx_id;
-	}
-
-	nvgpu_mutex_init(&ce_ctx->gpu_ctx_mutex);
-
-	ce_ctx->g = g;
-	ce_ctx->cmd_buf_read_queue_offset = 0;
-	ce_ctx->vm = g->mm.ce.vm;
-
-	/* allocate a tsg if needed */
-	ce_ctx->tsg = nvgpu_tsg_open(g, nvgpu_current_pid(g));
-	if (ce_ctx->tsg == NULL) {
-		nvgpu_err(g, "ce: gk20a tsg not available");
-		goto end;
-	}
-
-	/* this TSG should never be aborted */
-	ce_ctx->tsg->abortable = false;
-
-	/* always kernel client needs privileged channel */
-	ce_ctx->ch = nvgpu_channel_open_new(g, runlist_id, true,
-				nvgpu_current_pid(g), nvgpu_current_tid(g));
-	if (ce_ctx->ch == NULL) {
-		nvgpu_err(g, "ce: gk20a channel not available");
-		goto end;
-	}
-
-#ifdef CONFIG_NVGPU_CHANNEL_WDT
-	ce_ctx->ch->wdt.enabled = false;
-#endif
-
-	/* bind the channel to the vm */
-	err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
-	if (err != 0) {
-		nvgpu_err(g, "ce: could not bind vm");
-		goto end;
-	}
-
-	err = nvgpu_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
-	if (err != 0) {
-		nvgpu_err(g, "ce: unable to bind to tsg");
-		goto end;
-	}
-
-	setup_bind_args.num_gpfifo_entries = 1024;
-	setup_bind_args.num_inflight_jobs = 0;
-	setup_bind_args.flags = 0;
-	err = nvgpu_channel_setup_bind(ce_ctx->ch, &setup_bind_args);
-	if (err != 0) {
-		nvgpu_err(g, "ce: unable to setup and bind channel");
-		goto end;
-	}
-
-	/* allocate command buffer from sysmem */
-	err = nvgpu_dma_alloc_map_sys(ce_ctx->vm,
-			NVGPU_CE_MAX_INFLIGHT_JOBS *
-			NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
-			&ce_ctx->cmd_buf_mem);
-	 if (err != 0) {
-		nvgpu_err(g,
-			"ce: alloc command buffer failed");
-		goto end;
-	}
-
-	(void) memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00,
-		ce_ctx->cmd_buf_mem.size);
-
-#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
-	/* -1 means default channel timeslice value */
-	if (timeslice != -1) {
-		err = g->ops.tsg.set_timeslice(ce_ctx->tsg, timeslice);
-		if (err != 0) {
-			nvgpu_err(g, "ce: set timesliced failed for CE context");
-			goto end;
-		}
-	}
-
-	/* -1 means default channel runlist level */
-	if (runlist_level != -1) {
-		err = nvgpu_tsg_set_interleave(ce_ctx->tsg, runlist_level);
-		if (err != 0) {
-			nvgpu_err(g, "ce: set runlist interleave failed");
-			goto end;
-		}
-	}
-#endif
-
-	nvgpu_mutex_acquire(&ce_app->app_mutex);
-	ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
-	nvgpu_list_add(&ce_ctx->list, &ce_app->allocated_contexts);
-	++ce_app->next_ctx_id;
-	++ce_app->ctx_count;
-	nvgpu_mutex_release(&ce_app->app_mutex);
-
-	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
-
-end:
-	if (ctx_id == NVGPU_CE_INVAL_CTX_ID) {
-		nvgpu_mutex_acquire(&ce_app->app_mutex);
-		nvgpu_ce_delete_gpu_context_locked(ce_ctx);
-		nvgpu_mutex_release(&ce_app->app_mutex);
-	}
-	return ctx_id;
-
-}
-
-void nvgpu_ce_delete_context(struct gk20a *g,
-		u32 ce_ctx_id)
-{
-	struct nvgpu_ce_app *ce_app = g->ce_app;
-	struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
-
-	if (ce_app == NULL || !ce_app->initialised ||
-		ce_app->app_state != NVGPU_CE_ACTIVE) {
-		return;
-	}
-
-	nvgpu_mutex_acquire(&ce_app->app_mutex);
-
-	nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
-			&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
-		if (ce_ctx->ctx_id == ce_ctx_id) {
-			nvgpu_ce_delete_gpu_context_locked(ce_ctx);
-			--ce_app->ctx_count;
-			break;
-		}
-	}
-
-	nvgpu_mutex_release(&ce_app->app_mutex);
-}
--- a/drivers/gpu/nvgpu/common/ce/ce_app.c
+++ b/drivers/gpu/nvgpu/common/ce/ce_app.c
@@ -0,0 +1,639 @@
+/*
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/types.h>
+#include <nvgpu/gk20a.h>
+#include <nvgpu/engines.h>
+#include <nvgpu/os_sched.h>
+#include <nvgpu/channel.h>
+#include <nvgpu/dma.h>
+#include <nvgpu/utils.h>
+#include <nvgpu/fence.h>
+#include <nvgpu/ce.h>
+#include <nvgpu/power_features/cg.h>
+
+#include "common/ce/ce_priv.h"
+
+static inline u32 nvgpu_ce_get_valid_launch_flags(struct gk20a *g,
+		u32 launch_flags)
+{
+#ifdef CONFIG_NVGPU_DGPU
+	/*
+	 * there is no local memory available,
+	 * don't allow local memory related CE flags
+	 */
+	if (g->mm.vidmem.size == 0ULL) {
+		launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
+			NVGPU_CE_DST_LOCATION_LOCAL_FB);
+	}
+#endif
+	return launch_flags;
+}
+
+int nvgpu_ce_execute_ops(struct gk20a *g,
+		u32 ce_ctx_id,
+		u64 src_buf,
+		u64 dst_buf,
+		u64 size,
+		unsigned int payload,
+		u32 launch_flags,
+		u32 request_operation,
+		u32 submit_flags,
+		struct nvgpu_fence_type **fence_out)
+{
+	int ret = -EPERM;
+	struct nvgpu_ce_app *ce_app = g->ce_app;
+	struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
+	bool found = false;
+	u32 *cmd_buf_cpu_va;
+	u64 cmd_buf_gpu_va = 0UL;
+	u32 method_size;
+	u32 cmd_buf_read_offset;
+	u32 dma_copy_class;
+	struct nvgpu_gpfifo_entry gpfifo;
+	struct nvgpu_channel_fence fence = {0U, 0U};
+	struct nvgpu_fence_type *ce_cmd_buf_fence_out = NULL;
+
+	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
+		goto end;
+	}
+
+	nvgpu_mutex_acquire(&ce_app->app_mutex);
+
+	nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
+		if (ce_ctx->ctx_id == ce_ctx_id) {
+			found = true;
+			break;
+		}
+	}
+
+	nvgpu_mutex_release(&ce_app->app_mutex);
+
+	if (!found) {
+		ret = -EINVAL;
+		goto end;
+	}
+
+	if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
+		ret = -ENODEV;
+		goto end;
+	}
+
+	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
+
+	ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS;
+
+	cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
+			(NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF /
+			U32(sizeof(u32))));
+
+	cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
+
+	if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] != NULL) {
+		struct nvgpu_fence_type **prev_post_fence =
+			&ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset];
+
+		ret = nvgpu_fence_wait(g, *prev_post_fence,
+				       nvgpu_get_poll_timeout(g));
+
+		nvgpu_fence_put(*prev_post_fence);
+		*prev_post_fence = NULL;
+		if (ret != 0) {
+			goto noop;
+		}
+	}
+
+	cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va +
+			(u64)(cmd_buf_read_offset * sizeof(u32)));
+
+	dma_copy_class = g->ops.get_litter_value(g, GPU_LIT_DMA_COPY_CLASS);
+	method_size = nvgpu_ce_prepare_submit(src_buf,
+			dst_buf,
+			size,
+			&cmd_buf_cpu_va[cmd_buf_read_offset],
+			NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
+			payload,
+			nvgpu_ce_get_valid_launch_flags(g, launch_flags),
+			request_operation,
+			dma_copy_class);
+
+	if (method_size != 0U) {
+		/* store the element into gpfifo */
+		g->ops.pbdma.format_gpfifo_entry(g, &gpfifo,
+				cmd_buf_gpu_va, method_size);
+
+		/*
+		 * take always the postfence as it is needed for protecting the
+		 * ce context
+		 */
+		submit_flags |= NVGPU_SUBMIT_FLAGS_FENCE_GET;
+
+		nvgpu_smp_wmb();
+
+		ret = nvgpu_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo,
+				1, submit_flags, &fence, &ce_cmd_buf_fence_out);
+
+		if (ret == 0) {
+			ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] =
+				ce_cmd_buf_fence_out;
+			if (fence_out != NULL) {
+				nvgpu_fence_get(ce_cmd_buf_fence_out);
+				*fence_out = ce_cmd_buf_fence_out;
+			}
+
+			/* Next available command buffer queue Index */
+			++ce_ctx->cmd_buf_read_queue_offset;
+		}
+	} else {
+		ret = -ENOMEM;
+	}
+noop:
+	nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
+end:
+	return ret;
+}
+
+/* static CE app api */
+static void nvgpu_ce_put_fences(struct nvgpu_ce_gpu_ctx *ce_ctx)
+{
+	u32 i;
+
+	for (i = 0U; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) {
+		struct nvgpu_fence_type **fence = &ce_ctx->postfences[i];
+
+		if (*fence != NULL) {
+			nvgpu_fence_put(*fence);
+		}
+		*fence = NULL;
+	}
+}
+
+/* caller must hold ce_app->app_mutex */
+static void nvgpu_ce_delete_gpu_context_locked(struct nvgpu_ce_gpu_ctx *ce_ctx)
+{
+	struct nvgpu_list_node *list = &ce_ctx->list;
+
+	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
+	ce_ctx->tsg->abortable = true;
+
+	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
+
+	if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) {
+		nvgpu_ce_put_fences(ce_ctx);
+		nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
+	}
+
+	/*
+	 * free the channel
+	 * nvgpu_channel_close() will also unbind the channel from TSG
+	 */
+	nvgpu_channel_close(ce_ctx->ch);
+	nvgpu_ref_put(&ce_ctx->tsg->refcount, nvgpu_tsg_release);
+
+	/* housekeeping on app */
+	if ((list->prev != NULL) && (list->next != NULL)) {
+		nvgpu_list_del(list);
+	}
+
+	nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
+	nvgpu_mutex_destroy(&ce_ctx->gpu_ctx_mutex);
+
+	nvgpu_kfree(ce_ctx->g, ce_ctx);
+}
+
+static inline unsigned int nvgpu_ce_get_method_size(u32 request_operation,
+			u64 size)
+{
+	/* failure size */
+	unsigned int methodsize = UINT_MAX;
+	unsigned int iterations = 0;
+	u32 shift;
+	u64 chunk = size;
+	u32 height, width;
+
+	while (chunk != 0ULL) {
+		iterations++;
+
+		shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
+				(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
+				MAX_CE_SHIFT;
+		width = chunk >> shift;
+		height = BIT32(shift);
+		width = MAX_CE_ALIGN(width);
+
+		chunk -= (u64) height * width;
+	}
+
+	if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
+		methodsize = (2U + (16U * iterations)) *
+				(unsigned int)sizeof(u32);
+	} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
+		methodsize = (2U + (15U * iterations)) *
+				(unsigned int)sizeof(u32);
+	}
+
+	return methodsize;
+}
+
+u32 nvgpu_ce_prepare_submit(u64 src_buf,
+		u64 dst_buf,
+		u64 size,
+		u32 *cmd_buf_cpu_va,
+		u32 max_cmd_buf_size,
+		unsigned int payload,
+		u32 launch_flags,
+		u32 request_operation,
+		u32 dma_copy_class)
+{
+	u32 launch = 0;
+	u32 methodSize = 0;
+	u64 offset = 0;
+	u64 chunk_size = 0;
+	u64 chunk = size;
+
+	/* failure case handling */
+	if ((nvgpu_ce_get_method_size(request_operation, size) >
+		max_cmd_buf_size) || (size == 0ULL) ||
+		(request_operation > NVGPU_CE_MEMSET)) {
+		return 0;
+	}
+
+	/* set the channel object */
+	cmd_buf_cpu_va[methodSize++] = 0x20018000;
+	cmd_buf_cpu_va[methodSize++] = dma_copy_class;
+
+	/*
+	 * The purpose clear the memory in 2D rectangles. We get the ffs to
+	 * determine the number of lines to copy. The only constraint is that
+	 * maximum number of pixels per line is 4Gpix - 1, which is awkward for
+	 * calculation, so we settle to 2Gpix per line to make calculatione
+	 * more agreable
+	 */
+
+	/*
+	 * The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in
+	 * a single submit, we are going to try to clear a range of up to 2Gpix
+	 * multiple lines. Because we want to copy byte aligned we will be
+	 * setting 1 byte pixels
+	 */
+
+	/*
+	 * per iteration
+	 * <------------------------- 40 bits ------------------------------>
+	 *                                             1 <------ ffs ------->
+	 *        <-----------up to 30 bits----------->
+	 */
+	while (chunk != 0ULL) {
+		u32 width, height, shift;
+
+		/*
+		 * We will be aligning to bytes, making the maximum number of
+		 * pix per line 2Gb
+		 */
+
+		shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
+				(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
+				MAX_CE_SHIFT;
+		height = chunk >> shift;
+		width = BIT32(shift);
+		height = MAX_CE_ALIGN(height);
+
+		chunk_size = (u64) height * width;
+
+		/* reset launch flag */
+		launch = 0;
+
+		if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
+			/* setup the source */
+			cmd_buf_cpu_va[methodSize++] = 0x20028100;
+			cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
+				offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+			cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
+				offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+			cmd_buf_cpu_va[methodSize++] = 0x20018098;
+			if ((launch_flags &
+			     NVGPU_CE_SRC_LOCATION_LOCAL_FB) != 0U) {
+				cmd_buf_cpu_va[methodSize++] = 0x00000000;
+			} else if ((launch_flags &
+			     NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
+				cmd_buf_cpu_va[methodSize++] = 0x00000002;
+			} else {
+				cmd_buf_cpu_va[methodSize++] = 0x00000001;
+			}
+
+			launch |= 0x00001000U;
+		} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
+			/* Remap from component A on 1 byte wide pixels */
+			cmd_buf_cpu_va[methodSize++] = 0x200181c2;
+			cmd_buf_cpu_va[methodSize++] = 0x00000004;
+
+			cmd_buf_cpu_va[methodSize++] = 0x200181c0;
+			cmd_buf_cpu_va[methodSize++] = payload;
+
+			launch |= 0x00000400U;
+		} else {
+			/* Illegal size */
+			return 0;
+		}
+
+		/* setup the destination/output */
+		cmd_buf_cpu_va[methodSize++] = 0x20068102;
+		cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
+			offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+		cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
+			offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+		/* Pitch in/out */
+		cmd_buf_cpu_va[methodSize++] = width;
+		cmd_buf_cpu_va[methodSize++] = width;
+		/* width and line count */
+		cmd_buf_cpu_va[methodSize++] = width;
+		cmd_buf_cpu_va[methodSize++] = height;
+
+		cmd_buf_cpu_va[methodSize++] = 0x20018099;
+		if ((launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) != 0U) {
+			cmd_buf_cpu_va[methodSize++] = 0x00000000;
+		} else if ((launch_flags &
+			   NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
+			cmd_buf_cpu_va[methodSize++] = 0x00000002;
+		} else {
+			cmd_buf_cpu_va[methodSize++] = 0x00000001;
+		}
+
+		launch |= 0x00002005U;
+
+		if ((launch_flags &
+		     NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
+			launch |= 0x00000000U;
+		} else {
+			launch |= 0x00000080U;
+		}
+
+		if ((launch_flags &
+		     NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
+			launch |= 0x00000000U;
+		} else {
+			launch |= 0x00000100U;
+		}
+
+		cmd_buf_cpu_va[methodSize++] = 0x200180c0;
+		cmd_buf_cpu_va[methodSize++] = launch;
+		offset += chunk_size;
+		chunk -= chunk_size;
+	}
+
+	return methodSize;
+}
+
+/* global CE app related apis */
+int nvgpu_ce_app_init_support(struct gk20a *g)
+{
+	struct nvgpu_ce_app *ce_app = g->ce_app;
+
+	if (unlikely(ce_app == NULL)) {
+		ce_app = nvgpu_kzalloc(g, sizeof(*ce_app));
+		if (ce_app == NULL) {
+			return -ENOMEM;
+		}
+		g->ce_app = ce_app;
+	}
+
+	if (ce_app->initialised) {
+		/* assume this happen during poweron/poweroff GPU sequence */
+		ce_app->app_state = NVGPU_CE_ACTIVE;
+		return 0;
+	}
+
+	nvgpu_log(g, gpu_dbg_fn, "ce: init");
+
+	nvgpu_mutex_init(&ce_app->app_mutex);
+
+	nvgpu_mutex_acquire(&ce_app->app_mutex);
+
+	nvgpu_init_list_node(&ce_app->allocated_contexts);
+	ce_app->ctx_count = 0;
+	ce_app->next_ctx_id = 0;
+	ce_app->initialised = true;
+	ce_app->app_state = NVGPU_CE_ACTIVE;
+
+	nvgpu_mutex_release(&ce_app->app_mutex);
+
+	nvgpu_log(g, gpu_dbg_cde_ctx, "ce: init finished");
+
+	return 0;
+}
+
+void nvgpu_ce_app_destroy(struct gk20a *g)
+{
+	struct nvgpu_ce_app *ce_app = g->ce_app;
+	struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+	if (ce_app == NULL) {
+		return;
+	}
+
+	if (ce_app->initialised == false) {
+		goto free;
+	}
+
+	ce_app->app_state = NVGPU_CE_SUSPEND;
+	ce_app->initialised = false;
+
+	nvgpu_mutex_acquire(&ce_app->app_mutex);
+
+	nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
+		nvgpu_ce_delete_gpu_context_locked(ce_ctx);
+	}
+
+	nvgpu_init_list_node(&ce_app->allocated_contexts);
+	ce_app->ctx_count = 0;
+	ce_app->next_ctx_id = 0;
+
+	nvgpu_mutex_release(&ce_app->app_mutex);
+
+	nvgpu_mutex_destroy(&ce_app->app_mutex);
+free:
+	nvgpu_kfree(g, ce_app);
+	g->ce_app = NULL;
+}
+
+void nvgpu_ce_app_suspend(struct gk20a *g)
+{
+	struct nvgpu_ce_app *ce_app = g->ce_app;
+
+	if (ce_app == NULL || !ce_app->initialised) {
+		return;
+	}
+
+	ce_app->app_state = NVGPU_CE_SUSPEND;
+}
+
+/* CE app utility functions */
+u32 nvgpu_ce_app_create_context(struct gk20a *g,
+		u32 runlist_id,
+		int timeslice,
+		int runlist_level)
+{
+	struct nvgpu_ce_gpu_ctx *ce_ctx;
+	struct nvgpu_ce_app *ce_app = g->ce_app;
+	struct nvgpu_setup_bind_args setup_bind_args;
+	u32 ctx_id = NVGPU_CE_INVAL_CTX_ID;
+	int err = 0;
+
+	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
+		return ctx_id;
+	}
+
+	ce_ctx = nvgpu_kzalloc(g, sizeof(*ce_ctx));
+	if (ce_ctx == NULL) {
+		return ctx_id;
+	}
+
+	nvgpu_mutex_init(&ce_ctx->gpu_ctx_mutex);
+
+	ce_ctx->g = g;
+	ce_ctx->cmd_buf_read_queue_offset = 0;
+	ce_ctx->vm = g->mm.ce.vm;
+
+	/* allocate a tsg if needed */
+	ce_ctx->tsg = nvgpu_tsg_open(g, nvgpu_current_pid(g));
+	if (ce_ctx->tsg == NULL) {
+		nvgpu_err(g, "ce: gk20a tsg not available");
+		goto end;
+	}
+
+	/* this TSG should never be aborted */
+	ce_ctx->tsg->abortable = false;
+
+	/* always kernel client needs privileged channel */
+	ce_ctx->ch = nvgpu_channel_open_new(g, runlist_id, true,
+				nvgpu_current_pid(g), nvgpu_current_tid(g));
+	if (ce_ctx->ch == NULL) {
+		nvgpu_err(g, "ce: gk20a channel not available");
+		goto end;
+	}
+
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+	ce_ctx->ch->wdt.enabled = false;
+#endif
+
+	/* bind the channel to the vm */
+	err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
+	if (err != 0) {
+		nvgpu_err(g, "ce: could not bind vm");
+		goto end;
+	}
+
+	err = nvgpu_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
+	if (err != 0) {
+		nvgpu_err(g, "ce: unable to bind to tsg");
+		goto end;
+	}
+
+	setup_bind_args.num_gpfifo_entries = 1024;
+	setup_bind_args.num_inflight_jobs = 0;
+	setup_bind_args.flags = 0;
+	err = nvgpu_channel_setup_bind(ce_ctx->ch, &setup_bind_args);
+	if (err != 0) {
+		nvgpu_err(g, "ce: unable to setup and bind channel");
+		goto end;
+	}
+
+	/* allocate command buffer from sysmem */
+	err = nvgpu_dma_alloc_map_sys(ce_ctx->vm,
+			NVGPU_CE_MAX_INFLIGHT_JOBS *
+			NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
+			&ce_ctx->cmd_buf_mem);
+	if (err != 0) {
+		nvgpu_err(g,
+			"ce: alloc command buffer failed");
+		goto end;
+	}
+
+	(void) memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00,
+		ce_ctx->cmd_buf_mem.size);
+
+#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
+	/* -1 means default channel timeslice value */
+	if (timeslice != -1) {
+		err = g->ops.tsg.set_timeslice(ce_ctx->tsg, timeslice);
+		if (err != 0) {
+			nvgpu_err(g, "ce: set timesliced failed for CE context");
+			goto end;
+		}
+	}
+
+	/* -1 means default channel runlist level */
+	if (runlist_level != -1) {
+		err = nvgpu_tsg_set_interleave(ce_ctx->tsg, runlist_level);
+		if (err != 0) {
+			nvgpu_err(g, "ce: set runlist interleave failed");
+			goto end;
+		}
+	}
+#endif
+
+	nvgpu_mutex_acquire(&ce_app->app_mutex);
+	ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
+	nvgpu_list_add(&ce_ctx->list, &ce_app->allocated_contexts);
+	++ce_app->next_ctx_id;
+	++ce_app->ctx_count;
+	nvgpu_mutex_release(&ce_app->app_mutex);
+
+	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
+
+end:
+	if (ctx_id == NVGPU_CE_INVAL_CTX_ID) {
+		nvgpu_mutex_acquire(&ce_app->app_mutex);
+		nvgpu_ce_delete_gpu_context_locked(ce_ctx);
+		nvgpu_mutex_release(&ce_app->app_mutex);
+	}
+	return ctx_id;
+
+}
+
+void nvgpu_ce_app_delete_context(struct gk20a *g,
+		u32 ce_ctx_id)
+{
+	struct nvgpu_ce_app *ce_app = g->ce_app;
+	struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+	if (ce_app == NULL || !ce_app->initialised ||
+		ce_app->app_state != NVGPU_CE_ACTIVE) {
+		return;
+	}
+
+	nvgpu_mutex_acquire(&ce_app->app_mutex);
+
+	nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
+		if (ce_ctx->ctx_id == ce_ctx_id) {
+			nvgpu_ce_delete_gpu_context_locked(ce_ctx);
+			--ce_app->ctx_count;
+			break;
+		}
+	}
+
+	nvgpu_mutex_release(&ce_app->app_mutex);
+}
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -47,9 +47,6 @@
 #include <nvgpu/channel_sync.h>
 #include <nvgpu/gr/gr.h>
 #include <nvgpu/nvgpu_init.h>
-#ifndef CONFIG_NVGPU_CE
-#include <nvgpu/engines.h>
-#endif

 #ifdef CONFIG_NVGPU_TRACE
 #include <trace/events/gk20a.h>
@@ -136,10 +133,8 @@ int nvgpu_prepare_poweroff(struct gk20a *g)
 	nvgpu_falcon_sw_free(g, FALCON_ID_GSPLITE);
 	nvgpu_falcon_sw_free(g, FALCON_ID_NVDEC);
 	nvgpu_falcon_sw_free(g, FALCON_ID_SEC2);
-#endif

-#ifdef CONFIG_NVGPU_CE
-	nvgpu_ce_suspend(g);
+	nvgpu_ce_app_suspend(g);
 #endif

 #ifdef CONFIG_NVGPU_DGPU
@@ -494,17 +489,19 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 	g->ops.fb.set_debug_mode(g, g->mmu_debug_ctrl);
 #endif

-#ifdef CONFIG_NVGPU_CE
 	err = nvgpu_ce_init_support(g);
 	if (err != 0) {
 		nvgpu_err(g, "failed to init ce");
 		goto done;
 	}
-#else
-	g->ops.mc.reset(g, nvgpu_engine_get_all_ce_reset_mask(g));
-#endif

 #ifdef CONFIG_NVGPU_DGPU
+	err = nvgpu_ce_app_init_support(g);
+	if (err != 0) {
+		nvgpu_err(g, "failed to init ce app");
+		goto done;
+	}
+
 	if (g->ops.xve.available_speeds != NULL) {
 		u32 speed;

@@ -655,8 +652,8 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount)

 	nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!");

-#ifdef CONFIG_NVGPU_CE
-	nvgpu_ce_destroy(g);
+#ifdef CONFIG_NVGPU_DGPU
+	nvgpu_ce_app_destroy(g);
 #endif

 #ifdef CONFIG_NVGPU_COMPRESSION
--- a/drivers/gpu/nvgpu/common/mm/mm.c
+++ b/drivers/gpu/nvgpu/common/mm/mm.c
@@ -114,19 +114,17 @@ static int nvgpu_alloc_sysmem_flush(struct gk20a *g)
 	return nvgpu_dma_alloc_sys(g, SZ_4K, &g->mm.sysmem_flush);
 }

-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 static void nvgpu_remove_mm_ce_support(struct mm_gk20a *mm)
 {
-#ifdef CONFIG_NVGPU_DGPU
 	struct gk20a *g = gk20a_from_mm(mm);

 	if (mm->vidmem.ce_ctx_id != NVGPU_CE_INVAL_CTX_ID) {
-		nvgpu_ce_delete_context(g, mm->vidmem.ce_ctx_id);
+		nvgpu_ce_app_delete_context(g, mm->vidmem.ce_ctx_id);
 	}
 	mm->vidmem.ce_ctx_id = NVGPU_CE_INVAL_CTX_ID;

 	nvgpu_vm_put(mm->ce.vm);
-#endif
 }
 #endif

@@ -303,14 +301,13 @@ static int nvgpu_init_mmu_debug(struct mm_gk20a *mm)
 	return -ENOMEM;
 }

-#ifdef CONFIG_NVGPU_CE
+#if defined(CONFIG_NVGPU_DGPU)
 void nvgpu_init_mm_ce_context(struct gk20a *g)
 {
-#if defined(CONFIG_NVGPU_DGPU)
 	if (g->mm.vidmem.size > 0U &&
 	   (g->mm.vidmem.ce_ctx_id == NVGPU_CE_INVAL_CTX_ID)) {
 		g->mm.vidmem.ce_ctx_id =
-			nvgpu_ce_create_context(g,
+			nvgpu_ce_app_create_context(g,
 				nvgpu_engine_get_fast_ce_runlist_id(g),
 				-1,
 				-1);
@@ -320,9 +317,8 @@ void nvgpu_init_mm_ce_context(struct gk20a *g)
 				"Failed to allocate CE context for vidmem page clearing support");
 		}
 	}
-#endif
 }
-#endif /* NVGPU_FENCE_CE */
+#endif

 static int nvgpu_init_mm_reset_enable_hw(struct gk20a *g)
 {
@@ -523,7 +519,7 @@ static int nvgpu_init_mm_setup_sw(struct gk20a *g)
 	}

 	mm->remove_support = nvgpu_remove_mm_support;
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	mm->remove_ce_support = nvgpu_remove_mm_ce_support;
 #endif

--- a/drivers/gpu/nvgpu/common/mm/vidmem.c
+++ b/drivers/gpu/nvgpu/common/mm/vidmem.c
@@ -103,7 +103,7 @@ static int nvgpu_vidmem_do_clear_all(struct gk20a *g)

 	vidmem_dbg(g, "Clearing all VIDMEM:");

-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	err = nvgpu_ce_execute_ops(g,
 			mm->vidmem.ce_ctx_id,
 			0,
@@ -119,6 +119,9 @@ static int nvgpu_vidmem_do_clear_all(struct gk20a *g)
 			"Failed to clear vidmem : %d", err);
 		return err;
 	}
+#else
+	/* fail due to lack of ce app support */
+	return -ENOSYS;
 #endif

 	if (fence_out != NULL) {
@@ -464,7 +467,7 @@ int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem)
 			nvgpu_fence_put(last_fence);
 		}

-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 		err = nvgpu_ce_execute_ops(g,
 			g->mm.vidmem.ce_ctx_id,
 			0,
@@ -476,12 +479,12 @@ int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem)
 			0,
 			&fence_out);
 #else
-		/* fail due to lack of ce support */
+		/* fail due to lack of ce app support */
 		err = -ENOSYS;
 #endif

 		if (err != 0) {
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 			nvgpu_err(g,
 				"Failed nvgpu_ce_execute_ops[%d]", err);
 #endif
--- a/drivers/gpu/nvgpu/include/nvgpu/ce.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/ce.h
@@ -69,17 +69,19 @@ enum {
 	NVGPU_CE_GPU_CTX_DELETED           = (1 << 1),
 };

-/* global CE app related apis */
 int nvgpu_ce_init_support(struct gk20a *g);
-void nvgpu_ce_suspend(struct gk20a *g);
-void nvgpu_ce_destroy(struct gk20a *g);
+
+/* global CE app related apis */
+int nvgpu_ce_app_init_support(struct gk20a *g);
+void nvgpu_ce_app_suspend(struct gk20a *g);
+void nvgpu_ce_app_destroy(struct gk20a *g);

 /* CE app utility functions */
-u32 nvgpu_ce_create_context(struct gk20a *g,
+u32 nvgpu_ce_app_create_context(struct gk20a *g,
 		u32 runlist_id,
 		int timeslice,
 		int runlist_level);
-void nvgpu_ce_delete_context(struct gk20a *g,
+void nvgpu_ce_app_delete_context(struct gk20a *g,
 		u32 ce_ctx_id);
 int nvgpu_ce_execute_ops(struct gk20a *g,
 		u32 ce_ctx_id,
--- a/drivers/gpu/nvgpu/include/nvgpu/mm.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/mm.h
@@ -242,7 +242,7 @@ struct mm_gk20a {
 	struct nvgpu_mem hw_fault_buf[NVGPU_MMU_FAULT_TYPE_NUM];
 	struct mmu_fault_info fault_info[NVGPU_MMU_FAULT_TYPE_NUM];
 	struct nvgpu_mutex hub_isr_mutex;
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	/*
 	 * Separate function to cleanup the CE since it requires a channel to
 	 * be closed which must happen before fifo cleanup.
@@ -317,7 +317,7 @@ static inline u64 nvgpu_gmmu_va_small_page_limit(void)
 	return ((u64)SZ_1G * 56U);
 }

-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 void nvgpu_init_mm_ce_context(struct gk20a *g);
 #endif
 int nvgpu_init_mm_support(struct gk20a *g);
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -469,7 +469,7 @@ int gk20a_pm_finalize_poweron(struct device *dev)
 	if (err)
 		goto done;

-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	nvgpu_init_mm_ce_context(g);
 #endif