gpu: nvgpu: move CE app logic under CONFIG_NVGPU_DGPU

The CE app functionality in nvgpu is non-safe for iGPU, whereas the CE
engine init/reset/CG functionality is required in the safety build.
Hence, move the CE app logic under the CONFIG_NVGPU_DGPU flag and
update the sources accordingly.

JIRA NVGPU-3814

Change-Id: I37aa00b1184baccd5fe569ec315be60ac42dac9b
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2168956
GVS: Gerrit_Virtual_Submit
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Sagar Kamble authored on 2019-08-06 10:07:21 +05:30
Committed by mobile promotions
parent 6e2e4d0658
commit 2f95efd8d1
12 changed files with 682 additions and 647 deletions
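
For orientation, the core of the change in the poweron path looks roughly like the sketch below: the safety-relevant CE engine init stays unconditional, while the CE app init is compiled only for dGPU builds. This is an abridged illustration based on the nvgpu_init.c hunk further down, not additional code from the patch.

	/* common/init/nvgpu_init.c, nvgpu_finalize_poweron() -- abridged sketch */
	err = nvgpu_ce_init_support(g);		/* engine reset + clock gating setup, built for all GPUs */
	if (err != 0) {
		nvgpu_err(g, "failed to init ce");
		goto done;
	}

#ifdef CONFIG_NVGPU_DGPU
	err = nvgpu_ce_app_init_support(g);	/* CE app (kernel-mode copy/clear contexts), dGPU only */
	if (err != 0) {
		nvgpu_err(g, "failed to init ce app");
		goto done;
	}
#endif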

View File

@@ -31,9 +31,16 @@ bios:
 ce:
   safe: yes
+  gpu: dgpu
   owner: Thomas F
   sources: [ common/ce/ce.c,
+             include/nvgpu/ce.h ]
+  deps:
+ce_app:
+  safe: yes
+  gpu: dgpu
+  owner: Thomas F
+  sources: [ common/ce/ce_app.c,
              common/ce/ce_priv.h,
              include/nvgpu/ce.h ]
   deps:

View File

@@ -40,7 +40,6 @@ ccflags-y += -DCONFIG_NVGPU_GRAPHICS
 ccflags-y += -DCONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
 ccflags-y += -DCONFIG_NVGPU_CHANNEL_TSG_CONTROL
 ccflags-y += -DCONFIG_NVGPU_POWER_PG
-ccflags-y += -DCONFIG_NVGPU_CE
 ccflags-y += -DCONFIG_NVGPU_KERNEL_MODE_SUBMIT
 ccflags-y += -DCONFIG_NVGPU_COMPRESSION
 ccflags-y += -DCONFIG_NVGPU_SIM
@@ -515,6 +514,7 @@ nvgpu-y += \
 	common/fence/fence.o \
 	common/ecc.o \
 	common/ce/ce.o \
+	common/ce/ce_app.o \
 	common/debugger.o
 nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \

View File

@@ -172,9 +172,6 @@ NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_SIM
 CONFIG_NVGPU_COMPRESSION := 1
 NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_COMPRESSION
-CONFIG_NVGPU_CE := 1
-NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_CE
 # Enable non FUSA HALs for normal build
 CONFIG_NVGPU_HAL_NON_FUSA := 1
 NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_HAL_NON_FUSA

View File

@@ -140,6 +140,7 @@ srcs += common/utils/assert.c \
 	common/fifo/pbdma_status.c \
 	common/mc/mc.c \
 	common/rc/rc.c \
+	common/ce/ce.c \
 	hal/init/hal_gv11b.c \
 	hal/init/hal_gv11b_litter.c \
 	hal/init/hal_init.c \
@@ -365,10 +366,6 @@ srcs += hal/regops/regops_gm20b.c \
 endif
 endif
-ifeq ($(CONFIG_NVGPU_CE),1)
-srcs += common/ce/ce.c
-endif
 ifeq ($(CONFIG_NVGPU_KERNEL_MODE_SUBMIT),1)
 srcs += common/fifo/submit.c
 endif
@@ -581,6 +578,7 @@ srcs += common/sec2/sec2.c \
 	common/mm/allocators/page_allocator.c \
 	common/mm/vidmem.c \
 	common/pramin.c \
+	common/ce/ce_app.c \
 	hal/mm/mm_gv100.c \
 	hal/mm/mm_tu104.c \
 	hal/mc/mc_gv100.c \

View File

@@ -23,404 +23,17 @@
#include <nvgpu/types.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/engines.h>
#include <nvgpu/os_sched.h>
#include <nvgpu/channel.h>
#include <nvgpu/dma.h>
#include <nvgpu/utils.h>
#include <nvgpu/fence.h>
#include <nvgpu/ce.h>
#include <nvgpu/power_features/cg.h>
#include "common/ce/ce_priv.h"
static inline u32 nvgpu_ce_get_valid_launch_flags(struct gk20a *g,
u32 launch_flags)
{
#ifdef CONFIG_NVGPU_DGPU
/*
* there is no local memory available,
* don't allow local memory related CE flags
*/
if (g->mm.vidmem.size == 0ULL) {
launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
NVGPU_CE_DST_LOCATION_LOCAL_FB);
}
#endif
return launch_flags;
}
int nvgpu_ce_execute_ops(struct gk20a *g,
u32 ce_ctx_id,
u64 src_buf,
u64 dst_buf,
u64 size,
unsigned int payload,
u32 launch_flags,
u32 request_operation,
u32 submit_flags,
struct nvgpu_fence_type **fence_out)
{
int ret = -EPERM;
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
bool found = false;
u32 *cmd_buf_cpu_va;
u64 cmd_buf_gpu_va = 0UL;
u32 method_size;
u32 cmd_buf_read_offset;
u32 dma_copy_class;
struct nvgpu_gpfifo_entry gpfifo;
struct nvgpu_channel_fence fence = {0U, 0U};
struct nvgpu_fence_type *ce_cmd_buf_fence_out = NULL;
if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
goto end;
}
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
if (ce_ctx->ctx_id == ce_ctx_id) {
found = true;
break;
}
}
nvgpu_mutex_release(&ce_app->app_mutex);
if (!found) {
ret = -EINVAL;
goto end;
}
if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
ret = -ENODEV;
goto end;
}
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS;
cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
(NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF /
U32(sizeof(u32))));
cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] != NULL) {
struct nvgpu_fence_type **prev_post_fence =
&ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset];
ret = nvgpu_fence_wait(g, *prev_post_fence,
nvgpu_get_poll_timeout(g));
nvgpu_fence_put(*prev_post_fence);
*prev_post_fence = NULL;
if (ret != 0) {
goto noop;
}
}
cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va +
(u64)(cmd_buf_read_offset * sizeof(u32)));
dma_copy_class = g->ops.get_litter_value(g, GPU_LIT_DMA_COPY_CLASS);
method_size = nvgpu_ce_prepare_submit(src_buf,
dst_buf,
size,
&cmd_buf_cpu_va[cmd_buf_read_offset],
NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
payload,
nvgpu_ce_get_valid_launch_flags(g, launch_flags),
request_operation,
dma_copy_class);
if (method_size != 0U) {
/* store the element into gpfifo */
g->ops.pbdma.format_gpfifo_entry(g, &gpfifo,
cmd_buf_gpu_va, method_size);
/*
* take always the postfence as it is needed for protecting the
* ce context
*/
submit_flags |= NVGPU_SUBMIT_FLAGS_FENCE_GET;
nvgpu_smp_wmb();
ret = nvgpu_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo,
1, submit_flags, &fence, &ce_cmd_buf_fence_out);
if (ret == 0) {
ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] =
ce_cmd_buf_fence_out;
if (fence_out != NULL) {
nvgpu_fence_get(ce_cmd_buf_fence_out);
*fence_out = ce_cmd_buf_fence_out;
}
/* Next available command buffer queue Index */
++ce_ctx->cmd_buf_read_queue_offset;
}
} else {
ret = -ENOMEM;
}
noop:
nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
end:
return ret;
}
/* static CE app api */
static void nvgpu_ce_put_fences(struct nvgpu_ce_gpu_ctx *ce_ctx)
{
u32 i;
for (i = 0U; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) {
struct nvgpu_fence_type **fence = &ce_ctx->postfences[i];
if (*fence != NULL) {
nvgpu_fence_put(*fence);
}
*fence = NULL;
}
}
/* caller must hold ce_app->app_mutex */
static void nvgpu_ce_delete_gpu_context_locked(struct nvgpu_ce_gpu_ctx *ce_ctx)
{
struct nvgpu_list_node *list = &ce_ctx->list;
ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
ce_ctx->tsg->abortable = true;
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) {
nvgpu_ce_put_fences(ce_ctx);
nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
}
/*
* free the channel
* nvgpu_channel_close() will also unbind the channel from TSG
*/
nvgpu_channel_close(ce_ctx->ch);
nvgpu_ref_put(&ce_ctx->tsg->refcount, nvgpu_tsg_release);
/* housekeeping on app */
if ((list->prev != NULL) && (list->next != NULL)) {
nvgpu_list_del(list);
}
nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
nvgpu_mutex_destroy(&ce_ctx->gpu_ctx_mutex);
nvgpu_kfree(ce_ctx->g, ce_ctx);
}
static inline unsigned int nvgpu_ce_get_method_size(u32 request_operation,
u64 size)
{
/* failure size */
unsigned int methodsize = UINT_MAX;
unsigned int iterations = 0;
u32 shift;
u64 chunk = size;
u32 height, width;
while (chunk != 0ULL) {
iterations++;
shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
MAX_CE_SHIFT;
width = chunk >> shift;
height = BIT32(shift);
width = MAX_CE_ALIGN(width);
chunk -= (u64) height * width;
}
if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
methodsize = (2U + (16U * iterations)) *
(unsigned int)sizeof(u32);
} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
methodsize = (2U + (15U * iterations)) *
(unsigned int)sizeof(u32);
}
return methodsize;
}
u32 nvgpu_ce_prepare_submit(u64 src_buf,
u64 dst_buf,
u64 size,
u32 *cmd_buf_cpu_va,
u32 max_cmd_buf_size,
unsigned int payload,
u32 launch_flags,
u32 request_operation,
u32 dma_copy_class)
{
u32 launch = 0;
u32 methodSize = 0;
u64 offset = 0;
u64 chunk_size = 0;
u64 chunk = size;
/* failure case handling */
if ((nvgpu_ce_get_method_size(request_operation, size) >
max_cmd_buf_size) || (size == 0ULL) ||
(request_operation > NVGPU_CE_MEMSET)) {
return 0;
}
/* set the channel object */
cmd_buf_cpu_va[methodSize++] = 0x20018000;
cmd_buf_cpu_va[methodSize++] = dma_copy_class;
/*
* The purpose clear the memory in 2D rectangles. We get the ffs to
* determine the number of lines to copy. The only constraint is that
* maximum number of pixels per line is 4Gpix - 1, which is awkward for
* calculation, so we settle to 2Gpix per line to make calculatione
* more agreable
*/
/* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in
* a single submit, we are going to try to clear a range of up to 2Gpix
* multiple lines. Because we want to copy byte aligned we will be
* setting 1 byte pixels */
/*
* per iteration
* <------------------------- 40 bits ------------------------------>
* 1 <------ ffs ------->
* <-----------up to 30 bits----------->
*/
while (chunk != 0ULL) {
u32 width, height, shift;
/*
* We will be aligning to bytes, making the maximum number of
* pix per line 2Gb
*/
shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
MAX_CE_SHIFT;
height = chunk >> shift;
width = BIT32(shift);
height = MAX_CE_ALIGN(height);
chunk_size = (u64) height * width;
/* reset launch flag */
launch = 0;
if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
/* setup the source */
cmd_buf_cpu_va[methodSize++] = 0x20028100;
cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
cmd_buf_cpu_va[methodSize++] = 0x20018098;
if ((launch_flags &
NVGPU_CE_SRC_LOCATION_LOCAL_FB) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000000;
} else if ((launch_flags &
NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000002;
} else {
cmd_buf_cpu_va[methodSize++] = 0x00000001;
}
launch |= 0x00001000U;
} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
/* Remap from component A on 1 byte wide pixels */
cmd_buf_cpu_va[methodSize++] = 0x200181c2;
cmd_buf_cpu_va[methodSize++] = 0x00000004;
cmd_buf_cpu_va[methodSize++] = 0x200181c0;
cmd_buf_cpu_va[methodSize++] = payload;
launch |= 0x00000400U;
} else {
/* Illegal size */
return 0;
}
/* setup the destination/output */
cmd_buf_cpu_va[methodSize++] = 0x20068102;
cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
/* Pitch in/out */
cmd_buf_cpu_va[methodSize++] = width;
cmd_buf_cpu_va[methodSize++] = width;
/* width and line count */
cmd_buf_cpu_va[methodSize++] = width;
cmd_buf_cpu_va[methodSize++] = height;
cmd_buf_cpu_va[methodSize++] = 0x20018099;
if ((launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000000;
} else if ((launch_flags &
NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000002;
} else {
cmd_buf_cpu_va[methodSize++] = 0x00000001;
}
launch |= 0x00002005U;
if ((launch_flags &
NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
launch |= 0x00000000U;
} else {
launch |= 0x00000080U;
}
if ((launch_flags &
NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
launch |= 0x00000000U;
} else {
launch |= 0x00000100U;
}
cmd_buf_cpu_va[methodSize++] = 0x200180c0;
cmd_buf_cpu_va[methodSize++] = launch;
offset += chunk_size;
chunk -= chunk_size;
}
return methodSize;
}
/* global CE app related apis */
int nvgpu_ce_init_support(struct gk20a *g)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
u32 ce_reset_mask;
if (g->ops.ce.set_pce2lce_mapping != NULL) {
g->ops.ce.set_pce2lce_mapping(g);
}
if (unlikely(ce_app == NULL)) {
ce_app = nvgpu_kzalloc(g, sizeof(*ce_app));
if (ce_app == NULL) {
return -ENOMEM;
}
g->ce_app = ce_app;
}
ce_reset_mask = nvgpu_engine_get_all_ce_reset_mask(g);
g->ops.mc.reset(g, ce_reset_mask);
@@ -429,226 +42,9 @@ int nvgpu_ce_init_support(struct gk20a *g)
nvgpu_cg_blcg_ce_load_enable(g);
if (ce_app->initialised) {
/* assume this happen during poweron/poweroff GPU sequence */
ce_app->app_state = NVGPU_CE_ACTIVE;
return 0;
}
nvgpu_log(g, gpu_dbg_fn, "ce: init");
nvgpu_mutex_init(&ce_app->app_mutex);
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_init_list_node(&ce_app->allocated_contexts);
ce_app->ctx_count = 0;
ce_app->next_ctx_id = 0;
ce_app->initialised = true;
ce_app->app_state = NVGPU_CE_ACTIVE;
nvgpu_mutex_release(&ce_app->app_mutex);
if (g->ops.ce.init_prod_values != NULL) {
g->ops.ce.init_prod_values(g);
}
nvgpu_log(g, gpu_dbg_cde_ctx, "ce: init finished");
return 0;
}
void nvgpu_ce_destroy(struct gk20a *g)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
if (ce_app == NULL) {
return;
}
if (ce_app->initialised == false) {
goto free;
}
ce_app->app_state = NVGPU_CE_SUSPEND;
ce_app->initialised = false;
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
nvgpu_ce_delete_gpu_context_locked(ce_ctx);
}
nvgpu_init_list_node(&ce_app->allocated_contexts);
ce_app->ctx_count = 0;
ce_app->next_ctx_id = 0;
nvgpu_mutex_release(&ce_app->app_mutex);
nvgpu_mutex_destroy(&ce_app->app_mutex);
free:
nvgpu_kfree(g, ce_app);
g->ce_app = NULL;
}
void nvgpu_ce_suspend(struct gk20a *g)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
if (ce_app == NULL || !ce_app->initialised) {
return;
}
ce_app->app_state = NVGPU_CE_SUSPEND;
}
/* CE app utility functions */
u32 nvgpu_ce_create_context(struct gk20a *g,
u32 runlist_id,
int timeslice,
int runlist_level)
{
struct nvgpu_ce_gpu_ctx *ce_ctx;
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_setup_bind_args setup_bind_args;
u32 ctx_id = NVGPU_CE_INVAL_CTX_ID;
int err = 0;
if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
return ctx_id;
}
ce_ctx = nvgpu_kzalloc(g, sizeof(*ce_ctx));
if (ce_ctx == NULL) {
return ctx_id;
}
nvgpu_mutex_init(&ce_ctx->gpu_ctx_mutex);
ce_ctx->g = g;
ce_ctx->cmd_buf_read_queue_offset = 0;
ce_ctx->vm = g->mm.ce.vm;
/* allocate a tsg if needed */
ce_ctx->tsg = nvgpu_tsg_open(g, nvgpu_current_pid(g));
if (ce_ctx->tsg == NULL) {
nvgpu_err(g, "ce: gk20a tsg not available");
goto end;
}
/* this TSG should never be aborted */
ce_ctx->tsg->abortable = false;
/* always kernel client needs privileged channel */
ce_ctx->ch = nvgpu_channel_open_new(g, runlist_id, true,
nvgpu_current_pid(g), nvgpu_current_tid(g));
if (ce_ctx->ch == NULL) {
nvgpu_err(g, "ce: gk20a channel not available");
goto end;
}
#ifdef CONFIG_NVGPU_CHANNEL_WDT
ce_ctx->ch->wdt.enabled = false;
#endif
/* bind the channel to the vm */
err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
if (err != 0) {
nvgpu_err(g, "ce: could not bind vm");
goto end;
}
err = nvgpu_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
if (err != 0) {
nvgpu_err(g, "ce: unable to bind to tsg");
goto end;
}
setup_bind_args.num_gpfifo_entries = 1024;
setup_bind_args.num_inflight_jobs = 0;
setup_bind_args.flags = 0;
err = nvgpu_channel_setup_bind(ce_ctx->ch, &setup_bind_args);
if (err != 0) {
nvgpu_err(g, "ce: unable to setup and bind channel");
goto end;
}
/* allocate command buffer from sysmem */
err = nvgpu_dma_alloc_map_sys(ce_ctx->vm,
NVGPU_CE_MAX_INFLIGHT_JOBS *
NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
&ce_ctx->cmd_buf_mem);
if (err != 0) {
nvgpu_err(g,
"ce: alloc command buffer failed");
goto end;
}
(void) memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00,
ce_ctx->cmd_buf_mem.size);
#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
/* -1 means default channel timeslice value */
if (timeslice != -1) {
err = g->ops.tsg.set_timeslice(ce_ctx->tsg, timeslice);
if (err != 0) {
nvgpu_err(g, "ce: set timesliced failed for CE context");
goto end;
}
}
/* -1 means default channel runlist level */
if (runlist_level != -1) {
err = nvgpu_tsg_set_interleave(ce_ctx->tsg, runlist_level);
if (err != 0) {
nvgpu_err(g, "ce: set runlist interleave failed");
goto end;
}
}
#endif
nvgpu_mutex_acquire(&ce_app->app_mutex);
ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
nvgpu_list_add(&ce_ctx->list, &ce_app->allocated_contexts);
++ce_app->next_ctx_id;
++ce_app->ctx_count;
nvgpu_mutex_release(&ce_app->app_mutex);
ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
end:
if (ctx_id == NVGPU_CE_INVAL_CTX_ID) {
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_ce_delete_gpu_context_locked(ce_ctx);
nvgpu_mutex_release(&ce_app->app_mutex);
}
return ctx_id;
}
void nvgpu_ce_delete_context(struct gk20a *g,
u32 ce_ctx_id)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
if (ce_app == NULL || !ce_app->initialised ||
ce_app->app_state != NVGPU_CE_ACTIVE) {
return;
}
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
if (ce_ctx->ctx_id == ce_ctx_id) {
nvgpu_ce_delete_gpu_context_locked(ce_ctx);
--ce_app->ctx_count;
break;
}
}
nvgpu_mutex_release(&ce_app->app_mutex);
}

View File

@@ -0,0 +1,639 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/types.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/engines.h>
#include <nvgpu/os_sched.h>
#include <nvgpu/channel.h>
#include <nvgpu/dma.h>
#include <nvgpu/utils.h>
#include <nvgpu/fence.h>
#include <nvgpu/ce.h>
#include <nvgpu/power_features/cg.h>
#include "common/ce/ce_priv.h"
static inline u32 nvgpu_ce_get_valid_launch_flags(struct gk20a *g,
u32 launch_flags)
{
#ifdef CONFIG_NVGPU_DGPU
/*
* there is no local memory available,
* don't allow local memory related CE flags
*/
if (g->mm.vidmem.size == 0ULL) {
launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
NVGPU_CE_DST_LOCATION_LOCAL_FB);
}
#endif
return launch_flags;
}
int nvgpu_ce_execute_ops(struct gk20a *g,
u32 ce_ctx_id,
u64 src_buf,
u64 dst_buf,
u64 size,
unsigned int payload,
u32 launch_flags,
u32 request_operation,
u32 submit_flags,
struct nvgpu_fence_type **fence_out)
{
int ret = -EPERM;
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
bool found = false;
u32 *cmd_buf_cpu_va;
u64 cmd_buf_gpu_va = 0UL;
u32 method_size;
u32 cmd_buf_read_offset;
u32 dma_copy_class;
struct nvgpu_gpfifo_entry gpfifo;
struct nvgpu_channel_fence fence = {0U, 0U};
struct nvgpu_fence_type *ce_cmd_buf_fence_out = NULL;
if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
goto end;
}
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
if (ce_ctx->ctx_id == ce_ctx_id) {
found = true;
break;
}
}
nvgpu_mutex_release(&ce_app->app_mutex);
if (!found) {
ret = -EINVAL;
goto end;
}
if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
ret = -ENODEV;
goto end;
}
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS;
cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
(NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF /
U32(sizeof(u32))));
cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] != NULL) {
struct nvgpu_fence_type **prev_post_fence =
&ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset];
ret = nvgpu_fence_wait(g, *prev_post_fence,
nvgpu_get_poll_timeout(g));
nvgpu_fence_put(*prev_post_fence);
*prev_post_fence = NULL;
if (ret != 0) {
goto noop;
}
}
cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va +
(u64)(cmd_buf_read_offset * sizeof(u32)));
dma_copy_class = g->ops.get_litter_value(g, GPU_LIT_DMA_COPY_CLASS);
method_size = nvgpu_ce_prepare_submit(src_buf,
dst_buf,
size,
&cmd_buf_cpu_va[cmd_buf_read_offset],
NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
payload,
nvgpu_ce_get_valid_launch_flags(g, launch_flags),
request_operation,
dma_copy_class);
if (method_size != 0U) {
/* store the element into gpfifo */
g->ops.pbdma.format_gpfifo_entry(g, &gpfifo,
cmd_buf_gpu_va, method_size);
/*
* take always the postfence as it is needed for protecting the
* ce context
*/
submit_flags |= NVGPU_SUBMIT_FLAGS_FENCE_GET;
nvgpu_smp_wmb();
ret = nvgpu_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo,
1, submit_flags, &fence, &ce_cmd_buf_fence_out);
if (ret == 0) {
ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] =
ce_cmd_buf_fence_out;
if (fence_out != NULL) {
nvgpu_fence_get(ce_cmd_buf_fence_out);
*fence_out = ce_cmd_buf_fence_out;
}
/* Next available command buffer queue Index */
++ce_ctx->cmd_buf_read_queue_offset;
}
} else {
ret = -ENOMEM;
}
noop:
nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
end:
return ret;
}
/* static CE app api */
static void nvgpu_ce_put_fences(struct nvgpu_ce_gpu_ctx *ce_ctx)
{
u32 i;
for (i = 0U; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) {
struct nvgpu_fence_type **fence = &ce_ctx->postfences[i];
if (*fence != NULL) {
nvgpu_fence_put(*fence);
}
*fence = NULL;
}
}
/* caller must hold ce_app->app_mutex */
static void nvgpu_ce_delete_gpu_context_locked(struct nvgpu_ce_gpu_ctx *ce_ctx)
{
struct nvgpu_list_node *list = &ce_ctx->list;
ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
ce_ctx->tsg->abortable = true;
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) {
nvgpu_ce_put_fences(ce_ctx);
nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
}
/*
* free the channel
* nvgpu_channel_close() will also unbind the channel from TSG
*/
nvgpu_channel_close(ce_ctx->ch);
nvgpu_ref_put(&ce_ctx->tsg->refcount, nvgpu_tsg_release);
/* housekeeping on app */
if ((list->prev != NULL) && (list->next != NULL)) {
nvgpu_list_del(list);
}
nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
nvgpu_mutex_destroy(&ce_ctx->gpu_ctx_mutex);
nvgpu_kfree(ce_ctx->g, ce_ctx);
}
static inline unsigned int nvgpu_ce_get_method_size(u32 request_operation,
u64 size)
{
/* failure size */
unsigned int methodsize = UINT_MAX;
unsigned int iterations = 0;
u32 shift;
u64 chunk = size;
u32 height, width;
while (chunk != 0ULL) {
iterations++;
shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
MAX_CE_SHIFT;
width = chunk >> shift;
height = BIT32(shift);
width = MAX_CE_ALIGN(width);
chunk -= (u64) height * width;
}
if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
methodsize = (2U + (16U * iterations)) *
(unsigned int)sizeof(u32);
} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
methodsize = (2U + (15U * iterations)) *
(unsigned int)sizeof(u32);
}
return methodsize;
}
u32 nvgpu_ce_prepare_submit(u64 src_buf,
u64 dst_buf,
u64 size,
u32 *cmd_buf_cpu_va,
u32 max_cmd_buf_size,
unsigned int payload,
u32 launch_flags,
u32 request_operation,
u32 dma_copy_class)
{
u32 launch = 0;
u32 methodSize = 0;
u64 offset = 0;
u64 chunk_size = 0;
u64 chunk = size;
/* failure case handling */
if ((nvgpu_ce_get_method_size(request_operation, size) >
max_cmd_buf_size) || (size == 0ULL) ||
(request_operation > NVGPU_CE_MEMSET)) {
return 0;
}
/* set the channel object */
cmd_buf_cpu_va[methodSize++] = 0x20018000;
cmd_buf_cpu_va[methodSize++] = dma_copy_class;
/*
* The purpose clear the memory in 2D rectangles. We get the ffs to
* determine the number of lines to copy. The only constraint is that
* maximum number of pixels per line is 4Gpix - 1, which is awkward for
* calculation, so we settle to 2Gpix per line to make calculatione
* more agreable
*/
/*
* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in
* a single submit, we are going to try to clear a range of up to 2Gpix
* multiple lines. Because we want to copy byte aligned we will be
* setting 1 byte pixels
*/
/*
* per iteration
* <------------------------- 40 bits ------------------------------>
* 1 <------ ffs ------->
* <-----------up to 30 bits----------->
*/
while (chunk != 0ULL) {
u32 width, height, shift;
/*
* We will be aligning to bytes, making the maximum number of
* pix per line 2Gb
*/
shift = (MAX_CE_ALIGN(chunk) != 0ULL) ?
(nvgpu_ffs(MAX_CE_ALIGN(chunk)) - 1UL) :
MAX_CE_SHIFT;
height = chunk >> shift;
width = BIT32(shift);
height = MAX_CE_ALIGN(height);
chunk_size = (u64) height * width;
/* reset launch flag */
launch = 0;
if ((request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) != 0U) {
/* setup the source */
cmd_buf_cpu_va[methodSize++] = 0x20028100;
cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
cmd_buf_cpu_va[methodSize++] = 0x20018098;
if ((launch_flags &
NVGPU_CE_SRC_LOCATION_LOCAL_FB) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000000;
} else if ((launch_flags &
NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000002;
} else {
cmd_buf_cpu_va[methodSize++] = 0x00000001;
}
launch |= 0x00001000U;
} else if ((request_operation & NVGPU_CE_MEMSET) != 0U) {
/* Remap from component A on 1 byte wide pixels */
cmd_buf_cpu_va[methodSize++] = 0x200181c2;
cmd_buf_cpu_va[methodSize++] = 0x00000004;
cmd_buf_cpu_va[methodSize++] = 0x200181c0;
cmd_buf_cpu_va[methodSize++] = payload;
launch |= 0x00000400U;
} else {
/* Illegal size */
return 0;
}
/* setup the destination/output */
cmd_buf_cpu_va[methodSize++] = 0x20068102;
cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
/* Pitch in/out */
cmd_buf_cpu_va[methodSize++] = width;
cmd_buf_cpu_va[methodSize++] = width;
/* width and line count */
cmd_buf_cpu_va[methodSize++] = width;
cmd_buf_cpu_va[methodSize++] = height;
cmd_buf_cpu_va[methodSize++] = 0x20018099;
if ((launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000000;
} else if ((launch_flags &
NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) != 0U) {
cmd_buf_cpu_va[methodSize++] = 0x00000002;
} else {
cmd_buf_cpu_va[methodSize++] = 0x00000001;
}
launch |= 0x00002005U;
if ((launch_flags &
NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
launch |= 0x00000000U;
} else {
launch |= 0x00000080U;
}
if ((launch_flags &
NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) != 0U) {
launch |= 0x00000000U;
} else {
launch |= 0x00000100U;
}
cmd_buf_cpu_va[methodSize++] = 0x200180c0;
cmd_buf_cpu_va[methodSize++] = launch;
offset += chunk_size;
chunk -= chunk_size;
}
return methodSize;
}
/* global CE app related apis */
int nvgpu_ce_app_init_support(struct gk20a *g)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
if (unlikely(ce_app == NULL)) {
ce_app = nvgpu_kzalloc(g, sizeof(*ce_app));
if (ce_app == NULL) {
return -ENOMEM;
}
g->ce_app = ce_app;
}
if (ce_app->initialised) {
/* assume this happen during poweron/poweroff GPU sequence */
ce_app->app_state = NVGPU_CE_ACTIVE;
return 0;
}
nvgpu_log(g, gpu_dbg_fn, "ce: init");
nvgpu_mutex_init(&ce_app->app_mutex);
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_init_list_node(&ce_app->allocated_contexts);
ce_app->ctx_count = 0;
ce_app->next_ctx_id = 0;
ce_app->initialised = true;
ce_app->app_state = NVGPU_CE_ACTIVE;
nvgpu_mutex_release(&ce_app->app_mutex);
nvgpu_log(g, gpu_dbg_cde_ctx, "ce: init finished");
return 0;
}
void nvgpu_ce_app_destroy(struct gk20a *g)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
if (ce_app == NULL) {
return;
}
if (ce_app->initialised == false) {
goto free;
}
ce_app->app_state = NVGPU_CE_SUSPEND;
ce_app->initialised = false;
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
nvgpu_ce_delete_gpu_context_locked(ce_ctx);
}
nvgpu_init_list_node(&ce_app->allocated_contexts);
ce_app->ctx_count = 0;
ce_app->next_ctx_id = 0;
nvgpu_mutex_release(&ce_app->app_mutex);
nvgpu_mutex_destroy(&ce_app->app_mutex);
free:
nvgpu_kfree(g, ce_app);
g->ce_app = NULL;
}
void nvgpu_ce_app_suspend(struct gk20a *g)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
if (ce_app == NULL || !ce_app->initialised) {
return;
}
ce_app->app_state = NVGPU_CE_SUSPEND;
}
/* CE app utility functions */
u32 nvgpu_ce_app_create_context(struct gk20a *g,
u32 runlist_id,
int timeslice,
int runlist_level)
{
struct nvgpu_ce_gpu_ctx *ce_ctx;
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_setup_bind_args setup_bind_args;
u32 ctx_id = NVGPU_CE_INVAL_CTX_ID;
int err = 0;
if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
return ctx_id;
}
ce_ctx = nvgpu_kzalloc(g, sizeof(*ce_ctx));
if (ce_ctx == NULL) {
return ctx_id;
}
nvgpu_mutex_init(&ce_ctx->gpu_ctx_mutex);
ce_ctx->g = g;
ce_ctx->cmd_buf_read_queue_offset = 0;
ce_ctx->vm = g->mm.ce.vm;
/* allocate a tsg if needed */
ce_ctx->tsg = nvgpu_tsg_open(g, nvgpu_current_pid(g));
if (ce_ctx->tsg == NULL) {
nvgpu_err(g, "ce: gk20a tsg not available");
goto end;
}
/* this TSG should never be aborted */
ce_ctx->tsg->abortable = false;
/* always kernel client needs privileged channel */
ce_ctx->ch = nvgpu_channel_open_new(g, runlist_id, true,
nvgpu_current_pid(g), nvgpu_current_tid(g));
if (ce_ctx->ch == NULL) {
nvgpu_err(g, "ce: gk20a channel not available");
goto end;
}
#ifdef CONFIG_NVGPU_CHANNEL_WDT
ce_ctx->ch->wdt.enabled = false;
#endif
/* bind the channel to the vm */
err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
if (err != 0) {
nvgpu_err(g, "ce: could not bind vm");
goto end;
}
err = nvgpu_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
if (err != 0) {
nvgpu_err(g, "ce: unable to bind to tsg");
goto end;
}
setup_bind_args.num_gpfifo_entries = 1024;
setup_bind_args.num_inflight_jobs = 0;
setup_bind_args.flags = 0;
err = nvgpu_channel_setup_bind(ce_ctx->ch, &setup_bind_args);
if (err != 0) {
nvgpu_err(g, "ce: unable to setup and bind channel");
goto end;
}
/* allocate command buffer from sysmem */
err = nvgpu_dma_alloc_map_sys(ce_ctx->vm,
NVGPU_CE_MAX_INFLIGHT_JOBS *
NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
&ce_ctx->cmd_buf_mem);
if (err != 0) {
nvgpu_err(g,
"ce: alloc command buffer failed");
goto end;
}
(void) memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00,
ce_ctx->cmd_buf_mem.size);
#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
/* -1 means default channel timeslice value */
if (timeslice != -1) {
err = g->ops.tsg.set_timeslice(ce_ctx->tsg, timeslice);
if (err != 0) {
nvgpu_err(g, "ce: set timesliced failed for CE context");
goto end;
}
}
/* -1 means default channel runlist level */
if (runlist_level != -1) {
err = nvgpu_tsg_set_interleave(ce_ctx->tsg, runlist_level);
if (err != 0) {
nvgpu_err(g, "ce: set runlist interleave failed");
goto end;
}
}
#endif
nvgpu_mutex_acquire(&ce_app->app_mutex);
ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
nvgpu_list_add(&ce_ctx->list, &ce_app->allocated_contexts);
++ce_app->next_ctx_id;
++ce_app->ctx_count;
nvgpu_mutex_release(&ce_app->app_mutex);
ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
end:
if (ctx_id == NVGPU_CE_INVAL_CTX_ID) {
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_ce_delete_gpu_context_locked(ce_ctx);
nvgpu_mutex_release(&ce_app->app_mutex);
}
return ctx_id;
}
void nvgpu_ce_app_delete_context(struct gk20a *g,
u32 ce_ctx_id)
{
struct nvgpu_ce_app *ce_app = g->ce_app;
struct nvgpu_ce_gpu_ctx *ce_ctx, *ce_ctx_save;
if (ce_app == NULL || !ce_app->initialised ||
ce_app->app_state != NVGPU_CE_ACTIVE) {
return;
}
nvgpu_mutex_acquire(&ce_app->app_mutex);
nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
&ce_app->allocated_contexts, nvgpu_ce_gpu_ctx, list) {
if (ce_ctx->ctx_id == ce_ctx_id) {
nvgpu_ce_delete_gpu_context_locked(ce_ctx);
--ce_app->ctx_count;
break;
}
}
nvgpu_mutex_release(&ce_app->app_mutex);
}

View File

@@ -47,9 +47,6 @@
 #include <nvgpu/channel_sync.h>
 #include <nvgpu/gr/gr.h>
 #include <nvgpu/nvgpu_init.h>
-#ifndef CONFIG_NVGPU_CE
-#include <nvgpu/engines.h>
-#endif
 #ifdef CONFIG_NVGPU_TRACE
 #include <trace/events/gk20a.h>
@@ -136,10 +133,8 @@ int nvgpu_prepare_poweroff(struct gk20a *g)
 	nvgpu_falcon_sw_free(g, FALCON_ID_GSPLITE);
 	nvgpu_falcon_sw_free(g, FALCON_ID_NVDEC);
 	nvgpu_falcon_sw_free(g, FALCON_ID_SEC2);
-#endif
-#ifdef CONFIG_NVGPU_CE
-	nvgpu_ce_suspend(g);
+	nvgpu_ce_app_suspend(g);
 #endif
 #ifdef CONFIG_NVGPU_DGPU
@@ -494,17 +489,19 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 	g->ops.fb.set_debug_mode(g, g->mmu_debug_ctrl);
 #endif
-#ifdef CONFIG_NVGPU_CE
 	err = nvgpu_ce_init_support(g);
 	if (err != 0) {
 		nvgpu_err(g, "failed to init ce");
 		goto done;
 	}
-#else
-	g->ops.mc.reset(g, nvgpu_engine_get_all_ce_reset_mask(g));
-#endif
 #ifdef CONFIG_NVGPU_DGPU
+	err = nvgpu_ce_app_init_support(g);
+	if (err != 0) {
+		nvgpu_err(g, "failed to init ce app");
+		goto done;
+	}
 	if (g->ops.xve.available_speeds != NULL) {
 		u32 speed;
@@ -655,8 +652,8 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount)
 	nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!");
-#ifdef CONFIG_NVGPU_CE
-	nvgpu_ce_destroy(g);
+#ifdef CONFIG_NVGPU_DGPU
+	nvgpu_ce_app_destroy(g);
 #endif
 #ifdef CONFIG_NVGPU_COMPRESSION

View File

@@ -114,19 +114,17 @@ static int nvgpu_alloc_sysmem_flush(struct gk20a *g)
 	return nvgpu_dma_alloc_sys(g, SZ_4K, &g->mm.sysmem_flush);
 }
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 static void nvgpu_remove_mm_ce_support(struct mm_gk20a *mm)
 {
-#ifdef CONFIG_NVGPU_DGPU
 	struct gk20a *g = gk20a_from_mm(mm);
 	if (mm->vidmem.ce_ctx_id != NVGPU_CE_INVAL_CTX_ID) {
-		nvgpu_ce_delete_context(g, mm->vidmem.ce_ctx_id);
+		nvgpu_ce_app_delete_context(g, mm->vidmem.ce_ctx_id);
 	}
 	mm->vidmem.ce_ctx_id = NVGPU_CE_INVAL_CTX_ID;
 	nvgpu_vm_put(mm->ce.vm);
-#endif
 }
 #endif
@@ -303,14 +301,13 @@ static int nvgpu_init_mmu_debug(struct mm_gk20a *mm)
 		return -ENOMEM;
 }
-#ifdef CONFIG_NVGPU_CE
+#if defined(CONFIG_NVGPU_DGPU)
 void nvgpu_init_mm_ce_context(struct gk20a *g)
 {
-#if defined(CONFIG_NVGPU_DGPU)
 	if (g->mm.vidmem.size > 0U &&
 		(g->mm.vidmem.ce_ctx_id == NVGPU_CE_INVAL_CTX_ID)) {
 		g->mm.vidmem.ce_ctx_id =
-			nvgpu_ce_create_context(g,
+			nvgpu_ce_app_create_context(g,
 			nvgpu_engine_get_fast_ce_runlist_id(g),
 			-1,
 			-1);
@@ -320,9 +317,8 @@ void nvgpu_init_mm_ce_context(struct gk20a *g)
"Failed to allocate CE context for vidmem page clearing support"); "Failed to allocate CE context for vidmem page clearing support");
} }
} }
#endif
} }
#endif /* NVGPU_FENCE_CE */ #endif
static int nvgpu_init_mm_reset_enable_hw(struct gk20a *g) static int nvgpu_init_mm_reset_enable_hw(struct gk20a *g)
{ {
@@ -523,7 +519,7 @@ static int nvgpu_init_mm_setup_sw(struct gk20a *g)
 	}
 	mm->remove_support = nvgpu_remove_mm_support;
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	mm->remove_ce_support = nvgpu_remove_mm_ce_support;
 #endif

View File

@@ -103,7 +103,7 @@ static int nvgpu_vidmem_do_clear_all(struct gk20a *g)
vidmem_dbg(g, "Clearing all VIDMEM:"); vidmem_dbg(g, "Clearing all VIDMEM:");
#ifdef CONFIG_NVGPU_CE #ifdef CONFIG_NVGPU_DGPU
err = nvgpu_ce_execute_ops(g, err = nvgpu_ce_execute_ops(g,
mm->vidmem.ce_ctx_id, mm->vidmem.ce_ctx_id,
0, 0,
@@ -119,6 +119,9 @@ static int nvgpu_vidmem_do_clear_all(struct gk20a *g)
"Failed to clear vidmem : %d", err); "Failed to clear vidmem : %d", err);
return err; return err;
} }
#else
/* fail due to lack of ce app support */
return -ENOSYS;
#endif #endif
if (fence_out != NULL) { if (fence_out != NULL) {
@@ -464,7 +467,7 @@ int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem)
 		nvgpu_fence_put(last_fence);
 	}
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	err = nvgpu_ce_execute_ops(g,
 			g->mm.vidmem.ce_ctx_id,
 			0,
@@ -476,12 +479,12 @@ int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem)
 			0,
 			&fence_out);
 #else
-	/* fail due to lack of ce support */
+	/* fail due to lack of ce app support */
 	err = -ENOSYS;
 #endif
 	if (err != 0) {
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 		nvgpu_err(g,
 			"Failed nvgpu_ce_execute_ops[%d]", err);
 #endif

View File

@@ -69,17 +69,19 @@ enum {
 	NVGPU_CE_GPU_CTX_DELETED = (1 << 1),
 };
-/* global CE app related apis */
 int nvgpu_ce_init_support(struct gk20a *g);
-void nvgpu_ce_suspend(struct gk20a *g);
-void nvgpu_ce_destroy(struct gk20a *g);
+/* global CE app related apis */
+int nvgpu_ce_app_init_support(struct gk20a *g);
+void nvgpu_ce_app_suspend(struct gk20a *g);
+void nvgpu_ce_app_destroy(struct gk20a *g);
 /* CE app utility functions */
-u32 nvgpu_ce_create_context(struct gk20a *g,
+u32 nvgpu_ce_app_create_context(struct gk20a *g,
 		u32 runlist_id,
 		int timeslice,
 		int runlist_level);
-void nvgpu_ce_delete_context(struct gk20a *g,
+void nvgpu_ce_app_delete_context(struct gk20a *g,
 		u32 ce_ctx_id);
 int nvgpu_ce_execute_ops(struct gk20a *g,
 		u32 ce_ctx_id,

View File

@@ -242,7 +242,7 @@ struct mm_gk20a {
 	struct nvgpu_mem hw_fault_buf[NVGPU_MMU_FAULT_TYPE_NUM];
 	struct mmu_fault_info fault_info[NVGPU_MMU_FAULT_TYPE_NUM];
 	struct nvgpu_mutex hub_isr_mutex;
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	/*
 	 * Separate function to cleanup the CE since it requires a channel to
 	 * be closed which must happen before fifo cleanup.
@@ -317,7 +317,7 @@ static inline u64 nvgpu_gmmu_va_small_page_limit(void)
 	return ((u64)SZ_1G * 56U);
 }
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 void nvgpu_init_mm_ce_context(struct gk20a *g);
 #endif
 int nvgpu_init_mm_support(struct gk20a *g);

View File

@@ -469,7 +469,7 @@ int gk20a_pm_finalize_poweron(struct device *dev)
 	if (err)
 		goto done;
-#ifdef CONFIG_NVGPU_CE
+#ifdef CONFIG_NVGPU_DGPU
 	nvgpu_init_mm_ce_context(g);
 #endif