Mirror of git://nv-tegra.nvidia.com/linux-nv-oot.git (synced 2025-12-24 10:11:26 +03:00)
nvdla: kmd: switch to on-demand cleanup tasklist
[1] In the event of a busy CPU, a delayed interrupt or a delayed
callback from nvhost will result in submission failure.
[2] This commit fixes the issue by performing mandatory and controlled
cleanups (see the condensed sketch below):
- At most 2 slots are cleaned up prior to submission.
- A complete queue cleanup is performed during suspend preparation.
- A complete queue cleanup is performed after a successful abort operation.
[3] Additionally, the commit fixes some potential leaks in the error
path.
Bug 4503438
Bug 4414867
Change-Id: Ic40f0c4b1f3c653d5d5e613adab01d3cbc3b9722
Signed-off-by: Arvind M <am@nvidia.com>
Signed-off-by: Akshata Bhat <akshatab@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3105861
(cherry picked from commit 438e8f8e96483971798e2d9014ed4a999143d082)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3120798
(cherry picked from commit 5668f6439643d44b3384bcd750a645d8db6ee0c9)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3196672
GVS: buildbot_gerritrpt <buildbot_gerritrpt@nvidia.com>
Reviewed-by: Ken Adams <kadams@nvidia.com>
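For orientation, the cleanup call sites introduced by this change can be summarized as follows. This is a condensed sketch drawn from the diff further down, with unrelated details elided and function bodies simplified; it is not a verbatim copy of the sources.

/* Condensed sketch of the on-demand cleanup flow (details elided). */

/* Limit on number of completed DLA tasks reaped before a submission. */
#define NVDLA_QUEUE_SUBMIT_CLEANUP_LIMIT        2U
/* Limit large enough to drain every DLA and/or EMU task in a queue. */
#define NVDLA_QUEUE_CLEANUP_LIMIT       (MAX_NVDLA_TASK_COUNT + 1U)

int nvdla_get_task_mem(struct nvdla_queue *queue, struct nvdla_task **ptask)
{
        /* Reap at most two completed tasks before taking a task-pool slot. */
        nvdla_queue_task_cleanup(queue, NVDLA_QUEUE_SUBMIT_CLEANUP_LIMIT);
        /* ... allocate the task descriptor from the pool and return it ... */
}

static void nvdla_queue_cleanup_op(struct nvdla_queue *queue)
{
        /* Full drain: invoked during suspend preparation and after a queue abort. */
        nvdla_queue_task_cleanup(queue, NVDLA_QUEUE_CLEANUP_LIMIT);
}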
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
/*
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
/* SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVDLA queue management
*/

@@ -38,8 +38,6 @@
* alloc_table Keep track of the index being assigned
* and freed for a task
* max_task_cnt Maximum task count that can be supported.
* cleanup_done Completion status of cleanup wait.
* cleanup_wait Records wait for cleanup action.
*/

struct nvdla_queue_task_pool {
@@ -50,9 +48,6 @@ struct nvdla_queue_task_pool {

unsigned long alloc_table;
unsigned long max_task_cnt;

struct completion cleanup_done;
int cleanup_wait;
};

static int nvdla_queue_task_pool_alloc(struct platform_device *pdev,
@@ -91,9 +86,6 @@ static int nvdla_queue_task_pool_alloc(struct platform_device *pdev,

mutex_init(&task_pool->lock);

init_completion(&task_pool->cleanup_done);
task_pool->cleanup_wait = 0;

return err;

err_alloc_task_pool:
@@ -233,6 +225,24 @@ void nvdla_queue_deinit(struct nvdla_queue_pool *pool)
pool = NULL;
}

static void nvdla_queue_cleanup(struct nvdla_queue *queue)
{
struct nvdla_queue_pool *pool = queue->pool;

if (pool->ops && pool->ops->cleanup)
pool->ops->cleanup(queue);
}

static void nvdla_queue_cleanup_all(struct nvdla_queue_pool *pool)
{
u32 id;

mutex_lock(&pool->queue_lock);
for_each_set_bit(id, &pool->alloc_table, pool->max_queue_cnt)
nvdla_queue_cleanup(&pool->queues[id]);
mutex_unlock(&pool->queue_lock);
}

#ifdef CONFIG_PM
int nvdla_queue_pool_prepare_suspend(struct nvdla_queue_pool *qpool)
{
@@ -247,6 +257,9 @@ int nvdla_queue_pool_prepare_suspend(struct nvdla_queue_pool *qpool)
struct nvdla_queue_task_pool *tpool = queue->task_pool;
bool nvdla_queue_is_idle;

/* Cleanup the queue before checking the idleness. */
nvdla_queue_cleanup(queue);

mutex_lock(&tpool->lock);
nvdla_queue_is_idle = (tpool->alloc_table == 0ULL);
mutex_unlock(&tpool->lock);
@@ -324,11 +337,22 @@ struct nvdla_queue *nvdla_queue_alloc(struct nvdla_queue_pool *pool,
index = find_first_zero_bit(&pool->alloc_table,
pool->max_queue_cnt);

/* quit if we found a queue */
/* Queue not found on first attempt. */
if (index >= pool->max_queue_cnt) {
dev_err(&pdev->dev, "failed to get free Queue\n");
err = -ENOMEM;
goto err_alloc_queue;

mutex_unlock(&pool->queue_lock);

/* Cleanup and retry one more time before erroring out */
nvdla_queue_cleanup_all(pool);

mutex_lock(&pool->queue_lock);
index = find_first_zero_bit(&pool->alloc_table,
pool->max_queue_cnt);
if (index >= pool->max_queue_cnt) {
dev_err(&pdev->dev, "failed to get free Queue\n");
err = -ENOMEM;
goto err_alloc_queue;
}
}
spec_bar(); /* break_spec_p#1 */

@@ -578,18 +602,6 @@ int nvdla_queue_alloc_task_memory(
struct nvdla_queue_task_pool *task_pool =
(struct nvdla_queue_task_pool *)queue->task_pool;

if (task_pool->cleanup_wait == 1) {
unsigned long timeout =
msecs_to_jiffies(NVDLA_TASK_MEM_AVAIL_RETRY_PERIOD);

/**
* Error intentionally ignored to be catpured as part of
* out-of-range index during allocation.
**/
(void) wait_for_completion_timeout(&task_pool->cleanup_done,
timeout);
}

mutex_lock(&task_pool->lock);

index = find_first_zero_bit(&task_pool->alloc_table,
@@ -597,8 +609,7 @@ int nvdla_queue_alloc_task_memory(

/* quit if pre-allocated task array is not free */
if (index >= task_pool->max_task_cnt) {
dev_warn(&pdev->dev, "failed to get Task Pool Memory\n");
task_pool->cleanup_wait = 1; // wait for cleanup
dev_err(&pdev->dev, "failed to get Task Pool Memory\n");
err = -EAGAIN;
goto err_alloc_task_mem;
}
@@ -638,9 +649,5 @@ void nvdla_queue_free_task_memory(struct nvdla_queue *queue, int index)
mutex_lock(&task_pool->lock);
clear_bit(index, &task_pool->alloc_table);

if (task_pool->cleanup_wait == 1) {
task_pool->cleanup_wait = 0;
complete(&task_pool->cleanup_done);
}
mutex_unlock(&task_pool->lock);
}

@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2016-2023, NVIDIA Corporation. All rights reserved.
/* SPDX-License-Identifier: LicenseRef-NvidiaProprietary */
/* SPDX-FileCopyrightText: Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVHOST Queue management header for T194/T23x
* NVDLA queue management header
*/

#ifndef __NVHOST_NVDLA_QUEUE_H__
@@ -10,9 +9,6 @@

#include <linux/kref.h>

#define NVDLA_TASK_MEM_AVAIL_TIMEOUT_MS 10 /* 10 ms */
#define NVDLA_TASK_MEM_AVAIL_RETRY_PERIOD 1 /* 1 ms */

struct nvdla_queue_task_pool;

/**
@@ -82,7 +78,7 @@ struct nvdla_queue {
* submit submit the given list of tasks to hardware
* get_task_size get the dma size needed for the task in hw
* and the kernel memory size needed for task.
*
* cleanup cleans expired tasks from the tasklist.
*/
struct nvdla_queue_ops {
void (*dump)(struct nvdla_queue *queue, struct seq_file *s);
@@ -90,6 +86,7 @@ struct nvdla_queue_ops {
int (*submit)(struct nvdla_queue *queue, void *task_arg);
void (*get_task_size)(size_t *dma_size, size_t *kmem_size);
int (*set_attribute)(struct nvdla_queue *queue, void *arg);
void (*cleanup)(struct nvdla_queue *queue);
};

/**

@@ -462,6 +462,17 @@ int nvdla_send_cmd(struct platform_device *pdev,
*/
void nvdla_task_put(struct nvdla_task *task);

/**
* nvdla_task_init() initializes task reference count to 1
*
* @task Pointer to task in operation
*
* Return void
*
* This function initializes task reference count
*/
void nvdla_task_init(struct nvdla_task *task);

/**
* nvdla_task_get() increase task reference count
*
@@ -474,18 +485,28 @@ void nvdla_task_put(struct nvdla_task *task);
void nvdla_task_get(struct nvdla_task *task);

/**
* nvdla_task_alloc() allocate task for a give queue
* nvdla_fill_task_desc() fills the task descriptor
*
* @task Pointer to nvdla_task.
* @bypass_exec Task is marked to bypass its execution.
*
* Return allocated task in success, otherwise pointer to err
* Return zero on success, non-zero otherwise.
*
* This function allocates task desc and fills up initial task descriptor as
* task parameter detais
* This function fills up initial task descriptor using the task parameters.
*/
int nvdla_fill_task_desc(struct nvdla_task *task, bool bypass_exec);

/**
* nvdla_unmap_task_memory() unmaps the task memory
*
* @task Pointer to nvdla_task.
*
* Return zero on success, non-zero otherwise.
*
* This function unmaps the task memory, mapped with nvdla_fill_task_desc.
*/
int nvdla_unmap_task_memory(struct nvdla_task *task);

/**
* nvdla_send_postfences() send back fences to UMD
*

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
/*
* NVDLA IOCTL for T194
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
/* SPDX-FileCopyrightText: Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVDLA IOCTL
*/

#include <nvidia/conftest.h>
@@ -1080,7 +1080,7 @@ static int nvdla_submit(struct nvdla_private *priv, void *arg)
nvdla_dbg_info(pdev, "task[%d] mem allocate done", i + 1);

/* Initialize ref for task submit preparation */
kref_init(&task->ref);
nvdla_task_init(task);

/* fill local task param from user args */
err = nvdla_fill_task(queue, buffers, &local_task, task);
@@ -1130,7 +1130,7 @@ static int nvdla_submit(struct nvdla_private *priv, void *arg)
goto fail_to_submit_task;
}
nvdla_dbg_info(pdev, "task[%d] submitted", i + 1);
kref_put(&task->ref, task_free);
nvdla_task_put(task);
}
nvdla_dbg_fn(pdev, "Task submitted, done!");

@@ -1146,12 +1146,18 @@ static int nvdla_submit(struct nvdla_private *priv, void *arg)
fail_to_submit_task:
fail_to_update_postfences:
fail_to_get_fences:
/* Restore to the last successful sequence. */
if (likely(queue->sequence > 0U))
queue->sequence = queue->sequence - 1U;
else
queue->sequence = UINT_MAX;

/* Unmap the memory mapped when populating task descriptor. */
nvdla_unmap_task_memory(task);
fail_to_fill_task_desc:
fail_to_fill_task:
/* Remove ref corresponding task submit preparation */
kref_put(&task->ref, task_free);

/*TODO: traverse list in reverse and delete jobs */
nvdla_task_put(task);
fail_to_get_task_mem:
fail_to_copy_task:
return err;

@@ -1,8 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016-2023, NVIDIA Corporation. All rights reserved.
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
/* SPDX-FileCopyrightText: Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVDLA queue and task management for T194
* NVDLA queue and task management.
*/

#include <linux/arm64-barrier.h>
@@ -28,6 +27,12 @@
#define NVDLA_QUEUE_ABORT_TIMEOUT 10000 /* 10 sec */
#define NVDLA_QUEUE_ABORT_RETRY_PERIOD 500 /* 500 ms */

/* Limit on maximum number of DLA tasks to cleanup during submit. */
#define NVDLA_QUEUE_SUBMIT_CLEANUP_LIMIT 2U

/* Limit to cleanup all DLA and/or EMU tasks in queue */
#define NVDLA_QUEUE_CLEANUP_LIMIT (MAX_NVDLA_TASK_COUNT + 1U)

/* struct to hold information required for nvdla_add_fence_action_cb */
struct nvdla_add_fence_action_cb_args {
struct nvdla_queue *queue;
@@ -79,6 +84,9 @@ static void nvdla_queue_dump_op(struct nvdla_queue *queue, struct seq_file *s)
mutex_unlock(&queue->list_lock);
}

static void nvdla_queue_task_cleanup(struct nvdla_queue *queue,
uint32_t max_dla_cleanup_depth);

int nvdla_get_task_mem(struct nvdla_queue *queue,
struct nvdla_task **ptask)
{
@@ -86,16 +94,16 @@ int nvdla_get_task_mem(struct nvdla_queue *queue,
struct nvdla_task *task = NULL;
struct nvdla_queue_task_mem_info task_mem_info;
struct platform_device *pdev = queue->pool->pdev;
int n_retries = (NVDLA_TASK_MEM_AVAIL_TIMEOUT_MS /
NVDLA_TASK_MEM_AVAIL_RETRY_PERIOD);

nvdla_dbg_fn(pdev, "");

/* Cleanup at most 2 completed tasks. */
nvdla_dbg_info(pdev, "on-demand cleanup prior to submission.");
nvdla_queue_task_cleanup(queue, NVDLA_QUEUE_SUBMIT_CLEANUP_LIMIT);

/* get mem task descriptor and task mem from task_mem_pool */
do {
n_retries = n_retries - 1;
err = nvdla_queue_alloc_task_memory(queue, &task_mem_info);
} while ((n_retries > 0) && (err == -EAGAIN));
task_mem_info.pool_index = -1;
err = nvdla_queue_alloc_task_memory(queue, &task_mem_info);

task = task_mem_info.kmem_addr;
if ((err < 0) || !task)
@@ -107,13 +115,18 @@ int nvdla_get_task_mem(struct nvdla_queue *queue,
goto fail_to_aligned_dma;
}

task->queue = queue;
task->task_desc = task_mem_info.va;
task->task_desc_pa = task_mem_info.dma_addr;
task->pool_index = task_mem_info.pool_index;

*ptask = task;

return 0;

fail_to_aligned_dma:
if (task_mem_info.pool_index != -1)
nvdla_queue_free_task_memory(queue, task_mem_info.pool_index);
fail_to_assign_pool:
return err;
}
@@ -149,6 +162,18 @@ void nvdla_task_put(struct nvdla_task *task)
nvdla_queue_put(queue);
}

void nvdla_task_init(struct nvdla_task *task)
{
struct nvdla_queue *queue = task->queue;
struct platform_device *pdev = queue->pool->pdev;

nvdla_dbg_fn(pdev, "task:[%p]", task);

/* update queue refcnt */
nvdla_queue_get(task->queue);
kref_init(&task->ref);
}

void nvdla_task_get(struct nvdla_task *task)
{
struct nvdla_queue *queue = task->queue;
@@ -158,11 +183,10 @@ void nvdla_task_get(struct nvdla_task *task)

/* update queue refcnt */
nvdla_queue_get(task->queue);

kref_get(&task->ref);
}

static int nvdla_unmap_task_memory(struct nvdla_task *task)
int nvdla_unmap_task_memory(struct nvdla_task *task)
{
int ii;
struct nvdla_queue *queue = task->queue;
@@ -320,35 +344,44 @@ static inline size_t nvdla_profile_status_offset(struct nvdla_task *task)
return offset;
}


#if IS_ENABLED(CONFIG_TEGRA_GRHOST)
/*
* This function definition can be removed once support
* for the NVIDIA Linux v5.10 kernel is removed.
/**
* Description:
*
* Clean up the completed tasks.
*
* This function cleans up atmost max_dla_cleanup_depth completed dla tasks
* from the task list.
*
* @param[in] queue Pointer to the queue
* @param[in] max_dla_cleanup_depth limit on number of DLA completed tasks
* that can be cleaned up
*
*/
static void nvdla_queue_update(void *priv, int unused)
#else
static void nvdla_queue_update(void *priv)
#endif
static void nvdla_queue_task_cleanup(struct nvdla_queue *queue,
uint32_t max_dla_cleanup_depth)
{
int task_complete;
struct nvdla_task *task, *safe;
struct nvdla_queue *queue = priv;
struct platform_device *pdev = queue->pool->pdev;
struct nvhost_notification *tsp_notifier;
u64 timestamp_start, timestamp_end;
u64 *timestamp_ptr;
int n_tasks_completed = 0;
uint32_t dla_cleanup_depth = 0U;
uint32_t task_id;
int i;

mutex_lock(&queue->list_lock);

nvdla_dbg_fn(pdev, "");

/* check which task(s) finished */
list_for_each_entry_safe(task, safe, &queue->tasklist, list) {
if (dla_cleanup_depth >= max_dla_cleanup_depth)
break;

task_id = nvdla_compute_task_id(task->task_desc->sequence,
task->task_desc->queue_id);

task_complete = nvhost_syncpt_is_expired_ext(pdev,
queue->syncpt_id, task->fence);

@@ -376,16 +409,26 @@ static void nvdla_queue_update(void *priv)
}
}
nvdla_task_free_locked(task);
n_tasks_completed++;

/* Stay at UINT_MAX, once reached the UINT_MAX. */
if (unlikely(dla_cleanup_depth >= (UINT_MAX - 1U)))
dla_cleanup_depth = UINT_MAX - 1U;

dla_cleanup_depth = dla_cleanup_depth + 1U;
}
}

/* put pm refcount */
nvhost_module_idle_mult(pdev, n_tasks_completed);
nvhost_module_idle_mult(pdev, dla_cleanup_depth);

mutex_unlock(&queue->list_lock);
}

static void nvdla_queue_cleanup_op(struct nvdla_queue *queue)
{
nvdla_queue_task_cleanup(queue, NVDLA_QUEUE_CLEANUP_LIMIT);
}

static size_t nvdla_get_task_desc_size(void)
{
size_t size = 0;
@@ -1061,9 +1104,10 @@ int nvdla_fill_task_desc(struct nvdla_task *task, bool bypass_exec)
}

/* update current task sequeue, make sure wrap around condition */
queue->sequence = queue->sequence + 1;
if (unlikely(queue->sequence >= (UINT_MAX - 1)))
queue->sequence = 0;
if (likely(queue->sequence < UINT_MAX))
queue->sequence = queue->sequence + 1U;
else
queue->sequence = 0U;

task_desc->sequence = queue->sequence;

@@ -1119,6 +1163,11 @@ int nvdla_fill_task_desc(struct nvdla_task *task, bool bypass_exec)
return 0;

fail_to_map_mem:
if (likely(queue->sequence > 0U))
queue->sequence = queue->sequence - 1U;
else
queue->sequence = UINT_MAX;

(void) nvdla_unmap_task_memory(task);
return err;
}
@@ -1300,34 +1349,21 @@ static int nvdla_queue_submit_op(struct nvdla_queue *queue, void *in_task)

mutex_lock(&queue->list_lock);

/* Get a reference before registration or submission */
/* Get a reference before submission to the firmware */
nvdla_task_get(task);

task_id = nvdla_compute_task_id(task->task_desc->sequence, task->task_desc->queue_id);

/* get fence from nvhost for MMIO mode*/
if (nvdla_dev->submit_mode == NVDLA_SUBMIT_MODE_MMIO) {
task->fence = nvhost_syncpt_incr_max_ext(pdev,
queue->syncpt_id,
task->fence_counter);
}

/* update last task desc's next */
if (!list_empty(&queue->tasklist)) {
last_task = list_last_entry(&queue->tasklist,
struct nvdla_task, list);
last_task->task_desc->next = (uint64_t)task->task_desc_pa;

nvdla_dbg_info(pdev, "last task[%p] last_task_desc_pa[%llu]",
last_task, task->task_desc_pa);
/* Hold the last task reference until task chaining. */
nvdla_task_get(last_task);
}
list_add_tail(&task->list, &queue->tasklist);

nvdla_dbg_info(pdev, "task[%p] added to list", task);

nvdla_dbg_fn(pdev, "syncpt[%d] fence[%d] task[%p] fence_counter[%u]",
queue->syncpt_id, task->fence,
task, task->fence_counter);
mutex_unlock(&queue->list_lock);

/* enable INT_ON_COMPLETE and INT_ON_ERROR falcon interrupts */
method_id = (DLA_CMD_SUBMIT_TASK & DLA_METHOD_ID_CMD_MASK) |
@@ -1342,41 +1378,32 @@ static int nvdla_queue_submit_op(struct nvdla_queue *queue, void *in_task)
if (nvhost_module_busy(pdev))
goto fail_to_poweron;

/* prepare command for channel submit */
if (nvdla_dev->submit_mode == NVDLA_SUBMIT_MODE_CHANNEL) {
/* prepare command for submit */
cmd_data.method_id = method_id;
cmd_data.method_data = method_data;
cmd_data.wait = true;

cmd_data.method_id = method_id;
cmd_data.method_data = method_data;
cmd_data.wait = true;

/* submit task to engine */
if (unlikely(nvdla_dev->submit_mode == NVDLA_SUBMIT_MODE_CHANNEL)) {
err = nvdla_send_cmd_channel(pdev, queue, &cmd_data, task);
if (err) {
nvdla_dbg_err(pdev, "task[%p] submit failed", task);
goto fail_to_channel_submit;
goto fail_to_submit;
}
}

/* register notifier with fence */
err = nvhost_intr_register_notifier(pdev, queue->syncpt_id,
task->fence, nvdla_queue_update, queue);
if (err)
goto fail_to_register;

/* prepare command for MMIO submit */
if (nvdla_dev->submit_mode == NVDLA_SUBMIT_MODE_MMIO) {
cmd_data.method_id = method_id;
cmd_data.method_data = method_data;
cmd_data.wait = true;

/* submit task to engine */
if (likely(nvdla_dev->submit_mode == NVDLA_SUBMIT_MODE_MMIO)) {
err = nvdla_send_cmd(pdev, &cmd_data);
if (err) {
nvdla_dbg_err(pdev, "task[%p] submit failed", task);
/* deletes invalid task from queue, puts refs */
nvhost_syncpt_set_min_update(pdev, queue->syncpt_id,
task->fence);
goto fail_to_submit;
}

task->fence = nvhost_syncpt_incr_max_ext(pdev,
queue->syncpt_id,
task->fence_counter);
nvdla_dbg_fn(pdev, "syncpt[%d] fence[%d] task[%p] fence_counter[%u]",
queue->syncpt_id, task->fence,
task, task->fence_counter);
}

if (IS_ENABLED(CONFIG_TRACING)) {
@@ -1392,14 +1419,35 @@ static int nvdla_queue_submit_op(struct nvdla_queue *queue, void *in_task)
}
}

mutex_unlock(&queue->list_lock);
return err;
mutex_lock(&queue->list_lock);

fail_to_register:
fail_to_channel_submit:
/* Chain the successfully submited task if last task available. */
if (last_task != NULL) {
last_task->task_desc->next = (uint64_t)task->task_desc_pa;

nvdla_dbg_info(pdev, "last task[%p] last_task_desc_pa[%llu]",
last_task, task->task_desc_pa);

nvdla_task_put(last_task);
}

/* Add a queue entry upon successful submission. */
list_add_tail(&task->list, &queue->tasklist);
nvdla_dbg_info(pdev, "task[%p] added to list", task);

mutex_unlock(&queue->list_lock);

return 0;

fail_to_submit:
nvhost_module_idle(pdev);
fail_to_poweron:
nvdla_task_free_locked(task);
mutex_lock(&queue->list_lock);
if (last_task != NULL)
nvdla_task_put(last_task);

/* Put back the task reference if failure. */
nvdla_task_put(task);
mutex_unlock(&queue->list_lock);

return err;
@@ -1446,22 +1494,24 @@ fail_to_poweron:
static int nvdla_queue_abort_op(struct nvdla_queue *queue)
{
int err = 0, fence;
struct nvdla_task *t;
struct nvdla_cmd_data cmd_data;
struct platform_device *pdev = queue->pool->pdev;
int retry = NVDLA_QUEUE_ABORT_TIMEOUT / NVDLA_QUEUE_ABORT_RETRY_PERIOD;
bool queue_empty;

nvdla_dbg_fn(pdev, "");

mutex_lock(&queue->list_lock);
if (list_empty(&queue->tasklist))
goto list_empty;
queue_empty = list_empty(&queue->tasklist);
mutex_unlock(&queue->list_lock);
if (queue_empty)
goto done;

/* get pm refcount */
err = nvhost_module_busy(pdev);
if (err) {
nvdla_dbg_err(pdev, "failed to poweron, err: %d", err);
goto fail_to_poweron;
goto done;
}

/* prepare command */
@@ -1482,29 +1532,24 @@ static int nvdla_queue_abort_op(struct nvdla_queue *queue)
nvdla_dbg_err(pdev,
"Q %d abort fail. err:%d, retry:%d",
queue->id, err, retry);
goto done;
goto poweroff;
}

nvdla_dbg_info(pdev, "Engine Q[%d] flush done", queue->id);

/* if task present free them by reset syncpoint */
if (!list_empty(&queue->tasklist)) {
t = list_last_entry(&queue->tasklist, struct nvdla_task, list);
/* reset syncpoint to release all tasks */
fence = nvhost_syncpt_read_maxval(pdev, queue->syncpt_id);
nvhost_syncpt_set_min_update(pdev, queue->syncpt_id, fence);

/* reset syncpoint to release all tasks */
fence = nvhost_syncpt_read_maxval(pdev, queue->syncpt_id);
nvhost_syncpt_set_min_update(pdev, queue->syncpt_id, fence);

/* dump details */
nvdla_dbg_info(pdev, "Q id %d reset syncpt[%d] done",
/* dump details */
nvdla_dbg_info(pdev, "Q id %d reset syncpt[%d] done",
queue->id, queue->syncpt_id);
}

done:
nvdla_queue_cleanup_op(queue);

poweroff:
nvhost_module_idle(pdev);
fail_to_poweron:
list_empty:
mutex_unlock(&queue->list_lock);
done:
return err;
}

@@ -1513,4 +1558,5 @@ struct nvdla_queue_ops nvdla_queue_ops = {
.submit = nvdla_queue_submit_op,
.get_task_size = nvdla_get_task_desc_memsize_op,
.dump = nvdla_queue_dump_op,
.cleanup = nvdla_queue_cleanup_op,
};