pva: mirror from gitlab cv/pva-sys-sw

Gitlab commit a307885246be7 ("umd: port intf tests to umd - ...")

Changes since last deployment:

- umd: port intf tests to umd - add NegativeTest_MaxContextCreate_Single...
- Remove VPU auth default macro
- kmd: Add a null check to fix static defect
- tests: Fix sync unregistration test
- fw: Handle DMA error when fetching chunk
- umd_tests: add requirements tests
- Fix error path of Linux KMD memory API
- Add kernel code style check script
- address review comments for access flag feat
- fix memory leak in access buffer tests
- kmd: use correct formatting
- fw: update license and use macro
- tests: add tests for buffer access type
- feat: respect buffer access flags
- Update deployment document
- Add a default fence wait timeout
- Fix error path of KMD sync ops submit
- Move recovery state check to pva_kmd_device_busy
- Fix error path of profiling level update
- Increase max CCQ FIFO depth to 14
- kmd: zero initialize all commands
- Make KMD robust against firmware abort
- Add multi user stress tests
- Conditional VMEM Clear Check
- Conditional VMEM Clear Check
- Fix static defects in KMD
- Reading auth for all the PVA devices
- Add support for VPU Device Tree authentication
- UMD: Add Null and range checks
- Remove assert and propagate error
- Add error injection tests
- Bug fix - 5207608
- Update CUPVA tests in CI to 2.7.0 rc3
- tests: add register_exec_noop_with_bit_flip
- fw: Fix static defects
- kmd: freeze PVA before freeing code/data memory
- Add missing test_mode parameter for run_test_plan
- deploy: allow deploying to different branches
- pva kmd: linux: handle test mode dt entry
- fw: baremetal: bound breaklink params
- coverity: Set expiry for code quality report
- kmd: Remove PVA_IS_DEBUG from native timeout calc
- Reduce iterations of long duration UMD tests
- Fix IOVA leak in Linux KMD
- fw:common: fix order of enable/disable virt
- umd_test: add missing requirement test specs
- umd_test: add test for perf spikes
- Fix nsight fence logging
- deploy: fix GVS build failure
- Add FSP Abort Hook
- Execution timeout
- Trace fences for NSIGHT
- Fix shared buffer handling during suspend/resume
- tests: add more tests for resource unregistration
- Add MODS test support
- KMD:Fix static defect
- umd: fix double free in cuextend
- umd: Free pva_memory object on free()
- Unify VPU and PPE syscall ID
- Clean up public API
- cuextend: Cleanup implementation
- cuextend: Add API to get stream payload
- compat: Fix missing flushes of event fences
- cuExtend: Unified code path for stream submit
- cuExtend: Implementation of cuExtend Stream Submit
- cuExtend: Stream submit API definitions
- cuExtend: Sync to new cuExtend header
- Set test mode default through macro
- fw: Add PPE error codes
- Use zalloc when allocating resource record
- Allocate Temporary Buffers for DMA Config Loading
- Fix fast reset failure test
- Add DMA config allocator
- kmd: Add unsafe API for block allocator
- Add missing warning for Linux kernel build
- Set err cmd idx to zero if there's no error
- ci: Run tests for MODS test mode
- Use 1K command buffer chunk size in MODS test mode
- Allow developer to provide its own target lease
- tests: add nvsci prefence_postfence_test
- kmd: Sha calculation static defects fix
- kmd: fix INT30-c static defect
- Fix command index logging for PVA_FW_EVENT_RUN_VPU
- Enable vpucfg_destroy_after_submit
- tests: add tests spec for deterministic test
- test: add cpu_signaller_pva_waiter_deterministic
- tests: add cpu_waiter_pva_signaller_deterministic
- Disable verbosity control of FW log
- Ops free API should accept NULL ptr
- Report TCM usage for t26x as well
- Support non-contiguous syncpoints
- umd: fix new top 25 CWE
- License header update
- L2SRAM flush command changes
- debugfs: disable tests for broken nodes
- debugfs: handle 0 input size for allowlist path
- Move pva_kmd_device_idle to context destroy
- Refactor interrupt handler binding in PVA KMD
- Fix DMA registration error path
- debugfs: Add read support for fw log level
- Add stress test suites to CI
- Fix error path for context init
- Add stress test suites
- umd: add NULL checks
- ci: Perf Test Updates
- ci: perf test updates
- Enable boot from GSC in L4T GVS
- Updating comment

Change-Id: I98be7ec270ba5f6fd5bc0978d084d731a88e70b6
Signed-off-by: abhinayaa <abhinayaa@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3348508
GVS: buildbot_gerritrpt <buildbot_gerritrpt@nvidia.com>
Reviewed-by: Mohnish Jain <mohnishj@nvidia.com>

Author: abhinayaa <abhinayaa@nvidia.com>
Date: 2025-04-24 04:43:25 +00:00
Committed-by: Jon Hunter
Parent: 0ac4068c89
Commit: 63f6f2f159
69 changed files with 2205 additions and 1891 deletions

View File

@@ -22,6 +22,7 @@ pva_objs += \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_context.o \ $(PVA_SYS_DIR)/src/kmd/common/pva_kmd_context.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_debugfs.o \ $(PVA_SYS_DIR)/src/kmd/common/pva_kmd_debugfs.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_device.o \ $(PVA_SYS_DIR)/src/kmd/common/pva_kmd_device.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_devmem_pool.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg.o \ $(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_binding.o \ $(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_binding.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_validate.o \ $(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_validate.o \
@@ -70,6 +71,7 @@ pva_inc_flags += \
-I$(PVA_SYS_ABSDIR)/src/kmd/include \ -I$(PVA_SYS_ABSDIR)/src/kmd/include \
-I$(PVA_SYS_ABSDIR)/src/kmd/linux/include \ -I$(PVA_SYS_ABSDIR)/src/kmd/linux/include \
-I$(PVA_SYS_ABSDIR)/src/libs/pva/include \ -I$(PVA_SYS_ABSDIR)/src/libs/pva/include \
-I$(PVA_SYS_ABSDIR)/src/private_api \
pva_def_flags += \ pva_def_flags += \
-DPVA_BUILD_MODE=PVA_BUILD_MODE_L4T \ -DPVA_BUILD_MODE=PVA_BUILD_MODE_L4T \

View File

@@ -224,6 +224,11 @@
#define PVA_ABORT_NOC_BIST (0xfcU) #define PVA_ABORT_NOC_BIST (0xfcU)
//! @endcond //! @endcond
/**
* @brief Minor code for abort in case of FSP abort.
*/
#define PVA_ABORT_FSP 0x42U
/** @} */ /** @} */
/** /**
@@ -299,4 +304,36 @@
#define PVA_ABORT_IRQ_TEST_HOST (0xE002U) #define PVA_ABORT_IRQ_TEST_HOST (0xE002U)
#endif #endif
/** @} */ /** @} */
/**
* @defgroup PVA_ABORT_ARGUMENTS_FSP Argument to pva_abort() from FSP abort
* @ingroup PVA_ABORT_ARGUMENTS
* @{
*/
/**
* @brief Minor Code for FSP aborts because of safertos errors
*/
#define PVA_ABORT_FSP_SAFERTOS (0xE001U)
/**
* @brief Minor Code for FSP aborts because of asserts in fsp
*/
#define PVA_ABORT_FSP_ASSERT (0xE002U)
/**
* @brief Minor Code for FSP aborts because of exception in fsp
*/
#define PVA_ABORT_FSP_EXCEPTION (0xE003U)
/**
* @brief Minor Code for FSP aborts because of stack guard failure
*/
#define PVA_ABORT_FSP_STACK (0xE004U)
/**
* @brief Minor Code for Unknown FSP aborts
*/
#define PVA_ABORT_FSP_UNKNOWN (0xE005U)
/** @} */
#endif #endif
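
The new FSP minor codes above are the values an FSP abort hook would hand to pva_abort(). A minimal selection sketch follows; the cause enum and helper name are hypothetical, and only the PVA_ABORT_FSP_* macros come from this hunk.

#include <stdint.h>

/* Hypothetical cause enum; not part of this commit. */
enum fsp_abort_cause {
	FSP_CAUSE_SAFERTOS,
	FSP_CAUSE_ASSERT,
	FSP_CAUSE_EXCEPTION,
	FSP_CAUSE_STACK_GUARD,
};

/* Map an FSP abort cause to one of the minor codes defined above. */
static uint16_t fsp_abort_minor_code(enum fsp_abort_cause cause)
{
	switch (cause) {
	case FSP_CAUSE_SAFERTOS:
		return PVA_ABORT_FSP_SAFERTOS;
	case FSP_CAUSE_ASSERT:
		return PVA_ABORT_FSP_ASSERT;
	case FSP_CAUSE_EXCEPTION:
		return PVA_ABORT_FSP_EXCEPTION;
	case FSP_CAUSE_STACK_GUARD:
		return PVA_ABORT_FSP_STACK;
	default:
		return PVA_ABORT_FSP_UNKNOWN;
	}
}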

View File

@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */ /* SPDX-License-Identifier: GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ /* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#ifndef PVA_ERRORS_H #ifndef PVA_ERRORS_H
#define PVA_ERRORS_H #define PVA_ERRORS_H
@@ -154,15 +154,6 @@ typedef uint16_t pva_errors_t;
*/ */
//! @cond DISABLE_DOCUMENTATION //! @cond DISABLE_DOCUMENTATION
/**
* @brief Error in case of Floating point NAN.
*/
#define PVA_ERR_PPE_DIVIDE_BY_0 (0x34U)
/**
* @brief Error in case of Floating point NAN.
*/
#define PVA_ERR_PPE_ILLEGAL_DEBUG (0x36U)
#define PVA_ERR_PPE_ILLEGAL_INSTR_ALIGN (0x37U) #define PVA_ERR_PPE_ILLEGAL_INSTR_ALIGN (0x37U)
/** /**
@@ -270,40 +261,6 @@ typedef uint16_t pva_errors_t;
* more than HW Seq RAM size. * more than HW Seq RAM size.
*/ */
#define PVA_ERR_DMA_HWSEQ_PROGRAM_TOO_LONG (0x217U) #define PVA_ERR_DMA_HWSEQ_PROGRAM_TOO_LONG (0x217U)
/** @} */
/**
* @defgroup PVA_MISR_ERRORS
*
* @brief MISR error codes used across PVA.
* @{
*/
/**
* @brief Error status when DMA MISR test is not run.
*/
#define PVA_ERR_MISR_NOT_RUN (0x280U)
/**
* @brief Error status when DMA MISR test did not complete.
*/
#define PVA_ERR_MISR_NOT_DONE (0x281U)
/**
* @brief Error status when DMA MISR test timed out.
*/
#define PVA_ERR_MISR_TIMEOUT (0x282U)
/**
* @brief Error status in case of DMA MISR test address failure.
*/
#define PVA_ERR_MISR_ADDR (0x283U)
/**
* @brief Error status in case of DMA MISR test data failure.
*/
#define PVA_ERR_MISR_DATA (0x284U)
/**
* @brief Error status in case of DMA MISR test data and address failure.
*/
#define PVA_ERR_MISR_ADDR_DATA (0x285U)
/** @} */
/** /**
* @defgroup PVA_VPU_ISR_ERRORS * @defgroup PVA_VPU_ISR_ERRORS
* *

View File

@@ -6,150 +6,4 @@
#include <stdint.h> #include <stdint.h>
/**
* @breif Write syscall parameter will be a pointer to this struct
* @{
*/
typedef union {
struct {
uint32_t addr;
uint32_t size;
} in;
struct {
uint32_t written_size;
} out;
} pva_fw_pe_syscall_write;
/** @} */
/**
* @defgroup PVA_VPU_SYSCALL_PERFMON_SAMPLE_PARAM_GROUP
*
* @brief Parameter specification for syscall perfmon_sample
*
* @{
*/
/**
* @brief Perfmon sample syscall parameter will be a pointer to this struct
*/
typedef struct {
/** counter_mask[0] is for ID: 0-31; counter_mask[1] is for ID: 32-63 */
uint32_t counter_mask[2];
uint32_t output_addr;
} pva_fw_pe_syscall_perfmon_sample;
/**
* @brief Index for t26x performance counters for VPU
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T26X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T26X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T26X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T26X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T26X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T26X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T26X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T26X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T26X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T26X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T26X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T26X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T26X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T26X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T26X (19U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_FETCH_REQ_T26X (20U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X (21U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREEMPT_T26X (22U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_LINES_T26X (23U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_DUR_T26X (24U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T26X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T26X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T26X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T26X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T26X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T26X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T26X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T26X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T26X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T26X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T26X (36U)
/**
* @brief Index for t23x performance counters
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T23X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T23X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T23X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T23X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T23X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T23X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T23X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T23X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T23X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T23X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T23X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T23X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T23X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T23X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T23X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T23X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T23X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T23X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T23X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T23X (19U)
#define PERFMON_COUNTER_ID_ICACHE_FETCH_REQ_T23X (20U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_T23X (21U)
#define PERFMON_COUNTER_ID_ICACHE_PREEMP_T23X (22U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_LINES_T23X (23U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_DUR_T23X (24U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_DUR_T23X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T23X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T23X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T23X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T23X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T23X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T23X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T23X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T23X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T23X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T23X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T23X (36U)
/**
* @brief Index for t26x performance counters for PPE
*/
#define PERFMON_COUNTER_ID_PPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_PPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_LD_DEPENDENCY_T26X (5U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_ST_DEPENDENCY_T26X (6U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_DEPENDENCY_T26X (7U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STRM_STORE_FLUSH_T26X (8U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_STORE_FLUSH_T26X (9U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STORE_FLUSH_T26X (10U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_LD_T26X (11U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_ST_T26X (12U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_T26X (13U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LD_T26X (14U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_ST_T26X (15U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LDST_T26X (16U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_PUSHBACK_T26X (17U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STQ_PUSHBACK_T26X (18U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_FLUSH_T26X (19U)
#define PERFMON_COUNTER_ID_PPS_WFE_GPI_EX_STATE_T26X (20U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_FETCH_REQ_T26X (21U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_T26X (22U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREEMPT_T26X (23U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_LINES_T26X (24U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_DUR_T26X (26U)
/** @} */
#endif /*PVA_VPU_SYSCALL_INTERFACE_H*/ #endif /*PVA_VPU_SYSCALL_INTERFACE_H*/

View File

@@ -11,6 +11,138 @@
/* The sizes of these structs must be explicitly padded to align to 4 bytes */ /* The sizes of these structs must be explicitly padded to align to 4 bytes */
#define PVA_CMD_PRIV_OPCODE_FLAG (1U << 7U)
#define PVA_RESOURCE_ID_BASE 1U
struct pva_resource_entry {
uint8_t access_flags : 2; // 1: RO, 2: WO, 3: RW
uint8_t reserved : 4;
#define PVA_RESOURCE_TYPE_INVALID 0U
#define PVA_RESOURCE_TYPE_DRAM 1U
#define PVA_RESOURCE_TYPE_EXEC_BIN 2U
#define PVA_RESOURCE_TYPE_DMA_CONFIG 3U
uint8_t type : 2;
uint8_t smmu_context_id;
uint8_t addr_hi;
uint8_t size_hi;
uint32_t addr_lo;
uint32_t size_lo;
};
struct pva_cmd_init_resource_table {
#define PVA_CMD_OPCODE_INIT_RESOURCE_TABLE (0U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
/**< Resource table id is from 0 to 7, 0 is the device's resource table,
* 1-7 are users'. */
uint8_t resource_table_id;
uint8_t resource_table_addr_hi;
uint8_t pad[2];
uint32_t resource_table_addr_lo;
uint32_t max_n_entries;
};
struct pva_cmd_deinit_resource_table {
#define PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE (1U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
};
struct pva_cmd_update_resource_table {
#define PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE (2U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
uint32_t resource_id;
struct pva_resource_entry entry;
};
struct pva_cmd_init_queue {
#define PVA_CMD_OPCODE_INIT_QUEUE (3U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t queue_addr_hi;
uint8_t syncpt_addr_hi;
uint32_t queue_addr_lo;
uint32_t max_n_submits;
uint32_t syncpt_addr_lo;
uint32_t syncpt_id;
};
struct pva_cmd_deinit_queue {
#define PVA_CMD_OPCODE_DEINIT_QUEUE (4U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t pad[2];
};
struct pva_cmd_enable_fw_profiling {
#define PVA_CMD_OPCODE_ENABLE_FW_PROFILING (5U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t timestamp_type;
uint8_t pad[3];
uint32_t filter;
};
struct pva_cmd_disable_fw_profiling {
#define PVA_CMD_OPCODE_DISABLE_FW_PROFILING (6U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_get_tegra_stats {
#define PVA_CMD_OPCODE_GET_TEGRA_STATS (7U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t buffer_offset_hi;
bool enabled;
uint8_t pad[2];
uint32_t buffer_resource_id;
uint32_t buffer_size;
uint32_t buffer_offset_lo;
};
struct pva_cmd_suspend_fw {
#define PVA_CMD_OPCODE_SUSPEND_FW (8U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_resume_fw {
#define PVA_CMD_OPCODE_RESUME_FW (9U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_init_shared_dram_buffer {
#define PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER (10U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t buffer_iova_hi;
uint8_t pad[2];
uint32_t buffer_iova_lo;
uint32_t buffer_size;
};
struct pva_cmd_deinit_shared_dram_buffer {
#define PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER \
(11U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t pad[3];
};
struct pva_cmd_set_debug_log_level {
#define PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL (12U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t log_level;
};
struct pva_cmd_set_profiling_level {
#define PVA_CMD_OPCODE_SET_PROFILING_LEVEL (13U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t level;
};
#define PVA_CMD_PRIV_OPCODE_COUNT 14U
struct pva_fw_prefence { struct pva_fw_prefence {
uint8_t offset_hi; uint8_t offset_hi;
uint8_t pad0[3]; uint8_t pad0[3];
@@ -301,7 +433,8 @@ struct pva_fw_shared_buffer_header {
struct pva_kmd_fw_buffer_msg_header { struct pva_kmd_fw_buffer_msg_header {
#define PVA_KMD_FW_BUF_MSG_TYPE_FW_EVENT 0 #define PVA_KMD_FW_BUF_MSG_TYPE_FW_EVENT 0
#define PVA_KMD_FW_BUF_MSG_TYPE_VPU_TRACE 1 #define PVA_KMD_FW_BUF_MSG_TYPE_VPU_TRACE 1
#define PVA_KMD_FW_BUF_MSG_TYPE_RES_UNREG 2 #define PVA_KMD_FW_BUF_MSG_TYPE_FENCE_TRACE 2
#define PVA_KMD_FW_BUF_MSG_TYPE_RES_UNREG 3
uint32_t type : 8; uint32_t type : 8;
// Size of payload in bytes. Includes the size of the header. // Size of payload in bytes. Includes the size of the header.
uint32_t size : 24; uint32_t size : 24;
@@ -334,6 +467,27 @@ struct pva_kmd_fw_msg_vpu_trace {
uint64_t submit_id; uint64_t submit_id;
}; };
struct pva_kmd_fw_msg_fence_trace {
uint64_t submit_id;
uint64_t timestamp;
// For syncpt fences, fence_id is the syncpt index
// For semaphore fences, fence_id is the serial ID of the semaphore NvRM memory
uint64_t fence_id;
// 'offset' is the offset into the semaphore memory where the value is stored
// This is only valid for semaphore fences
// Note: Trace APIs in KMD only support 32-bit offset
uint32_t offset;
uint32_t value;
uint8_t ccq_id;
uint8_t queue_id;
#define PVA_KMD_FW_BUF_MSG_FENCE_ACTION_WAIT 0U
#define PVA_KMD_FW_BUF_MSG_FENCE_ACTION_SIGNAL 1U
uint8_t action;
#define PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SYNCPT 0U
#define PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SEMAPHORE 1U
uint8_t type;
};
// Resource unregister message // Resource unregister message
struct pva_kmd_fw_msg_res_unreg { struct pva_kmd_fw_msg_res_unreg {
uint32_t resource_id; uint32_t resource_id;
@@ -345,4 +499,11 @@ struct pva_kmd_fw_tegrastats {
uint64_t total_utilization[PVA_NUM_PVE]; uint64_t total_utilization[PVA_NUM_PVE];
}; };
#define PVA_MAX_CMDBUF_CHUNK_LEN 1024
#define PVA_MAX_CMDBUF_CHUNK_SIZE (sizeof(uint32_t) * PVA_MAX_CMDBUF_CHUNK_LEN)
#define PVA_TEST_MODE_MAX_CMDBUF_CHUNK_LEN 256
#define PVA_TEST_MODE_MAX_CMDBUF_CHUNK_SIZE \
(sizeof(uint32_t) * PVA_TEST_MODE_MAX_CMDBUF_CHUNK_LEN)
#endif // PVA_FW_H #endif // PVA_FW_H
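
As a usage sketch for the private-command definitions relocated above, the following fills a DRAM resource entry for an update-resource-table command. It assumes only the struct layouts and macros in this hunk; the helper name is illustrative, and addr_hi/size_hi are taken to hold the bits above the 32-bit lo words.

#include <stdint.h>

/* Illustrative helper, not part of the commit. Access encoding per the
 * comment above: 1 = RO, 2 = WO, 3 = RW. */
static void fill_dram_update_cmd(struct pva_cmd_update_resource_table *cmd,
				 uint8_t table_id, uint32_t resource_id,
				 uint64_t iova, uint64_t size,
				 uint8_t smmu_ctx_id)
{
	cmd->header.opcode = PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE;
	cmd->resource_table_id = table_id;   /* 0 = device table, 1-7 = users */
	cmd->resource_id = resource_id;      /* IDs start at PVA_RESOURCE_ID_BASE */
	cmd->entry.type = PVA_RESOURCE_TYPE_DRAM;
	cmd->entry.access_flags = 3U;        /* RW */
	cmd->entry.smmu_context_id = smmu_ctx_id;
	cmd->entry.addr_hi = (uint8_t)(iova >> 32);
	cmd->entry.addr_lo = (uint32_t)iova;
	cmd->entry.size_hi = (uint8_t)(size >> 32);
	cmd->entry.size_lo = (uint32_t)size;
}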

View File

@@ -49,7 +49,9 @@
* | 23-21 | Reserved | Reserved for future use | * | 23-21 | Reserved | Reserved for future use |
* | 20 | CG DISABLE | To indicate the PVA R5 FW should disable the clock gating feature | * | 20 | CG DISABLE | To indicate the PVA R5 FW should disable the clock gating feature |
* | 19 | VMEM RD WAR DISABLE | To disable the VMEM Read fail workaround feature | * | 19 | VMEM RD WAR DISABLE | To disable the VMEM Read fail workaround feature |
* | 18-16 | Reserved | Reserved for future use | * | 18 | TEST_MODE_ENABLE | To enter test mode. See Documentation. |
* | 17 | USE_XBAR_RAW | Reserved for future use |
* | 16 | Reserved | Reserved for future use |
* *
* The table below shows the mapping which is sent by FW to KMD * The table below shows the mapping which is sent by FW to KMD
* *
@@ -72,11 +74,20 @@
#define PVA_BOOT_SEMA_CG_DISABLE PVA_BIT(20U) #define PVA_BOOT_SEMA_CG_DISABLE PVA_BIT(20U)
//! @cond DISABLE_DOCUMENTATION //! @cond DISABLE_DOCUMENTATION
/** Tell firmware to enter test mode */
#define PVA_BOOT_SEMA_TEST_MODE_ENABLE PVA_BIT(18U)
/** Tell firmware that block linear surfaces are in XBAR_RAW format instead of /** Tell firmware that block linear surfaces are in XBAR_RAW format instead of
* TEGRA_RAW format */ * TEGRA_RAW format */
#define PVA_BOOT_SEMA_USE_XBAR_RAW PVA_BIT(17U) #define PVA_BOOT_SEMA_USE_XBAR_RAW PVA_BIT(17U)
/** Tell firmware to enable test mode */
#define PVA_BOOT_SEMA_TEST_MODE PVA_BIT(16U)
#define PVA_BOOT_SEMA 0U #define PVA_BOOT_SEMA 0U
#define PVA_RO_SYNC_BASE_SEMA 1U
#define PVA_RW_SYNC_BASE_SEMA 2U
#define PVA_RW_SYNC_SIZE_SEMA 3U
/** /**
* @brief This macro has the value to be set by KMD in the shared semaphores * @brief This macro has the value to be set by KMD in the shared semaphores
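
A sketch of composing the KMD-to-FW boot flags word per the updated bit table. Note that this hunk defines both PVA_BOOT_SEMA_TEST_MODE_ENABLE (bit 18, matching the table) and PVA_BOOT_SEMA_TEST_MODE (bit 16); the sketch uses the bit-18 macro and makes no claim about which one firmware honors. The helper name is hypothetical.

#include <stdbool.h>
#include <stdint.h>

static uint32_t pva_compose_boot_flags(bool test_mode, bool xbar_raw,
				       bool cg_disable)
{
	uint32_t flags = 0U;

	if (test_mode)
		flags |= PVA_BOOT_SEMA_TEST_MODE_ENABLE; /* bit 18 per table */
	if (xbar_raw)
		flags |= PVA_BOOT_SEMA_USE_XBAR_RAW;     /* bit 17 */
	if (cg_disable)
		flags |= PVA_BOOT_SEMA_CG_DISABLE;       /* bit 20 */
	return flags;
}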

View File

@@ -62,8 +62,10 @@ struct pva_fw_dma_slot {
* to block linear surface. */ * to block linear surface. */
#define PVA_FW_DMA_SLOT_FLAG_CB (1u << 4u) #define PVA_FW_DMA_SLOT_FLAG_CB (1u << 4u)
#define PVA_FW_DMA_SLOT_FLAG_BOUND (1u << 5u) #define PVA_FW_DMA_SLOT_FLAG_BOUND (1u << 5u)
uint8_t flags; #define PVA_FW_DMA_SLOT_FLAG_MASKED (1u << 6u)
uint8_t pad; #define PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB 7u
#define PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB 8u
uint16_t flags;
/** Bitmask of channels that use this slot */ /** Bitmask of channels that use this slot */
uint16_t ch_use_mask; uint16_t ch_use_mask;
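
With the slot flags widened to 16 bits above, the two access bits sit at [ACCESS_MSB:ACCESS_LSB]. A sketch of packing a 2-bit access value into that field; the helper name is hypothetical and the access encoding itself is not spelled out in this hunk.

#include <stdint.h>

static uint16_t pva_dma_slot_set_access(uint16_t flags, uint16_t access)
{
	const uint16_t width = PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB -
			       PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB + 1u; /* 2 bits */
	const uint16_t mask = (uint16_t)(((1u << width) - 1u)
					 << PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);

	flags = (uint16_t)(flags & ~mask);
	flags |= (uint16_t)((access << PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB) & mask);
	return flags;
}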

View File

@@ -16,6 +16,7 @@ extern "C" {
/* Core APIs */ /* Core APIs */
#define PVA_MAX_NUM_RESOURCES_PER_CONTEXT (16U * 1024U)
/** /**
* @brief Create a PVA context. * @brief Create a PVA context.
* *
@@ -37,6 +38,18 @@ enum pva_error pva_context_create(uint32_t pva_index,
*/ */
void pva_context_destroy(struct pva_context *ctx); void pva_context_destroy(struct pva_context *ctx);
/**
* @brief Get the value of a context attribute.
*
* @param[in] ctx Pointer to the context.
* @param[in] attr Attribute to get.
* @param[out] out_value Pointer to the value of the attribute.
 * @param[in] size Size of the attribute structure.
*/
enum pva_error pva_get_attribute(struct pva_context *ctx, enum pva_attr attr,
void *out_value, uint64_t size);
#define PVA_MAX_NUM_SUBMISSIONS_PER_QUEUE (8U * 1024U)
/** /**
* @brief Create a PVA queue. * @brief Create a PVA queue.
* *
@@ -97,7 +110,7 @@ void pva_memory_free(struct pva_memory *mem);
* @param[in] ctx Pointer to the context. * @param[in] ctx Pointer to the context.
* @param[in] syncpiont_id Syncpoint ID to wait on. * @param[in] syncpiont_id Syncpoint ID to wait on.
* @param[in] value Value to wait for. * @param[in] value Value to wait for.
* @param[in] timeout_us Timeout in microseconds. PVA_TIMEOUT_INF for infinite. * @param[in] timeout_us Timeout in microseconds. PVA_SUBMIT_TIMEOUT_INF for infinite.
*/ */
enum pva_error pva_syncpoint_wait(struct pva_context *ctx, enum pva_error pva_syncpoint_wait(struct pva_context *ctx,
uint32_t syncpiont_id, uint32_t value, uint32_t syncpiont_id, uint32_t value,
@@ -109,7 +122,7 @@ enum pva_error pva_syncpoint_wait(struct pva_context *ctx,
* @param[in] queue Pointer to the queue. * @param[in] queue Pointer to the queue.
* @param[in] submit_infos Array of submit info structures. * @param[in] submit_infos Array of submit info structures.
* @param[in] count Number of submit info structures. * @param[in] count Number of submit info structures.
* @param[in] timeout_us Timeout in microseconds. PVA_TIMEOUT_INF for infinite. * @param[in] timeout_us Timeout in microseconds. PVA_SUBMIT_TIMEOUT_INF for infinite.
* *
* @note Concurrent submission to the same queue needs to be serialized by the * @note Concurrent submission to the same queue needs to be serialized by the
* caller. * caller.
@@ -206,26 +219,6 @@ enum pva_error pva_memory_import_id_destroy(uint64_t import_id);
/** \brief Specifies the PVA system software minor version. */ /** \brief Specifies the PVA system software minor version. */
#define PVA_SYSSW_MINOR_VERSION (7U) #define PVA_SYSSW_MINOR_VERSION (7U)
/**
* @brief Get PVA system software version.
*
* PVA system software version is defined as the latest version of cuPVA which is fully supported
* by this version of the PVA system software.
*
* @param[out] version version of currently running system SW, computed as:
(PVA_SYSSW_MAJOR_VERSION * 1000) + PVA_SYSSW_MINOR_VERSION
* @return PVA_SUCCESS on success, else error code indicating the failure.
*/
enum pva_error pva_get_version(uint32_t *version);
/**
* @brief Get the hardware characteristics of the PVA.
*
* @param[out] pva_hw_char Pointer to the hardware characteristics.
*/
enum pva_error
pva_get_hw_characteristics(struct pva_characteristics *pva_hw_char);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
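
With pva_get_version() and pva_get_hw_characteristics() removed, the generic attribute query above is the replacement path. A minimal sketch, assuming the pva_attr keys and pva_ctx_attr_max_cmdbuf_chunk_size struct added to pva_api_types.h later in this commit, and that the public header is named pva_api.h.

#include "pva_api.h"   /* assumed header name for this API */

static enum pva_error query_max_chunk_size(struct pva_context *ctx,
					   uint16_t *out_size)
{
	struct pva_ctx_attr_max_cmdbuf_chunk_size attr = { 0 };
	enum pva_error err;

	err = pva_get_attribute(ctx, PVA_CONTEXT_ATTR_MAX_CMDBUF_CHUNK_SIZE,
				&attr, sizeof(attr));
	if (err == PVA_SUCCESS)
		*out_size = attr.max_size;
	return err;
}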

View File

@@ -5,13 +5,9 @@
#define PVA_API_CMDBUF_H #define PVA_API_CMDBUF_H
#include "pva_api_types.h" #include "pva_api_types.h"
//Maximum number of slots for maintaining Timestamps
#define PVA_MAX_QUERY_SLOTS_COUNT 32U
/** The common header for all commands. /** The common header for all commands.
*/ */
struct pva_cmd_header { struct pva_cmd_header {
#define PVA_CMD_PRIV_OPCODE_FLAG (1U << 7U)
/** Opcode for the command. MSB of opcode indicates whether this command is /** Opcode for the command. MSB of opcode indicates whether this command is
* privileged or not */ * privileged or not */
uint8_t opcode; uint8_t opcode;
@@ -35,6 +31,26 @@ struct pva_cmd_header {
uint8_t len; uint8_t len;
}; };
struct pva_dma_misr_config {
#define PVA_DMA_FLAG_MISR_ENABLE 1u
uint8_t enabled;
uint8_t reserved;
uint16_t channel_mask;
uint32_t seed_crc0;
uint32_t seed_crc1;
uint32_t ref_addr;
uint32_t ref_data_1;
uint32_t ref_data_2;
uint32_t misr_timeout;
};
struct pva_dma_misr {
uint32_t slot_mask_low0;
uint32_t slot_mask_low1;
uint32_t slot_mask_high;
struct pva_dma_misr_config misr_config;
};
struct pva_user_dma_allowance { struct pva_user_dma_allowance {
#define PVA_USER_DMA_ALLOWANCE_ADB_STEP_SIZE 8 #define PVA_USER_DMA_ALLOWANCE_ADB_STEP_SIZE 8
uint32_t channel_idx : 4; uint32_t channel_idx : 4;
@@ -189,11 +205,6 @@ struct pva_cmd_set_vpu_parameter_with_buffer {
uint32_t src_dram_offset_lo; uint32_t src_dram_offset_lo;
}; };
/** For set_vpu_parameter_with_address command, set this flag in header.flags to
* indicate that the target symbol is the legacy pointer symbol type:
* pva_fw_vpu_legacy_ptr_symbol, which only supports 32bit offset and 32bit
* size. */
#define PVA_CMD_FLAGS_USE_LEGACY_POINTER 0x1
/** Copy the address of a DRAM buffer to a VPU variable. The variable must be /** Copy the address of a DRAM buffer to a VPU variable. The variable must be
* laid out exactly according to pva_fw_vpu_ptr_symbol * laid out exactly according to pva_fw_vpu_ptr_symbol
*/ */
@@ -208,7 +219,6 @@ struct pva_cmd_set_vpu_parameter_with_address {
}; };
#define PVA_MAX_DMA_SETS_PER_DMA_ENGINE 4 #define PVA_MAX_DMA_SETS_PER_DMA_ENGINE 4
#define PVA_DMA_CONFIG_FETCH_BUFFER_PER_DMA_ENGINE 1
/** This command first acquires the TCM scratch and then fetches DMA configuration /** This command first acquires the TCM scratch and then fetches DMA configuration
* into the scratch. The command does not modify DMA * into the scratch. The command does not modify DMA
@@ -291,17 +301,7 @@ struct pva_cmd_run_ppe {
uint32_t entry_point_index; uint32_t entry_point_index;
}; };
#define PVA_BARRIER_GROUP_0 0U
#define PVA_BARRIER_GROUP_1 1U
#define PVA_BARRIER_GROUP_2 2U
#define PVA_BARRIER_GROUP_3 3U
#define PVA_BARRIER_GROUP_4 4U
#define PVA_BARRIER_GROUP_5 5U
#define PVA_BARRIER_GROUP_6 6U
#define PVA_BARRIER_GROUP_7 7U
#define PVA_MAX_BARRIER_GROUPS 8U #define PVA_MAX_BARRIER_GROUPS 8U
#define PVA_BARRIER_GROUP_INVALID 0xFFU #define PVA_BARRIER_GROUP_INVALID 0xFFU
/** /**
@@ -464,29 +464,15 @@ struct pva_cmd_set_vpu_instance_parameter {
uint32_t symbol_id; uint32_t symbol_id;
}; };
struct pva_cmd_run_unit_tests { struct pva_cmd_set_vpu_print_buffer {
#define PVA_CMD_OPCODE_RUN_UNIT_TESTS 30U #define PVA_CMD_OPCODE_SET_VPU_PRINT_BUFFER 30U
struct pva_cmd_header header; struct pva_cmd_header header;
#define PVA_FW_UTESTS_MAX_ARGC 16U uint32_t resource_id;
uint8_t argc; uint32_t offset;
uint8_t pad[3];
uint32_t in_resource_id;
uint32_t in_offset;
uint32_t in_size;
uint32_t out_resource_id;
uint32_t out_offset;
uint32_t out_size;
};
struct pva_cmd_set_vpu_print_cb {
#define PVA_CMD_OPCODE_SET_VPU_PRINT_CB 31U
struct pva_cmd_header header;
uint32_t cb_resource_id;
uint32_t cb_offset;
}; };
struct pva_cmd_invalidate_l2sram { struct pva_cmd_invalidate_l2sram {
#define PVA_CMD_OPCODE_INVALIDATE_L2SRAM 32U #define PVA_CMD_OPCODE_INVALIDATE_L2SRAM 31U
struct pva_cmd_header header; struct pva_cmd_header header;
uint8_t dram_offset_hi; uint8_t dram_offset_hi;
uint8_t pad[3]; uint8_t pad[3];
@@ -496,19 +482,18 @@ struct pva_cmd_invalidate_l2sram {
}; };
struct pva_cmd_flush_l2sram { struct pva_cmd_flush_l2sram {
#define PVA_CMD_OPCODE_FLUSH_L2SRAM 33U #define PVA_CMD_OPCODE_FLUSH_L2SRAM 32U
struct pva_cmd_header header; struct pva_cmd_header header;
uint8_t dram_offset_hi;
uint8_t pad[3];
uint32_t dram_resource_id;
uint32_t dram_offset_lo;
uint32_t l2sram_size;
struct pva_user_dma_allowance user_dma; struct pva_user_dma_allowance user_dma;
}; };
struct pva_cmd_err_inject {
#define PVA_CMD_OPCODE_ERR_INJECT 34U
struct pva_cmd_header header;
enum pva_error_inject_codes err_inject_code;
};
struct pva_cmd_patch_l2sram_offset { struct pva_cmd_patch_l2sram_offset {
#define PVA_CMD_OPCODE_PATCH_L2SRAM_OFFSET 35U #define PVA_CMD_OPCODE_PATCH_L2SRAM_OFFSET 33U
struct pva_cmd_header header; struct pva_cmd_header header;
uint8_t dma_set_id; uint8_t dma_set_id;
uint8_t slot_id; uint8_t slot_id;
@@ -520,130 +505,16 @@ struct pva_cmd_patch_l2sram_offset {
* mapped to a new logical barrier group. This allows re-using barrier ids within a command buffer. * mapped to a new logical barrier group. This allows re-using barrier ids within a command buffer.
*/ */
struct pva_cmd_retire_barrier_group { struct pva_cmd_retire_barrier_group {
#define PVA_CMD_OPCODE_RETIRE_BARRIER_GROUP 36U #define PVA_CMD_OPCODE_RETIRE_BARRIER_GROUP 34U
struct pva_cmd_header header; struct pva_cmd_header header;
}; };
struct pva_cmd_gr_check { struct pva_cmd_setup_misr {
#define PVA_CMD_OPCODE_GR_CHECK 37U #define PVA_CMD_OPCODE_SETUP_MISR 35U
struct pva_cmd_header header; struct pva_cmd_header header;
struct pva_dma_misr misr_params;
}; };
#define PVA_CMD_OPCODE_COUNT 38U #define PVA_CMD_OPCODE_MAX 36U
struct pva_cmd_init_resource_table {
#define PVA_CMD_OPCODE_INIT_RESOURCE_TABLE (0U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
/**< Resource table id is from 0 to 7, 0 is the device's resource table,
* 1-7 are users'. */
uint8_t resource_table_id;
uint8_t resource_table_addr_hi;
uint8_t pad[2];
uint32_t resource_table_addr_lo;
uint32_t max_n_entries;
};
struct pva_cmd_deinit_resource_table {
#define PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE (1U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
};
struct pva_cmd_update_resource_table {
#define PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE (2U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
uint32_t resource_id;
struct pva_resource_entry entry;
};
struct pva_cmd_init_queue {
#define PVA_CMD_OPCODE_INIT_QUEUE (3U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t queue_addr_hi;
uint8_t pad;
uint32_t queue_addr_lo;
uint32_t max_n_submits;
};
struct pva_cmd_deinit_queue {
#define PVA_CMD_OPCODE_DEINIT_QUEUE (4U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t pad[2];
};
struct pva_cmd_enable_fw_profiling {
#define PVA_CMD_OPCODE_ENABLE_FW_PROFILING (5U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t timestamp_type;
uint8_t pad[3];
uint32_t filter;
};
struct pva_cmd_disable_fw_profiling {
#define PVA_CMD_OPCODE_DISABLE_FW_PROFILING (6U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_get_tegra_stats {
#define PVA_CMD_OPCODE_GET_TEGRA_STATS (7U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t buffer_offset_hi;
bool enabled;
uint8_t pad[2];
uint32_t buffer_resource_id;
uint32_t buffer_size;
uint32_t buffer_offset_lo;
};
struct pva_cmd_suspend_fw {
#define PVA_CMD_OPCODE_SUSPEND_FW (8U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_resume_fw {
#define PVA_CMD_OPCODE_RESUME_FW (9U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_init_shared_dram_buffer {
#define PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER (10U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t buffer_iova_hi;
uint8_t pad[2];
uint32_t buffer_iova_lo;
uint32_t buffer_size;
};
struct pva_cmd_deinit_shared_dram_buffer {
#define PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER \
(11U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t pad[3];
};
struct pva_cmd_set_debug_log_level {
#define PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL (12U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t log_level;
};
struct pva_cmd_set_profiling_level {
#define PVA_CMD_OPCODE_SET_PROFILING_LEVEL (13U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t level;
};
#define PVA_CMD_PRIV_OPCODE_COUNT 14U
#define PVA_MAX_CMDBUF_CHUNK_LEN 1024
#define PVA_MAX_CMDBUF_CHUNK_SIZE (sizeof(uint32_t) * PVA_MAX_CMDBUF_CHUNK_LEN)
#endif // PVA_API_CMDBUF_H #endif // PVA_API_CMDBUF_H
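
A sketch of populating the new setup-MISR command from the structures above; every numeric value here is a placeholder rather than anything taken from this commit.

static void build_setup_misr(struct pva_cmd_setup_misr *cmd)
{
	cmd->header.opcode = PVA_CMD_OPCODE_SETUP_MISR;
	cmd->misr_params.slot_mask_low0 = 0x1U;  /* slots covered by the check */
	cmd->misr_params.slot_mask_low1 = 0x0U;
	cmd->misr_params.slot_mask_high = 0x0U;
	cmd->misr_params.misr_config.enabled = PVA_DMA_FLAG_MISR_ENABLE;
	cmd->misr_params.misr_config.channel_mask = 0x1U;
	cmd->misr_params.misr_config.seed_crc0 = 0xFFFFFFFFU;
	cmd->misr_params.misr_config.seed_crc1 = 0xFFFFFFFFU;
	cmd->misr_params.misr_config.ref_addr = 0x0U;
	cmd->misr_params.misr_config.ref_data_1 = 0x0U;
	cmd->misr_params.misr_config.ref_data_2 = 0x0U;
	cmd->misr_params.misr_config.misr_timeout = 1000U;
}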

View File

@@ -11,14 +11,14 @@ extern "C" {
#include "cuda.h" #include "cuda.h"
#include "pva_api_types.h" #include "pva_api_types.h"
/** #define PVA_CUEXTEND_MAX_NUM_PREFENCES 16
* @brief Structure for cuExtend queue data needed for command submission. #define PVA_CUEXTEND_MAX_NUM_POSTFENCES 16
*/
struct pva_cuextend_queue_data { struct pva_cuextend_submit_events {
/*! Holds a pointer to pva queue object */ struct pva_fence prefences[PVA_CUEXTEND_MAX_NUM_PREFENCES];
struct pva_queue *queue; struct pva_fence postfences[PVA_CUEXTEND_MAX_NUM_POSTFENCES];
/*! Holds engine affinity for command submission*/ uint32_t num_prefences;
uint32_t affinity; uint32_t num_postfences;
}; };
/** /**
@@ -71,27 +71,16 @@ typedef enum pva_error (*pva_cuextend_stream_unregister)(void *callback_args,
uint64_t flags); uint64_t flags);
/** /**
* @brief Function type for cuExtend acquire queue callback. * @brief Function type for submitting a batch of command buffers via a CUDA stream.
* *
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization. * @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register. * @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
* @param[out] queue_data Output pointer to a pva_cuextend_queue_data object. * @param[in] submit_payload Pointer to the submit payload.
* @return \ref pva_error The completion status of acquire queue operation. * @return \ref pva_error The completion status of the submit operation.
*/ */
typedef enum pva_error (*pva_cuextend_queue_acquire)( typedef enum pva_error (*pva_cuextend_stream_submit)(
void *callback_args, void *stream_payload, void *callback_args, void *stream_payload, void *submit_payload,
struct pva_cuextend_queue_data **queue_data); struct pva_cuextend_submit_events *submit_events);
/**
* @brief Function type for cuExtend release queue callback.
*
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
* @return \ref pva_error The completion status of release queue operation.
*/
typedef enum pva_error (*pva_cuextend_queue_release)(void *callback_args,
void *stream_payload,
void *queue_data);
/** /**
* @brief Function type for retrieving error code from cuExtend. * @brief Function type for retrieving error code from cuExtend.
@@ -128,12 +117,10 @@ struct pva_cuextend_callbacks {
pva_cuextend_stream_register stream_reg; pva_cuextend_stream_register stream_reg;
/*! Holds the unregister stream callback */ /*! Holds the unregister stream callback */
pva_cuextend_stream_unregister stream_unreg; pva_cuextend_stream_unregister stream_unreg;
/*! Holds the acquire queue callback */
pva_cuextend_queue_acquire queue_acquire;
/*! Holds the release queue callback */
pva_cuextend_queue_release queue_release;
/*! Holds the teardown callback */ /*! Holds the teardown callback */
pva_cuextend_teardown teardown; pva_cuextend_teardown teardown;
/*! Holds the stream submit callback */
pva_cuextend_stream_submit stream_submit;
/*! Pointer to the callback arguments provided by client during cuExtend initialization */ /*! Pointer to the callback arguments provided by client during cuExtend initialization */
void *args; void *args;
}; };
@@ -188,22 +175,32 @@ enum pva_error pva_cuextend_memory_import(struct pva_context *ctx,
/** /**
* @brief Submit a batch of command buffers via a CUDA stream. * @brief Submit a batch of command buffers via a CUDA stream.
* *
* @param[in] queue Pointer to the queue. If queue is not NULL, this API will try to submit the client tasks to this queue directly. * @param[in] ctx Pointer to the PVA context.
* Otherwise, it will call queue_acquire callback to query a pva_queue object from stream payload, and then submit * @param[in] cuStream A CUDA stream.
* the tasks to the queried queue. * @param[in] client_stream A client stream.
* @param[in] stream A CUDA stream. * @param[in] submit_payload Pointer to the submit payload.
* @param[in] submit_infos Array of submit info structures.
* @param[in] count Number of submit info structures.
* @param[in] timeout_ms Timeout in milliseconds. PVA_TIMEOUT_INF for infinite.
* @return \ref pva_error The completion status of the submit operation. * @return \ref pva_error The completion status of the submit operation.
*
* @note Concurrent submission to the same queue needs to be serialized by the
* caller.
*/ */
enum pva_error enum pva_error pva_cuextend_cmdbuf_batch_submit(struct pva_context *ctx,
pva_cuextend_cmdbuf_batch_submit(struct pva_queue *queue, CUstream stream, CUstream cuStream,
struct pva_cmdbuf_submit_info *submit_infos, void *client_stream,
uint32_t count, uint64_t timeout_ms); void *submit_payload);
/**
* @brief Get the payload associated with a CUDA stream.
*
* Returns the payload which was associated with the CUDA stream during registration callback.
*
* @param[in] ctx Pointer to the PVA context.
* @param[in] cuStream A CUDA stream.
* @param[out] stream_payload Pointer to the stream payload.
* @return PVA_SUCCESS if the stream payload is successfully retrieved
* PVA_BAD_PARAMETER_ERROR if any of the parameters are NULL
* PVA_CUDA_INIT_FAILED if the cuExtend was not initialized for the context
*/
enum pva_error pva_cuextend_get_stream_payload(struct pva_context *ctx,
CUstream cuStream,
void **stream_payload);
#ifdef __cplusplus #ifdef __cplusplus
} }
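
A sketch of how a client might wire the new stream_submit callback that replaces the queue_acquire/queue_release pair. my_submit and the NULL placeholders for the other callbacks are hypothetical; the callback signature and struct fields come from the hunk above.

#include <stddef.h>

static enum pva_error my_submit(void *callback_args, void *stream_payload,
				void *submit_payload,
				struct pva_cuextend_submit_events *submit_events)
{
	/* Inspect submit_events->prefences/postfences, launch the work for the
	 * stream, then return the completion status. */
	(void)callback_args;
	(void)stream_payload;
	(void)submit_payload;
	(void)submit_events;
	return PVA_SUCCESS;
}

static const struct pva_cuextend_callbacks my_callbacks = {
	.stream_reg = NULL,      /* client register callback goes here */
	.stream_unreg = NULL,    /* client unregister callback goes here */
	.teardown = NULL,        /* client teardown callback goes here */
	.stream_submit = my_submit,
	.args = NULL,
};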

View File

@@ -24,73 +24,8 @@ enum pva_gpio_bit {
GPIO_WRITE6_BIT = 29U GPIO_WRITE6_BIT = 29U
}; };
enum pva_dma_descriptor_id { #define PVA_DMA_DESC_ID_NULL 0
PVA_DMA_DESC_NONE = 0, #define PVA_DMA_DESC_ID_BASE 1
PVA_DMA_DESC0 = 1,
PVA_DMA_DESC1 = 2,
PVA_DMA_DESC2 = 3,
PVA_DMA_DESC3 = 4,
PVA_DMA_DESC4 = 5,
PVA_DMA_DESC5 = 6,
PVA_DMA_DESC6 = 7,
PVA_DMA_DESC7 = 8,
PVA_DMA_DESC8 = 9,
PVA_DMA_DESC9 = 10,
PVA_DMA_DESC10 = 11,
PVA_DMA_DESC11 = 12,
PVA_DMA_DESC12 = 13,
PVA_DMA_DESC13 = 14,
PVA_DMA_DESC14 = 15,
PVA_DMA_DESC15 = 16,
PVA_DMA_DESC16 = 17,
PVA_DMA_DESC17 = 18,
PVA_DMA_DESC18 = 19,
PVA_DMA_DESC19 = 20,
PVA_DMA_DESC20 = 21,
PVA_DMA_DESC21 = 22,
PVA_DMA_DESC22 = 23,
PVA_DMA_DESC23 = 24,
PVA_DMA_DESC24 = 25,
PVA_DMA_DESC25 = 26,
PVA_DMA_DESC26 = 27,
PVA_DMA_DESC27 = 28,
PVA_DMA_DESC28 = 29,
PVA_DMA_DESC29 = 30,
PVA_DMA_DESC30 = 31,
PVA_DMA_DESC31 = 32,
PVA_DMA_DESC32 = 33,
PVA_DMA_DESC33 = 34,
PVA_DMA_DESC34 = 35,
PVA_DMA_DESC35 = 36,
PVA_DMA_DESC36 = 37,
PVA_DMA_DESC37 = 38,
PVA_DMA_DESC38 = 39,
PVA_DMA_DESC39 = 40,
PVA_DMA_DESC40 = 41,
PVA_DMA_DESC41 = 42,
PVA_DMA_DESC42 = 43,
PVA_DMA_DESC43 = 44,
PVA_DMA_DESC44 = 45,
PVA_DMA_DESC45 = 46,
PVA_DMA_DESC46 = 47,
PVA_DMA_DESC47 = 48,
PVA_DMA_DESC48 = 49,
PVA_DMA_DESC49 = 50,
PVA_DMA_DESC50 = 51,
PVA_DMA_DESC51 = 52,
PVA_DMA_DESC52 = 53,
PVA_DMA_DESC53 = 54,
PVA_DMA_DESC54 = 55,
PVA_DMA_DESC55 = 56,
PVA_DMA_DESC56 = 57,
PVA_DMA_DESC57 = 58,
PVA_DMA_DESC58 = 59,
PVA_DMA_DESC59 = 60,
PVA_DMA_DESC60 = 61,
PVA_DMA_DESC61 = 62,
PVA_DMA_DESC62 = 63,
PVA_DMA_DESC63 = 64
};
/** /**
* The values of the enum members conform to the definitions of DMA descriptors' * The values of the enum members conform to the definitions of DMA descriptors'
@@ -266,8 +201,6 @@ struct pva_dma_config_header {
* means that every allocation of descriptors will start at an alignment of 4. The following * means that every allocation of descriptors will start at an alignment of 4. The following
* macros control the alignment/grouping requirement of DMA resources. * macros control the alignment/grouping requirement of DMA resources.
*/ */
// TODO: Add compile time asserts to ensure the following alignment requirments don't result
// in fractional resource partitions?
#define PVA_DMA_CHANNEL_ALIGNMENT 1 #define PVA_DMA_CHANNEL_ALIGNMENT 1
#define PVA_DMA_DESCRIPTOR_ALIGNMENT 4 #define PVA_DMA_DESCRIPTOR_ALIGNMENT 4
#define PVA_DMA_ADB_ALIGNMENT 16 #define PVA_DMA_ADB_ALIGNMENT 16
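
The 64-entry descriptor enum is replaced above by a null value plus a base offset; the removed names mapped descriptor N to N + 1. A one-line sketch of that arithmetic; the helper name and the 64-descriptor bound (taken from the removed enum) are assumptions.

#include <stdint.h>

/* Descriptor index 0..63 -> id 1..64; PVA_DMA_DESC_ID_NULL (0) replaces the
 * old PVA_DMA_DESC_NONE. */
static inline uint8_t pva_dma_desc_id(uint8_t desc_index)
{
	return (uint8_t)(PVA_DMA_DESC_ID_BASE + desc_index);
}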

View File

@@ -11,7 +11,7 @@
*/ */
struct pva_ops_memory { struct pva_ops_memory {
uint32_t handle; /**< Memory handle */ uint32_t handle; /**< Memory handle */
uint32_t size; /**< Size of memory */ uint64_t size; /**< Size of memory */
void *va; /**< Virtual address */ void *va; /**< Virtual address */
}; };
@@ -27,8 +27,8 @@ struct pva_ops_memory {
*/ */
struct pva_ops_buffer { struct pva_ops_buffer {
struct pva_ops_memory *memory; /**< Pointer to buffer memory */ struct pva_ops_memory *memory; /**< Pointer to buffer memory */
uint32_t start_offset; /**< Start offset in buffer memory */ uint64_t start_offset; /**< Start offset in buffer memory */
uint32_t end_offset; /**< End offset (exclusive) in buffer memory */ uint64_t end_offset; /**< End offset (exclusive) in buffer memory */
}; };
/** /**
@@ -45,9 +45,9 @@ struct pva_ops_buffer {
* @brief Header structure for PVA operations. * @brief Header structure for PVA operations.
*/ */
struct pva_ops_header { struct pva_ops_header {
uint32_t opcode; /**< Operation code identifying the operation type */ uint64_t opcode; /**< Operation code identifying the operation type */
/** Size of the operation in bytes. This size must be a multiple of 8 bytes. */ /** Size of the operation in bytes. This size must be a multiple of 8 bytes. */
uint32_t size; uint64_t size;
}; };
/** /**
@@ -56,8 +56,7 @@ struct pva_ops_header {
struct pva_ops_executable_register { struct pva_ops_executable_register {
#define PVA_OPS_OPCODE_EXECUTABLE_REGISTER 1U #define PVA_OPS_OPCODE_EXECUTABLE_REGISTER 1U
struct pva_ops_header header; /**< Operation header */ struct pva_ops_header header; /**< Operation header */
uint32_t exec_size; /**< Size of executable data */ uint64_t exec_size; /**< Size of executable data */
uint32_t pad; /**< Padding for 8 bytes alignment */
//followed by executable data //followed by executable data
}; };
@@ -144,7 +143,7 @@ enum pva_error pva_ops_parse_unregister_resp(struct pva_ops_buffer *resp_buf);
* *
* @return PVA_SUCCESS on success, appropriate error code otherwise. * @return PVA_SUCCESS on success, appropriate error code otherwise.
*/ */
enum pva_error pva_ops_memory_alloc(struct pva_context *ctx, uint32_t size, enum pva_error pva_ops_memory_alloc(struct pva_context *ctx, uint64_t size,
struct pva_ops_memory *ops_buf); struct pva_ops_memory *ops_buf);
/** /**

View File

@@ -118,6 +118,16 @@
ACT(PVA_ERR_MATH_OP) \ ACT(PVA_ERR_MATH_OP) \
ACT(PVA_ERR_HWSEQ_INVALID) \ ACT(PVA_ERR_HWSEQ_INVALID) \
ACT(PVA_ERR_FW_ABORTED) \ ACT(PVA_ERR_FW_ABORTED) \
ACT(PVA_ERR_PPE_DIVIDE_BY_0) \
ACT(PVA_ERR_PPE_FP_NAN) \
ACT(PVA_ERR_INVALID_ACCESS_MODE_COMBINATION) \
ACT(PVA_ERR_CMD_TCM_BUF_OUT_OF_RANGE) \
ACT(PVA_ERR_MISR_NOT_RUN) \
ACT(PVA_ERR_MISR_DATA) \
ACT(PVA_ERR_MISR_ADDR) \
ACT(PVA_ERR_MISR_NOT_DONE) \
ACT(PVA_ERR_MISR_ADDR_DATA) \
ACT(PVA_ERR_MISR_TIMEOUT) \
ACT(PVA_ERR_CODE_COUNT) ACT(PVA_ERR_CODE_COUNT)
enum pva_error { enum pva_error {
@@ -207,12 +217,6 @@ struct pva_fw_vpu_ptr_symbol {
uint64_t size; uint64_t size;
}; };
struct pva_fw_vpu_legacy_ptr_symbol {
uint64_t base;
uint32_t offset;
uint32_t size;
};
enum pva_surface_format { enum pva_surface_format {
PVA_SURF_FMT_PITCH_LINEAR = 0, PVA_SURF_FMT_PITCH_LINEAR = 0,
PVA_SURF_FMT_BLOCK_LINEAR PVA_SURF_FMT_BLOCK_LINEAR
@@ -243,25 +247,6 @@ enum pva_symbol_type {
PVA_SYM_TYPE_MAX, PVA_SYM_TYPE_MAX,
}; };
/**
* \brief Holds PVA Sync Client Type.
* Currently NvSciSync supports NvSciSyncFences with syncpoint primitive type only.
*/
enum pva_sync_client_type {
/*! For a given SyncObj PVA acts as a signaler. This type corresponds to
* postfences from PVA. */
PVA_SYNC_CLIENT_TYPE_SIGNALER,
/*! For a given SyncObj PVA acts as a waiter. This type corresponds to
* prefences to PVA. */
PVA_SYNC_CLIENT_TYPE_WAITER,
/*! For a given SyncObj PVA acts as both signaler and waiter. */
PVA_SYNC_CLIENT_TYPE_SIGNALER_WAITER,
/*! Specifies the non inclusive upper bound of valid values. */
PVA_SYNC_CLIENT_TYPE_MAX,
/*! Reserved bound of valid values. */
PVA_SYNC_CLIENT_TYPE_RESERVED = 0x7FFFFFFF,
};
#define PVA_SYMBOL_ID_INVALID 0U #define PVA_SYMBOL_ID_INVALID 0U
#define PVA_SYMBOL_ID_BASE 1U #define PVA_SYMBOL_ID_BASE 1U
#define PVA_MAX_SYMBOL_NAME_LEN 64U #define PVA_MAX_SYMBOL_NAME_LEN 64U
@@ -275,19 +260,6 @@ struct pva_symbol_info {
}; };
#define PVA_RESOURCE_ID_INVALID 0U #define PVA_RESOURCE_ID_INVALID 0U
#define PVA_RESOURCE_ID_BASE 1U
struct pva_resource_entry {
#define PVA_RESOURCE_TYPE_INVALID 0U
#define PVA_RESOURCE_TYPE_DRAM 1U
#define PVA_RESOURCE_TYPE_EXEC_BIN 2U
#define PVA_RESOURCE_TYPE_DMA_CONFIG 3U
uint8_t type;
uint8_t smmu_context_id;
uint8_t addr_hi;
uint8_t size_hi;
uint32_t addr_lo;
uint32_t size_lo;
};
/** \brief Maximum number of queues per context */ /** \brief Maximum number of queues per context */
#define PVA_MAX_QUEUES_PER_CONTEXT (8) #define PVA_MAX_QUEUES_PER_CONTEXT (8)
@@ -300,7 +272,8 @@ struct pva_resource_entry {
#define PVA_ACCESS_RW \ #define PVA_ACCESS_RW \
(PVA_ACCESS_RO | PVA_ACCESS_WO) /**< Read and write access */ (PVA_ACCESS_RO | PVA_ACCESS_WO) /**< Read and write access */
#define PVA_TIMEOUT_INF UINT64_MAX /**< Infinite timeout */ // unify timeout to uint64_t, in microseconds
#define PVA_SUBMIT_TIMEOUT_INF UINT64_MAX /**< Infinite timeout */
#define PVA_MAX_NUM_INPUT_STATUS 2 /**< Maximum number of input statuses */ #define PVA_MAX_NUM_INPUT_STATUS 2 /**< Maximum number of input statuses */
#define PVA_MAX_NUM_OUTPUT_STATUS 2 /**< Maximum number of output statuses */ #define PVA_MAX_NUM_OUTPUT_STATUS 2 /**< Maximum number of output statuses */
@@ -329,8 +302,9 @@ struct pva_cmdbuf_submit_info {
uint64_t submit_id; uint64_t submit_id;
/** Offset of the first chunk within the resource */ /** Offset of the first chunk within the resource */
uint64_t first_chunk_offset; uint64_t first_chunk_offset;
#define PVA_EXEC_TIMEOUT_REUSE 0xFFFFFFFFU /** Execution timeout is in ms */
#define PVA_EXEC_TIMEOUT_INF 0U #define PVA_EXEC_TIMEOUT_INF UINT32_MAX
#define PVA_EXEC_TIMEOUT_REUSE (UINT32_MAX - 1)
/** Execution Timeout */ /** Execution Timeout */
uint32_t execution_timeout_ms; uint32_t execution_timeout_ms;
struct pva_fence prefences[PVA_MAX_NUM_PREFENCES]; struct pva_fence prefences[PVA_MAX_NUM_PREFENCES];
@@ -351,13 +325,13 @@ struct pva_cmdbuf_status {
uint16_t status; uint16_t status;
}; };
/** \brief Holds the PVA capabilities. */ /** @brief Holds the PVA capabilities. */
struct pva_characteristics { struct pva_characteristics {
/*! Holds the number of PVA engines. */ /** Holds the number of PVA engines. */
uint32_t pva_engine_count; uint32_t pva_engine_count;
/*! Holds the number of VPUs per PVA engine. */ /** Holds the number of VPUs per PVA engine. */
uint32_t pva_pve_count; uint32_t pva_pve_count;
/*! Holds the PVA generation information */ /** Holds the PVA generation information */
enum pva_hw_gen hw_version; enum pva_hw_gen hw_version;
uint16_t max_desc_count; uint16_t max_desc_count;
uint16_t max_ch_count; uint16_t max_ch_count;
@@ -370,11 +344,6 @@ struct pva_characteristics {
uint16_t reserved_adb_count; uint16_t reserved_adb_count;
}; };
enum pva_error_inject_codes {
PVA_ERR_INJECT_WDT_HW_ERR, // watchdog Hardware error
PVA_ERR_INJECT_WDT_TIMEOUT, // watchdog Timeout error
};
/* /*
* !!!! DO NOT MODIFY !!!!!! * !!!! DO NOT MODIFY !!!!!!
* These values are defined as per DriveOS guidelines * These values are defined as per DriveOS guidelines
@@ -382,4 +351,20 @@ enum pva_error_inject_codes {
#define PVA_INPUT_STATUS_SUCCESS (0) #define PVA_INPUT_STATUS_SUCCESS (0)
#define PVA_INPUT_STATUS_INVALID (0xFFFF) #define PVA_INPUT_STATUS_INVALID (0xFFFF)
/**
* @brief Context attribute keys.
*/
enum pva_attr {
PVA_CONTEXT_ATTR_MAX_CMDBUF_CHUNK_SIZE,
PVA_ATTR_HW_CHARACTERISTICS,
PVA_ATTR_VERSION
};
/**
* @brief Maximum size of a command buffer chunk.
*/
struct pva_ctx_attr_max_cmdbuf_chunk_size {
uint16_t max_size;
};
#endif // PVA_API_TYPES_H #endif // PVA_API_TYPES_H
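
A short sketch of the revised timeout constants above: submit-level timeouts are now uniformly uint64_t microseconds with PVA_SUBMIT_TIMEOUT_INF (replacing PVA_TIMEOUT_INF), while per-command-buffer execution timeouts use UINT32_MAX for infinite and UINT32_MAX - 1 to reuse the previous value. The helper name is hypothetical and only the timeout fields are shown.

#include <stdint.h>

static void set_infinite_timeouts(struct pva_cmdbuf_submit_info *info,
				  uint64_t *submit_timeout_us)
{
	info->execution_timeout_ms = PVA_EXEC_TIMEOUT_INF;  /* UINT32_MAX */
	*submit_timeout_us = PVA_SUBMIT_TIMEOUT_INF;        /* UINT64_MAX */
}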

View File

@@ -23,113 +23,10 @@ struct pva_vpu_instance_data {
}; };
/** /**
* @defgroup PVA_VPU_SYSCALL * @brief Used to store VPU Syscall IDs, that represent the
* * vpu syscall id between FW and VPU kernel.
* @brief PVA VPU SYS call IDs for each type of
* SYS call.
* @{
*/
//! @cond DISABLE_DOCUMENTATION
/**
* @brief VPU Syscall id for vpu printf write.
*/
#define PVA_FW_PE_SYSCALL_ID_WRITE (1U)
//! @endcond
/**
* @brief VPU Syscall id for Icache prefetch.
*/
#define PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH (2U)
/**
* @brief VPU Syscall id for masking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION (3U)
/**
* @brief VPU Syscall id for unmasking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
//! @cond DISABLE_DOCUMENTATION
/**
* @brief VPU Syscall id for sampling VPU performance counters
*/
#define PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE (5U)
//! @endcond
/** @} */
/**
* @defgroup PVA_PPE_SYSCALL
*
* @brief PVA PPE SYS call IDs for each type of
* SYS call.
* @{
*/
//! @cond DISABLE_DOCUMENTATION
/**
* @brief PPE Syscall id for ppe printf write.
*/
#define PVA_FW_PPE_SYSCALL_ID_WRITE (1U)
/**
* @brief PPE Syscall id for masking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_MASK_EXCEPTION (2U)
/**
* @brief PPE Syscall id for unmasking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_UNMASK_EXCEPTION (3U)
/**
* @brief VPU Syscall id for sampling VPU performance counters
*/
#define PVA_FW_PPE_SYSCALL_ID_PERFMON_SAMPLE (4U)
/**
* @brief PPE Syscall id for Icache prefetch.
*/
#define PVA_FW_PPE_SYSCALL_ID_ICACHE_PREFETCH (5U)
//! @endcond
/** @} */
/**
* @brief Lookup table to convert PPE syscall IDs to VPU syscall IDs
* Index is PPE syscall ID, value is corresponding VPU syscall ID
*/
#define PVA_FW_PPE_TO_VPU_SYSCALL_LUT \
{ \
0U, /* Index 0: Invalid */ \
PVA_FW_PE_SYSCALL_ID_WRITE, /* Index 1: Write */ \
PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION, /* Index 2: Mask Exception */ \
PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION, /* Index 3: Unmask Exception */ \
PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE, /* Index 4: Perfmon Sample */ \
PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH /* Index 5: ICache Prefetch */ \
}
/**
* @brief Maximum valid PPE syscall ID
*/
#define PVA_FW_PPE_SYSCALL_ID_MAX PVA_FW_PPE_SYSCALL_ID_ICACHE_PREFETCH
/**
* @defgroup PVA_VPU_SYSCALL_WRITE_PARAM_GROUP
*
* @brief Parameter specification for syscall write
*/
/**
* @defgroup PVA_VPU_SYSCALL_COMMAND_FIELDS_GROUP
*
* @brief The command format to be used while issuing vpu syscall command from VPU kernel to R5.
* The fields mentioned in this group is used for submitting the command
* through the Signal_R5 interface from VPU kernel.
*
* @{
*/ */
typedef uint32_t pva_vpu_syscall_id_t;
/** /**
* @brief The most significant bit of the vpu syscall ID field in * @brief The most significant bit of the vpu syscall ID field in
@@ -154,17 +51,56 @@ struct pva_vpu_instance_data {
* the vpu syscall command interface * the vpu syscall command interface
*/ */
#define PVA_FW_PE_SYSCALL_PARAM_LSB (0U) #define PVA_FW_PE_SYSCALL_PARAM_LSB (0U)
/** @} */
/** /**
* @defgroup PVA_VPU_SYSCALL_ICACHE_PREFETCH_PARAM_FIELDS_GROUP * @brief VPU Syscall id for vpu printf write.
*
* @brief The parameter format to be used while issuing vpu syscall command from VPU kernel to R5 for syscall icache prefetch.
* The fields mentioned in this group is used for submitting the icache prefetch command
* through the Signal_R5 interface from VPU kernel.
*
* @{
*/ */
#define PVA_FW_PE_SYSCALL_ID_WRITE (1U)
/**
* @brief VPU Syscall id for Icache prefetch.
*/
#define PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH (2U)
/**
* @brief VPU Syscall id for masking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION (3U)
/**
* @brief VPU Syscall id for unmasking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
/**
* @brief VPU Syscall id for sampling VPU performance counters
*/
#define PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE (5U)
/**
* @brief PPE Syscall id for ppe printf write.
*/
#define PVA_FW_PPE_SYSCALL_ID_WRITE (1U)
/**
* @brief PPE Syscall id for Icache prefetch.
*/
#define PVA_FW_PPE_SYSCALL_ID_ICACHE_PREFETCH (2U)
/**
* @brief PPE Syscall id for masking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_MASK_EXCEPTION (3U)
/**
* @brief PPE Syscall id for unmasking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
/**
* @brief PPE Syscall id for sampling PPE performance counters
*/
#define PVA_FW_PPE_SYSCALL_ID_PERFMON_SAMPLE (5U)
/** /**
* @brief The most significant bit of the prefetch cache line count field in * @brief The most significant bit of the prefetch cache line count field in
@@ -189,23 +125,146 @@ struct pva_vpu_instance_data {
* the vpu syscall command interface * the vpu syscall command interface
*/ */
#define PVA_FW_PE_SYSCALL_PREFETCH_ADDR_LSB (0U) #define PVA_FW_PE_SYSCALL_PREFETCH_ADDR_LSB (0U)
/** @} */
/**
* @defgroup PVA_VPU_SYSCALL_MASK_UNMASK_PARAM_FIELDS_GROUP
*
* @brief The parameter format to be used while issuing vpu syscall command from VPU kernel
* to R5 for masking or unmasking FP NaN Exception.
* The fields mentioned in this group are used for submitting the mask and unmask FP NaN exception command
* through the Signal_R5 interface from VPU kernel.
*
* @{
*/
/**
* @brief Parameter specification for syscall mask/unmask exceptions
*/
#define PVA_FW_PE_MASK_DIV_BY_0 (1U << 1U)
#define PVA_FW_PE_MASK_FP_INV_NAN (1U << 2U)
/** @} */
/**
* @brief Write syscall parameter will be a pointer to this union
*/
union pva_fw_pe_syscall_write {
struct {
uint32_t addr;
uint32_t size;
} in;
struct {
uint32_t written_size;
} out;
};
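/*
 * Minimal usage sketch (assumption, not from the original header): fill the
 * "in" view before issuing PVA_FW_PE_SYSCALL_ID_WRITE with a pointer to this
 * union as the syscall parameter, then read written_size from the "out" view
 * after completion. buf_addr and buf_len are hypothetical caller values.
 */
static inline void example_fill_write_param(union pva_fw_pe_syscall_write *p,
					    uint32_t buf_addr, uint32_t buf_len)
{
	p->in.addr = buf_addr;
	p->in.size = buf_len;
	/* ...issue the syscall; p->out.written_size is valid afterwards. */
}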
/**
* @brief Perfmon sample syscall parameter will be a pointer to this struct
*/
struct pva_fw_pe_syscall_perfmon_sample {
/** counter_mask[0] is for counter IDs 0-31; counter_mask[1] is for IDs 32-63 */
uint32_t counter_mask[2];
uint32_t output_addr;
};
/**
* @brief Index for t26x performance counters for VPU
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T26X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T26X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T26X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T26X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T26X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T26X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T26X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T26X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T26X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T26X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T26X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T26X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T26X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T26X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T26X (19U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_FETCH_REQ_T26X (20U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X (21U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREEMPT_T26X (22U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_LINES_T26X (23U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_DUR_T26X (24U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T26X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T26X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T26X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T26X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T26X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T26X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T26X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T26X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T26X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T26X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T26X (36U)
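/*
 * Hedged example: request sampling of two of the T26x VPU counters listed
 * above. Word/bit placement follows the counter_mask comment in
 * struct pva_fw_pe_syscall_perfmon_sample (word = ID / 32, bit = ID % 32);
 * out_addr is a hypothetical destination address owned by the caller.
 */
static inline void
example_select_t26x_counters(struct pva_fw_pe_syscall_perfmon_sample *s,
			     uint32_t out_addr)
{
	s->counter_mask[0] = 0U;
	s->counter_mask[1] = 0U;
	s->counter_mask[PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X / 32U] |=
		1U << (PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X % 32U);
	s->counter_mask[PERFMON_COUNTER_ID_DLUT_BUSY_T26X / 32U] |=
		1U << (PERFMON_COUNTER_ID_DLUT_BUSY_T26X % 32U);
	s->output_addr = out_addr;
}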
/**
* @brief Index for t23x performance counters
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T23X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T23X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T23X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T23X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T23X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T23X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T23X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T23X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T23X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T23X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T23X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T23X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T23X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T23X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T23X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T23X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T23X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T23X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T23X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T23X (19U)
#define PERFMON_COUNTER_ID_ICACHE_FETCH_REQ_T23X (20U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_T23X (21U)
#define PERFMON_COUNTER_ID_ICACHE_PREEMP_T23X (22U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_LINES_T23X (23U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_DUR_T23X (24U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_DUR_T23X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T23X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T23X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T23X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T23X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T23X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T23X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T23X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T23X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T23X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T23X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T23X (36U)
/**
* @brief Index for t26x performance counters for PPE
*/
#define PERFMON_COUNTER_ID_PPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_PPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_LD_DEPENDENCY_T26X (5U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_ST_DEPENDENCY_T26X (6U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_DEPENDENCY_T26X (7U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STRM_STORE_FLUSH_T26X (8U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_STORE_FLUSH_T26X (9U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STORE_FLUSH_T26X (10U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_LD_T26X (11U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_ST_T26X (12U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_T26X (13U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LD_T26X (14U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_ST_T26X (15U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LDST_T26X (16U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_PUSHBACK_T26X (17U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STQ_PUSHBACK_T26X (18U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_FLUSH_T26X (19U)
#define PERFMON_COUNTER_ID_PPS_WFE_GPI_EX_STATE_T26X (20U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_FETCH_REQ_T26X (21U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_T26X (22U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREEMPT_T26X (23U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_LINES_T26X (24U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_DUR_T26X (26U)
#endif // PVA_API_VPU_H

View File

@@ -2,17 +2,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_abort.h" #include "pva_kmd_abort.h"
#include "pva_kmd_shim_init.h" #include "pva_kmd_device.h"
#include "pva_kmd_regs.h"
#include "pva_kmd_silicon_utils.h"
void pva_kmd_abort(struct pva_kmd_device *pva) void pva_kmd_abort_fw(struct pva_kmd_device *pva)
{ {
//TODO: Report to FSI first about the SW error code. // HW watchdog may fire repeatedly if PVA is hung. Therefore, disable all
pva_kmd_log_err("Abort: FW Reset Assert"); // interrupts to protect KMD from potential interrupt floods.
/* Put the FW in reset ASSERT so the user space pva_kmd_disable_all_interrupts_nosync(pva);
cannot access the CCQ and thus force them to
destroy the contexts. On destroy all the contexts. // We will handle firmware reboot after all contexts are closed and a new
KMD poweroff the FW whereas on first new contexts creation, // one is re-opened again
KMD will load the firmware image & poweron device */
pva_kmd_fw_reset_assert(pva);
pva->recovery = true; pva->recovery = true;
} }

View File

@@ -5,6 +5,6 @@
#include "pva_kmd_device.h" #include "pva_kmd_device.h"
#include "pva_kmd_utils.h" #include "pva_kmd_utils.h"
void pva_kmd_abort(struct pva_kmd_device *pva); void pva_kmd_abort_fw(struct pva_kmd_device *pva);
#endif //PVA_KMD_ABORT_H #endif //PVA_KMD_ABORT_H

View File

@@ -53,13 +53,12 @@ static inline uint32_t next_slot(struct pva_kmd_block_allocator *allocator,
return *next; return *next;
} }
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator, void *pva_kmd_alloc_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id) uint32_t *out_id)
{ {
void *block = NULL; void *block = NULL;
uint32_t slot = INVALID_ID; uint32_t slot = INVALID_ID;
pva_kmd_mutex_lock(&allocator->allocator_lock);
if (allocator->free_slot_head != INVALID_ID) { if (allocator->free_slot_head != INVALID_ID) {
slot = allocator->free_slot_head; slot = allocator->free_slot_head;
allocator->free_slot_head = allocator->free_slot_head =
@@ -69,18 +68,24 @@ void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
slot = allocator->next_free_slot; slot = allocator->next_free_slot;
allocator->next_free_slot++; allocator->next_free_slot++;
} else { } else {
goto unlock; return NULL;
} }
} }
allocator->slot_in_use[slot] = true; allocator->slot_in_use[slot] = true;
pva_kmd_mutex_unlock(&allocator->allocator_lock);
*out_id = slot + allocator->base_id; *out_id = slot + allocator->base_id;
block = get_block(allocator, slot); block = get_block(allocator, slot);
return block; return block;
unlock: }
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id)
{
void *block = NULL;
pva_kmd_mutex_lock(&allocator->allocator_lock);
block = pva_kmd_alloc_block_unsafe(allocator, out_id);
pva_kmd_mutex_unlock(&allocator->allocator_lock); pva_kmd_mutex_unlock(&allocator->allocator_lock);
return NULL; return block;
} }
static bool is_slot_valid(struct pva_kmd_block_allocator *allocator, static bool is_slot_valid(struct pva_kmd_block_allocator *allocator,
@@ -103,16 +108,15 @@ void *pva_kmd_get_block_unsafe(struct pva_kmd_block_allocator *allocator,
return get_block(allocator, slot); return get_block(allocator, slot);
} }
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator, enum pva_error
pva_kmd_free_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t id) uint32_t id)
{ {
uint32_t slot = id - allocator->base_id; uint32_t slot = id - allocator->base_id;
uint32_t *next; uint32_t *next;
enum pva_error err = PVA_SUCCESS;
pva_kmd_mutex_lock(&allocator->allocator_lock);
if (!is_slot_valid(allocator, slot)) { if (!is_slot_valid(allocator, slot)) {
err = PVA_INVAL; return PVA_INVAL;
goto unlock;
} }
allocator->slot_in_use[slot] = false; allocator->slot_in_use[slot] = false;
@@ -120,7 +124,16 @@ enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
*next = allocator->free_slot_head; *next = allocator->free_slot_head;
allocator->free_slot_head = slot; allocator->free_slot_head = slot;
unlock: return PVA_SUCCESS;
}
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
uint32_t id)
{
enum pva_error err = PVA_SUCCESS;
pva_kmd_mutex_lock(&allocator->allocator_lock);
err = pva_kmd_free_block_unsafe(allocator, id);
pva_kmd_mutex_unlock(&allocator->allocator_lock); pva_kmd_mutex_unlock(&allocator->allocator_lock);
return err; return err;
} }
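/*
 * Sketch of how the new _unsafe variants are meant to be used (assumed from
 * the diff above): a caller that needs several allocator operations to be
 * atomic takes allocator_lock once and calls the _unsafe functions inside the
 * critical section instead of the self-locking wrappers. The helper name and
 * the choice of PVA_INVAL on pool exhaustion are illustrative only.
 */
static enum pva_error
example_replace_block(struct pva_kmd_block_allocator *allocator,
		      uint32_t old_id, uint32_t *new_id)
{
	enum pva_error err;
	void *block;

	pva_kmd_mutex_lock(&allocator->allocator_lock);
	err = pva_kmd_free_block_unsafe(allocator, old_id);
	if (err == PVA_SUCCESS) {
		block = pva_kmd_alloc_block_unsafe(allocator, new_id);
		if (block == NULL) {
			err = PVA_INVAL;
		}
	}
	pva_kmd_mutex_unlock(&allocator->allocator_lock);
	return err;
}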

View File

@@ -24,6 +24,8 @@ pva_kmd_block_allocator_init(struct pva_kmd_block_allocator *allocator,
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator, void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id); uint32_t *out_id);
void *pva_kmd_alloc_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id);
static inline void * static inline void *
pva_kmd_zalloc_block(struct pva_kmd_block_allocator *allocator, pva_kmd_zalloc_block(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id) uint32_t *out_id)
@@ -47,6 +49,9 @@ void *pva_kmd_get_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t id); uint32_t id);
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator, enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
uint32_t id); uint32_t id);
enum pva_error
pva_kmd_free_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t id);
void pva_kmd_block_allocator_deinit(struct pva_kmd_block_allocator *allocator); void pva_kmd_block_allocator_deinit(struct pva_kmd_block_allocator *allocator);

View File

@@ -143,6 +143,7 @@ static inline void pva_kmd_set_cmd_init_resource_table(
struct pva_cmd_init_resource_table *cmd, uint8_t resource_table_id, struct pva_cmd_init_resource_table *cmd, uint8_t resource_table_id,
uint64_t iova_addr, uint32_t max_num_entries) uint64_t iova_addr, uint32_t max_num_entries)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_INIT_RESOURCE_TABLE; cmd->header.opcode = PVA_CMD_OPCODE_INIT_RESOURCE_TABLE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_table_id = resource_table_id; cmd->resource_table_id = resource_table_id;
@@ -155,6 +156,7 @@ static inline void
pva_kmd_set_cmd_deinit_resource_table(struct pva_cmd_deinit_resource_table *cmd, pva_kmd_set_cmd_deinit_resource_table(struct pva_cmd_deinit_resource_table *cmd,
uint8_t resource_table_id) uint8_t resource_table_id)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE; cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_table_id = resource_table_id; cmd->resource_table_id = resource_table_id;
@@ -162,22 +164,29 @@ pva_kmd_set_cmd_deinit_resource_table(struct pva_cmd_deinit_resource_table *cmd,
static inline void pva_kmd_set_cmd_init_queue(struct pva_cmd_init_queue *cmd, static inline void pva_kmd_set_cmd_init_queue(struct pva_cmd_init_queue *cmd,
uint8_t ccq_id, uint8_t queue_id, uint8_t ccq_id, uint8_t queue_id,
uint64_t iova_addr, uint64_t queue_addr,
uint32_t max_num_submit) uint32_t max_num_submit,
uint32_t syncpt_id,
uint64_t syncpt_addr)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_INIT_QUEUE; cmd->header.opcode = PVA_CMD_OPCODE_INIT_QUEUE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->ccq_id = ccq_id; cmd->ccq_id = ccq_id;
cmd->queue_id = queue_id; cmd->queue_id = queue_id;
cmd->queue_addr_lo = iova_lo(iova_addr); cmd->queue_addr_lo = iova_lo(queue_addr);
cmd->queue_addr_hi = iova_hi(iova_addr); cmd->queue_addr_hi = iova_hi(queue_addr);
cmd->max_n_submits = max_num_submit; cmd->max_n_submits = max_num_submit;
cmd->syncpt_id = syncpt_id;
cmd->syncpt_addr_lo = iova_lo(syncpt_addr);
cmd->syncpt_addr_hi = iova_hi(syncpt_addr);
} }
static inline void static inline void
pva_kmd_set_cmd_deinit_queue(struct pva_cmd_deinit_queue *cmd, uint8_t ccq_id, pva_kmd_set_cmd_deinit_queue(struct pva_cmd_deinit_queue *cmd, uint8_t ccq_id,
uint8_t queue_id) uint8_t queue_id)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_QUEUE; cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_QUEUE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->ccq_id = ccq_id; cmd->ccq_id = ccq_id;
@@ -188,6 +197,7 @@ static inline void pva_kmd_set_cmd_update_resource_table(
struct pva_cmd_update_resource_table *cmd, uint32_t resource_table_id, struct pva_cmd_update_resource_table *cmd, uint32_t resource_table_id,
uint32_t resource_id, struct pva_resource_entry const *entry) uint32_t resource_id, struct pva_resource_entry const *entry)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE; cmd->header.opcode = PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_table_id = resource_table_id; cmd->resource_table_id = resource_table_id;
@@ -199,6 +209,7 @@ static inline void
pva_kmd_set_cmd_unregister_resource(struct pva_cmd_unregister_resource *cmd, pva_kmd_set_cmd_unregister_resource(struct pva_cmd_unregister_resource *cmd,
uint32_t resource_id) uint32_t resource_id)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_UNREGISTER_RESOURCE; cmd->header.opcode = PVA_CMD_OPCODE_UNREGISTER_RESOURCE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_id = resource_id; cmd->resource_id = resource_id;
@@ -208,6 +219,7 @@ static inline void
pva_kmd_set_cmd_enable_fw_profiling(struct pva_cmd_enable_fw_profiling *cmd, pva_kmd_set_cmd_enable_fw_profiling(struct pva_cmd_enable_fw_profiling *cmd,
uint32_t filter, uint8_t timestamp_type) uint32_t filter, uint8_t timestamp_type)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_ENABLE_FW_PROFILING; cmd->header.opcode = PVA_CMD_OPCODE_ENABLE_FW_PROFILING;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->filter = filter; cmd->filter = filter;
@@ -217,6 +229,7 @@ pva_kmd_set_cmd_enable_fw_profiling(struct pva_cmd_enable_fw_profiling *cmd,
static inline void static inline void
pva_kmd_set_cmd_disable_fw_profiling(struct pva_cmd_disable_fw_profiling *cmd) pva_kmd_set_cmd_disable_fw_profiling(struct pva_cmd_disable_fw_profiling *cmd)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DISABLE_FW_PROFILING; cmd->header.opcode = PVA_CMD_OPCODE_DISABLE_FW_PROFILING;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
} }
@@ -225,6 +238,7 @@ static inline void pva_kmd_set_cmd_get_tegra_stats(
struct pva_cmd_get_tegra_stats *cmd, uint32_t buffer_resource_id, struct pva_cmd_get_tegra_stats *cmd, uint32_t buffer_resource_id,
uint32_t buffer_size, uint64_t offset, bool enabled) uint32_t buffer_size, uint64_t offset, bool enabled)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_GET_TEGRA_STATS; cmd->header.opcode = PVA_CMD_OPCODE_GET_TEGRA_STATS;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->buffer_resource_id = buffer_resource_id; cmd->buffer_resource_id = buffer_resource_id;
@@ -238,6 +252,7 @@ static inline void
pva_kmd_set_cmd_set_debug_log_level(struct pva_cmd_set_debug_log_level *cmd, pva_kmd_set_cmd_set_debug_log_level(struct pva_cmd_set_debug_log_level *cmd,
uint32_t log_level) uint32_t log_level)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL; cmd->header.opcode = PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->log_level = log_level; cmd->log_level = log_level;
@@ -245,24 +260,23 @@ pva_kmd_set_cmd_set_debug_log_level(struct pva_cmd_set_debug_log_level *cmd,
static inline void pva_kmd_set_cmd_suspend_fw(struct pva_cmd_suspend_fw *cmd) static inline void pva_kmd_set_cmd_suspend_fw(struct pva_cmd_suspend_fw *cmd)
{ {
uint64_t len = (sizeof(*cmd) / sizeof(uint32_t)); memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_SUSPEND_FW; cmd->header.opcode = PVA_CMD_OPCODE_SUSPEND_FW;
ASSERT(len <= 255u); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->header.len = (uint8_t)(len);
} }
static inline void pva_kmd_set_cmd_resume_fw(struct pva_cmd_resume_fw *cmd) static inline void pva_kmd_set_cmd_resume_fw(struct pva_cmd_resume_fw *cmd)
{ {
uint64_t len = (sizeof(*cmd) / sizeof(uint32_t)); memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_RESUME_FW; cmd->header.opcode = PVA_CMD_OPCODE_RESUME_FW;
ASSERT(len <= 255u); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->header.len = (uint8_t)(len);
} }
static inline void pva_kmd_set_cmd_init_shared_dram_buffer( static inline void pva_kmd_set_cmd_init_shared_dram_buffer(
struct pva_cmd_init_shared_dram_buffer *cmd, uint8_t interface, struct pva_cmd_init_shared_dram_buffer *cmd, uint8_t interface,
uint32_t buffer_iova, uint32_t buffer_size) uint32_t buffer_iova, uint32_t buffer_size)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER; cmd->header.opcode = PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->buffer_iova_hi = iova_hi(buffer_iova); cmd->buffer_iova_hi = iova_hi(buffer_iova);
@@ -274,6 +288,7 @@ static inline void pva_kmd_set_cmd_init_shared_dram_buffer(
static inline void pva_kmd_set_cmd_deinit_shared_dram_buffer( static inline void pva_kmd_set_cmd_deinit_shared_dram_buffer(
struct pva_cmd_deinit_shared_dram_buffer *cmd, uint8_t interface) struct pva_cmd_deinit_shared_dram_buffer *cmd, uint8_t interface)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER; cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->interface = interface; cmd->interface = interface;
@@ -283,8 +298,12 @@ static inline void
pva_kmd_set_cmd_set_profiling_level(struct pva_cmd_set_profiling_level *cmd, pva_kmd_set_cmd_set_profiling_level(struct pva_cmd_set_profiling_level *cmd,
uint32_t level) uint32_t level)
{ {
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_SET_PROFILING_LEVEL; cmd->header.opcode = PVA_CMD_OPCODE_SET_PROFILING_LEVEL;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t); cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->level = level; cmd->level = level;
} }
#define CMD_LEN(cmd_type) (sizeof(cmd_type) / sizeof(uint32_t))
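/*
 * Illustrative sketch only: CMD_LEN() sizes a scratch buffer in 32-bit words
 * so several commands can be packed back to back and handed to
 * pva_kmd_submit_cmd_sync() in one call. pva_offset_pointer(),
 * pva_kmd_submit_cmd_sync() and the timeout macros are used the same way as
 * in the context-init change further below; the log level and profiling
 * level values are made up.
 */
static inline enum pva_error
example_submit_two_cmds(struct pva_kmd_submitter *submitter)
{
	uint32_t scratch[CMD_LEN(struct pva_cmd_set_debug_log_level) +
			 CMD_LEN(struct pva_cmd_set_profiling_level)];
	struct pva_cmd_set_debug_log_level *log_cmd;
	struct pva_cmd_set_profiling_level *prof_cmd;

	log_cmd = (struct pva_cmd_set_debug_log_level *)pva_offset_pointer(
		&scratch[0], 0);
	prof_cmd = (struct pva_cmd_set_profiling_level *)pva_offset_pointer(
		&scratch[0], sizeof(*log_cmd));

	pva_kmd_set_cmd_set_debug_log_level(log_cmd, 2U);
	pva_kmd_set_cmd_set_profiling_level(prof_cmd, 1U);

	return pva_kmd_submit_cmd_sync(submitter, scratch, sizeof(scratch),
				       PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
				       PVA_KMD_WAIT_FW_TIMEOUT_US);
}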
#endif // PVA_KMD_CMDBUF_H #endif // PVA_KMD_CMDBUF_H

View File

@@ -23,7 +23,7 @@
// clang-format off // clang-format off
#if PVA_BUILD_MODE == PVA_BUILD_MODE_SIM #if PVA_BUILD_MODE == PVA_BUILD_MODE_SIM
#define PVA_KMD_TIMEOUT_FACTOR 100 #define PVA_KMD_TIMEOUT_FACTOR 100
#elif (PVA_BUILD_MODE == PVA_BUILD_MODE_NATIVE) && (PVA_IS_DEBUG == 1) #elif (PVA_BUILD_MODE == PVA_BUILD_MODE_NATIVE)
// On native builds, the FW calls the KMD's shared buffer handler in its // On native builds, the FW calls the KMD's shared buffer handler in its
// own thread. In debug builds, if there are a large number of messages // own thread. In debug builds, if there are a large number of messages
// (prints, unregister, etc.), this handler might take a while to execute, // (prints, unregister, etc.), this handler might take a while to execute,
@@ -42,22 +42,16 @@
#define PVA_KMD_WAIT_FW_POLL_INTERVAL_US PVA_KMD_TIMEOUT(100) /*< 100 us*/ #define PVA_KMD_WAIT_FW_POLL_INTERVAL_US PVA_KMD_TIMEOUT(100) /*< 100 us*/
#define PVA_KMD_FW_BOOT_TIMEOUT_MS PVA_KMD_TIMEOUT(1000) /*< 1 seconds */ #define PVA_KMD_FW_BOOT_TIMEOUT_MS PVA_KMD_TIMEOUT(1000) /*< 1 seconds */
#define PVA_NUM_RW_SYNCPTS 56 #define PVA_NUM_RW_SYNCPTS (PVA_MAX_NUM_CCQ * PVA_NUM_RW_SYNCPTS_PER_CONTEXT)
// clang-format off // clang-format off
#if PVA_DEV_MAIN_COMPATIBLE == 1 #if PVA_DEV_MAIN_COMPATIBLE == 1
#define PVA_KMD_LOAD_FROM_GSC_DEFAULT true #define PVA_KMD_LOAD_FROM_GSC_DEFAULT true
#if PVA_SAFETY == 1
#define PVA_KMD_APP_AUTH_DEFAULT true
#else
#define PVA_KMD_APP_AUTH_DEFAULT false
#endif
#else #else
#define PVA_KMD_LOAD_FROM_GSC_DEFAULT false #define PVA_KMD_LOAD_FROM_GSC_DEFAULT false
#define PVA_KMD_APP_AUTH_DEFAULT false
#endif #endif
// clang-format on // clang-format on
#define PVA_KMD_MAX_NUM_USER_DMA_CONFIG 1024 #define PVA_KMD_DMA_CONFIG_POOL_INCR 256
#endif // PVA_KMD_CONSTANTS_H #endif // PVA_KMD_CONSTANTS_H

View File

@@ -67,118 +67,86 @@ err_out:
static enum pva_error notify_fw_context_init(struct pva_kmd_context *ctx) static enum pva_error notify_fw_context_init(struct pva_kmd_context *ctx)
{ {
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter; struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter;
struct pva_cmd_init_resource_table *res_cmd; struct pva_cmd_init_resource_table *res_cmd;
struct pva_cmd_init_queue *queue_cmd; struct pva_cmd_init_queue *queue_cmd;
struct pva_cmd_update_resource_table *update_cmd; struct pva_cmd_update_resource_table *update_cmd;
struct pva_resource_entry entry = { 0 }; struct pva_resource_entry entry = { 0 };
uint32_t fence_val; const struct pva_syncpt_rw_info *syncpt_info;
enum pva_error err; enum pva_error err;
uint32_t current_offset = 0;
uint32_t cmd_scratch[CMD_LEN(struct pva_cmd_init_resource_table) +
CMD_LEN(struct pva_cmd_init_queue) +
CMD_LEN(struct pva_cmd_update_resource_table)];
err = pva_kmd_submitter_prepare(dev_submitter, &builder); res_cmd = (struct pva_cmd_init_resource_table *)pva_offset_pointer(
if (err != PVA_SUCCESS) { &cmd_scratch[0], current_offset);
goto err_out; current_offset += sizeof(*res_cmd);
}
res_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*res_cmd)); queue_cmd = (struct pva_cmd_init_queue *)pva_offset_pointer(
ASSERT(res_cmd != NULL); &cmd_scratch[0], current_offset);
current_offset += sizeof(*queue_cmd);
update_cmd = (struct pva_cmd_update_resource_table *)pva_offset_pointer(
&cmd_scratch[0], current_offset);
current_offset += sizeof(*update_cmd);
pva_kmd_set_cmd_init_resource_table( pva_kmd_set_cmd_init_resource_table(
res_cmd, ctx->resource_table_id, res_cmd, ctx->resource_table_id,
ctx->ctx_resource_table.table_mem->iova, ctx->ctx_resource_table.table_mem->iova,
ctx->ctx_resource_table.n_entries); ctx->ctx_resource_table.n_entries);
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd)); syncpt_info = pva_kmd_queue_get_rw_syncpt_info(ctx, ctx->ccq_id);
ASSERT(queue_cmd != NULL);
pva_kmd_set_cmd_init_queue( pva_kmd_set_cmd_init_queue(
queue_cmd, PVA_PRIV_CCQ_ID, queue_cmd, PVA_PRIV_CCQ_ID,
ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/ ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/
ctx->ctx_queue.queue_memory->iova, ctx->ctx_queue.queue_memory->iova,
ctx->ctx_queue.max_num_submit); ctx->ctx_queue.max_num_submit, syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
update_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*update_cmd));
ASSERT(update_cmd != NULL);
err = pva_kmd_make_resource_entry(&ctx->pva->dev_resource_table, err = pva_kmd_make_resource_entry(&ctx->pva->dev_resource_table,
ctx->submit_memory_resource_id, ctx->submit_memory_resource_id,
&entry); &entry);
ASSERT(err == PVA_SUCCESS); ASSERT(err == PVA_SUCCESS);
pva_kmd_set_cmd_update_resource_table(update_cmd, pva_kmd_set_cmd_update_resource_table(update_cmd,
0, /* KMD's resource table ID */ 0, /* KMD's resource table ID */
ctx->submit_memory_resource_id, ctx->submit_memory_resource_id,
&entry); &entry);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val); err = pva_kmd_submit_cmd_sync(dev_submitter, cmd_scratch,
if (err != PVA_SUCCESS) { sizeof(cmd_scratch),
// Error is either QUEUE_FULL or TIMEDOUT
goto cancel_builder;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when initializing context");
goto err_out;
}
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
return err; return err;
} }
static enum pva_error notify_fw_context_deinit(struct pva_kmd_context *ctx) static enum pva_error notify_fw_context_deinit(struct pva_kmd_context *ctx)
{ {
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter; struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter;
struct pva_cmd_deinit_resource_table *deinit_table_cmd; struct pva_cmd_deinit_resource_table *deinit_table_cmd;
struct pva_cmd_deinit_queue *deinit_queue_cmd; struct pva_cmd_deinit_queue *deinit_queue_cmd;
uint32_t fence_val; uint32_t cmd_scratch[CMD_LEN(struct pva_cmd_deinit_queue) +
CMD_LEN(struct pva_cmd_deinit_resource_table)];
enum pva_error err; enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder); deinit_queue_cmd = (struct pva_cmd_deinit_queue *)pva_offset_pointer(
if (err != PVA_SUCCESS) { &cmd_scratch[0], 0);
goto err_out; deinit_table_cmd =
} (struct pva_cmd_deinit_resource_table *)pva_offset_pointer(
&cmd_scratch[0], sizeof(struct pva_cmd_deinit_queue));
deinit_queue_cmd =
pva_kmd_reserve_cmd_space(&builder, sizeof(*deinit_queue_cmd));
ASSERT(deinit_queue_cmd != NULL);
pva_kmd_set_cmd_deinit_queue( pva_kmd_set_cmd_deinit_queue(
deinit_queue_cmd, PVA_PRIV_CCQ_ID, deinit_queue_cmd, PVA_PRIV_CCQ_ID,
ctx->ccq_id /* For privileged queues, queue ID == user CCQ ID*/ ctx->ccq_id /* For privileged queues, queue ID == user CCQ ID*/
); );
deinit_table_cmd =
pva_kmd_reserve_cmd_space(&builder, sizeof(*deinit_table_cmd));
ASSERT(deinit_table_cmd != NULL);
pva_kmd_set_cmd_deinit_resource_table(deinit_table_cmd, pva_kmd_set_cmd_deinit_resource_table(deinit_table_cmd,
ctx->resource_table_id); ctx->resource_table_id);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val); err = pva_kmd_submit_cmd_sync(dev_submitter, cmd_scratch,
if (err != PVA_SUCCESS) { sizeof(cmd_scratch),
goto cancel_builder;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when deinitializing context");
goto err_out;
}
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
return err; return err;
} }
@@ -189,20 +157,24 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
uint32_t queue_mem_size; uint32_t queue_mem_size;
uint64_t chunk_mem_size; uint64_t chunk_mem_size;
struct pva_fw_postfence post_fence = { 0 }; struct pva_fw_postfence post_fence = { 0 };
struct pva_syncpt_rw_info *syncpts;
uint64_t size; uint64_t size;
/* Allocate RW syncpoints for this context */ if (ctx->inited) {
syncpts = (struct pva_syncpt_rw_info *)pva_kmd_alloc_block( err = PVA_INVAL;
&ctx->pva->syncpt_allocator, &ctx->syncpt_block_index); goto err_out;
ASSERT(syncpts != NULL); }
if (res_table_capacity == 0u) {
pva_kmd_log_err("Invalid resource capacity");
err = PVA_BAD_PARAMETER_ERROR;
goto err_out;
}
/* Init resource table for this context */ /* Init resource table for this context */
err = pva_kmd_resource_table_init(&ctx->ctx_resource_table, ctx->pva, err = pva_kmd_resource_table_init(&ctx->ctx_resource_table, ctx->pva,
ctx->smmu_ctx_id, res_table_capacity, ctx->smmu_ctx_id, res_table_capacity);
PVA_KMD_MAX_NUM_USER_DMA_CONFIG);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto drop_device; goto err_out;
} }
/* Init privileged queue for this context */ /* Init privileged queue for this context */
@@ -225,7 +197,8 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
/* Allocate memory for submission */ /* Allocate memory for submission */
chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size( chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
PVA_MAX_CMDBUF_CHUNK_SIZE, PVA_KMD_MAX_NUM_PRIV_CHUNKS); pva_kmd_get_max_cmdbuf_chunk_size(ctx->pva),
PVA_KMD_MAX_NUM_PRIV_CHUNKS);
/* Allocate one post fence at the end. This memory will be added to /* Allocate one post fence at the end. This memory will be added to
* KMD's own resource table. We don't need to explicitly free it. It * KMD's own resource table. We don't need to explicitly free it. It
* will be freed after we drop the resource. */ * will be freed after we drop the resource. */
@@ -242,6 +215,8 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
ctx->submit_memory, ctx->submit_memory,
&ctx->submit_memory_resource_id); &ctx->submit_memory_resource_id);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
// Ownership of submit memory is transferred to KMD's resource table so
// if adding to resource table fails, we need to free it here.
pva_kmd_device_memory_free(ctx->submit_memory); pva_kmd_device_memory_free(ctx->submit_memory);
goto queue_deinit; goto queue_deinit;
} }
@@ -249,7 +224,8 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
/* Init chunk pool */ /* Init chunk pool */
err = pva_kmd_cmdbuf_chunk_pool_init( err = pva_kmd_cmdbuf_chunk_pool_init(
&ctx->chunk_pool, ctx->submit_memory_resource_id, &ctx->chunk_pool, ctx->submit_memory_resource_id,
0 /* offset */, chunk_mem_size, PVA_MAX_CMDBUF_CHUNK_SIZE, 0 /* offset */, chunk_mem_size,
pva_kmd_get_max_cmdbuf_chunk_size(ctx->pva),
PVA_KMD_MAX_NUM_PRIV_CHUNKS, ctx->submit_memory->va); PVA_KMD_MAX_NUM_PRIV_CHUNKS, ctx->submit_memory->va);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto free_dram_buffer_resource; goto free_dram_buffer_resource;
@@ -283,13 +259,15 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
pva_kmd_resource_table_lock, pva_kmd_resource_table_lock,
pva_kmd_resource_table_unlock); pva_kmd_resource_table_unlock);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto deinit_submitter; goto deinit_fw_context;
} }
ctx->inited = true; ctx->inited = true;
return PVA_SUCCESS; return PVA_SUCCESS;
deinit_fw_context:
notify_fw_context_deinit(ctx);
deinit_submitter: deinit_submitter:
pva_kmd_mutex_deinit(&ctx->chunk_pool_lock); pva_kmd_mutex_deinit(&ctx->chunk_pool_lock);
pva_kmd_mutex_deinit(&ctx->submit_lock); pva_kmd_mutex_deinit(&ctx->submit_lock);
@@ -298,12 +276,10 @@ free_dram_buffer_resource:
pva_kmd_drop_resource(&ctx->pva->dev_resource_table, pva_kmd_drop_resource(&ctx->pva->dev_resource_table,
ctx->submit_memory_resource_id); ctx->submit_memory_resource_id);
queue_deinit: queue_deinit:
pva_kmd_queue_deinit(&ctx->ctx_queue);
pva_kmd_device_memory_free(ctx->ctx_queue_mem); pva_kmd_device_memory_free(ctx->ctx_queue_mem);
deinit_table: deinit_table:
pva_kmd_resource_table_deinit(&ctx->ctx_resource_table); pva_kmd_resource_table_deinit(&ctx->ctx_resource_table);
drop_device: err_out:
pva_kmd_device_idle(ctx->pva);
return err; return err;
} }
@@ -312,25 +288,24 @@ void pva_kmd_context_deinit(struct pva_kmd_context *ctx)
enum pva_error err; enum pva_error err;
if (ctx->inited) { if (ctx->inited) {
if (!ctx->pva->recovery) {
err = notify_fw_context_deinit(ctx); err = notify_fw_context_deinit(ctx);
ASSERT(err == PVA_SUCCESS); if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to notify FW of context deinit");
} }
err = pva_kmd_shared_buffer_deinit(ctx->pva, ctx->ccq_id); err = pva_kmd_shared_buffer_deinit(ctx->pva, ctx->ccq_id);
ASSERT(err == PVA_SUCCESS); if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to deinit FW buffer");
}
pva_kmd_device_idle(ctx->pva);
pva_kmd_mutex_deinit(&ctx->submit_lock); pva_kmd_mutex_deinit(&ctx->submit_lock);
pva_kmd_mutex_deinit(&ctx->chunk_pool_lock); pva_kmd_mutex_deinit(&ctx->chunk_pool_lock);
pva_kmd_cmdbuf_chunk_pool_deinit(&ctx->chunk_pool); pva_kmd_cmdbuf_chunk_pool_deinit(&ctx->chunk_pool);
pva_kmd_drop_resource(&ctx->pva->dev_resource_table, pva_kmd_drop_resource(&ctx->pva->dev_resource_table,
ctx->submit_memory_resource_id); ctx->submit_memory_resource_id);
pva_kmd_queue_deinit(&ctx->ctx_queue);
pva_kmd_device_memory_free(ctx->ctx_queue_mem); pva_kmd_device_memory_free(ctx->ctx_queue_mem);
pva_kmd_resource_table_deinit(&ctx->ctx_resource_table); pva_kmd_resource_table_deinit(&ctx->ctx_resource_table);
pva_kmd_free_block(&ctx->pva->syncpt_allocator,
ctx->syncpt_block_index);
ctx->inited = false; ctx->inited = false;
} }
} }
@@ -345,14 +320,13 @@ static void pva_kmd_destroy_all_queues(struct pva_kmd_context *ctx)
pva_kmd_mutex_lock(&ctx->queue_allocator.allocator_lock); pva_kmd_mutex_lock(&ctx->queue_allocator.allocator_lock);
queue = pva_kmd_get_block_unsafe(&ctx->queue_allocator, queue = pva_kmd_get_block_unsafe(&ctx->queue_allocator,
queue_id); queue_id);
pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock);
if (queue != NULL) { if (queue != NULL) {
pva_kmd_mutex_unlock(
&ctx->queue_allocator.allocator_lock);
err = pva_kmd_queue_destroy(ctx, queue_id); err = pva_kmd_queue_destroy(ctx, queue_id);
ASSERT(err == PVA_SUCCESS); if (err != PVA_SUCCESS) {
} else { pva_kmd_log_err_u64(
pva_kmd_mutex_unlock( "Failed to destroy queue %d", queue_id);
&ctx->queue_allocator.allocator_lock); }
} }
} }
} }
@@ -363,11 +337,12 @@ void pva_kmd_context_destroy(struct pva_kmd_context *ctx)
pva_kmd_destroy_all_queues(ctx); pva_kmd_destroy_all_queues(ctx);
pva_kmd_context_deinit(ctx); pva_kmd_context_deinit(ctx);
pva_kmd_device_idle(ctx->pva);
pva_kmd_block_allocator_deinit(&ctx->queue_allocator); pva_kmd_block_allocator_deinit(&ctx->queue_allocator);
pva_kmd_free(ctx->queue_allocator_mem); pva_kmd_free(ctx->queue_allocator_mem);
pva_kmd_mutex_deinit(&ctx->ccq_lock); pva_kmd_mutex_deinit(&ctx->ccq_lock);
err = pva_kmd_free_block(&ctx->pva->context_allocator, ctx->ccq_id);
pva_kmd_mutex_deinit(&ctx->ocb_lock); pva_kmd_mutex_deinit(&ctx->ocb_lock);
err = pva_kmd_free_block(&ctx->pva->context_allocator, ctx->ccq_id);
ASSERT(err == PVA_SUCCESS); ASSERT(err == PVA_SUCCESS);
} }

View File

@@ -63,9 +63,6 @@ struct pva_kmd_context {
void *plat_data; void *plat_data;
uint64_t ccq_shm_handle; uint64_t ccq_shm_handle;
/** Index of block of syncpoints allocated for this context */
uint32_t syncpt_block_index;
uint32_t syncpt_ids[PVA_NUM_RW_SYNCPTS_PER_CONTEXT];
pva_kmd_mutex_t ocb_lock; pva_kmd_mutex_t ocb_lock;
}; };

View File

@@ -27,45 +27,6 @@ static uint64_t read_from_buffer_to_user(void *to, uint64_t count,
return count; return count;
} }
static enum pva_error
pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
uint32_t level)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_set_profiling_level *cmd;
uint32_t fence_val;
enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_set_profiling_level(cmd, level);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when setting profiling level");
goto err_out;
}
return PVA_SUCCESS;
err_out:
return err;
}
static int64_t profiling_level_read(struct pva_kmd_device *dev, void *file_data, static int64_t profiling_level_read(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset, uint8_t *out_buffer, uint64_t offset,
uint64_t size) uint64_t size)
@@ -118,92 +79,20 @@ static int64_t profiling_level_write(struct pva_kmd_device *dev,
"pva_kmd_device_busy failed when submitting set profiling level cmd"); "pva_kmd_device_busy failed when submitting set profiling level cmd");
return 0; return 0;
} }
err = pva_kmd_notify_fw_set_profiling_level(dev, value); err = pva_kmd_notify_fw_set_profiling_level(dev, value);
pva_kmd_device_idle(dev);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err( pva_kmd_log_err(
"Failed to notify FW about profiling level change"); "Failed to notify FW about profiling level change");
return 0; return 0;
} }
pva_kmd_device_idle(dev);
} }
return size; return size;
} }
void pva_kmd_debugfs_create_nodes(struct pva_kmd_device *pva)
{
static const char *vpu_ocd_names[NUM_VPU_BLOCKS] = { "ocd_vpu0_v3",
"ocd_vpu1_v3" };
struct pva_kmd_file_ops *profiling_fops;
pva_kmd_debugfs_create_bool(pva, "stats_enabled",
&pva->debugfs_context.stats_enable);
pva_kmd_debugfs_create_bool(pva, "vpu_debug",
&pva->debugfs_context.vpu_debug);
// Create profiling_level file operations
profiling_fops = &pva->debugfs_context.profiling_level_fops;
profiling_fops->read = profiling_level_read;
profiling_fops->write = profiling_level_write;
profiling_fops->open = NULL;
profiling_fops->release = NULL;
profiling_fops->pdev = pva;
pva_kmd_debugfs_create_file(pva, "profiling_level", profiling_fops);
pva->debugfs_context.vpu_fops.read = &get_vpu_stats;
pva->debugfs_context.vpu_fops.write = NULL;
pva->debugfs_context.vpu_fops.pdev = pva;
pva_kmd_debugfs_create_file(pva, "vpu_stats",
&pva->debugfs_context.vpu_fops);
for (uint32_t i = 0; i < NUM_VPU_BLOCKS; i++) {
pva->debugfs_context.vpu_ocd_fops[i].open =
&pva_kmd_vpu_ocd_open;
pva->debugfs_context.vpu_ocd_fops[i].release =
&pva_kmd_vpu_ocd_release;
pva->debugfs_context.vpu_ocd_fops[i].read =
&pva_kmd_vpu_ocd_read;
pva->debugfs_context.vpu_ocd_fops[i].write =
&pva_kmd_vpu_ocd_write;
pva->debugfs_context.vpu_ocd_fops[i].pdev = pva;
pva->debugfs_context.vpu_ocd_fops[i].file_data =
(void *)&pva->regspec.vpu_dbg_instr_reg_offset[i];
pva_kmd_debugfs_create_file(
pva, vpu_ocd_names[i],
&pva->debugfs_context.vpu_ocd_fops[i]);
}
pva->debugfs_context.allowlist_ena_fops.read =
&get_vpu_allowlist_enabled;
pva->debugfs_context.allowlist_ena_fops.write = &update_vpu_allowlist;
pva->debugfs_context.allowlist_ena_fops.pdev = pva;
pva_kmd_debugfs_create_file(pva, "vpu_app_authentication",
&pva->debugfs_context.allowlist_ena_fops);
pva->debugfs_context.allowlist_path_fops.read = &get_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.write =
&update_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.pdev = pva;
pva_kmd_debugfs_create_file(pva, "allowlist_path",
&pva->debugfs_context.allowlist_path_fops);
pva->debugfs_context.fw_debug_log_level_fops.write =
&update_fw_debug_log_level;
pva->debugfs_context.fw_debug_log_level_fops.read = NULL;
pva->debugfs_context.fw_debug_log_level_fops.pdev = pva;
pva_kmd_debugfs_create_file(
pva, "fw_debug_log_level",
&pva->debugfs_context.fw_debug_log_level_fops);
pva_kmd_device_init_profiler(pva);
pva_kmd_device_init_tegra_stats(pva);
}
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *pva)
{
pva_kmd_device_deinit_tegra_stats(pva);
pva_kmd_device_deinit_profiler(pva);
pva_kmd_debugfs_remove_nodes(pva);
}
static int64_t print_vpu_stats(struct pva_kmd_tegrastats *kmd_tegra_stats, static int64_t print_vpu_stats(struct pva_kmd_tegrastats *kmd_tegra_stats,
uint8_t *out_buffer, uint64_t offset, uint8_t *out_buffer, uint64_t offset,
uint64_t len) uint64_t len)
@@ -236,8 +125,9 @@ static int64_t print_vpu_stats(struct pva_kmd_tegrastats *kmd_tegra_stats,
formatted_len); formatted_len);
} }
int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data, static int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset, uint64_t size) uint8_t *out_buffer, uint64_t offset,
uint64_t size)
{ {
struct pva_kmd_tegrastats kmd_tegra_stats; struct pva_kmd_tegrastats kmd_tegra_stats;
@@ -251,9 +141,9 @@ int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
return print_vpu_stats(&kmd_tegra_stats, out_buffer, offset, size); return print_vpu_stats(&kmd_tegra_stats, out_buffer, offset, size);
} }
int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva, void *file_data, static int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva,
uint8_t *out_buffer, uint64_t offset, void *file_data, uint8_t *out_buffer,
uint64_t size) uint64_t offset, uint64_t size)
{ {
// 1 byte for '0' or '1' and another 1 byte for the Null character // 1 byte for '0' or '1' and another 1 byte for the Null character
char out_str[2]; char out_str[2];
@@ -267,7 +157,7 @@ int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva, void *file_data,
sizeof(out_str)); sizeof(out_str));
} }
int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data, static int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset, const uint8_t *in_buffer, uint64_t offset,
uint64_t size) uint64_t size)
{ {
@@ -302,9 +192,9 @@ int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
return size; return size;
} }
int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data, static int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva,
uint8_t *out_buffer, uint64_t offset, void *file_data, uint8_t *out_buffer,
uint64_t size) uint64_t offset, uint64_t size)
{ {
uint64_t len; uint64_t len;
pva_kmd_mutex_lock(&(pva->pva_auth->allow_list_lock)); pva_kmd_mutex_lock(&(pva->pva_auth->allow_list_lock));
@@ -317,13 +207,18 @@ int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
return len; return len;
} }
int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data, static int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva,
const uint8_t *in_buffer, uint64_t offset, void *file_data,
uint64_t size) const uint8_t *in_buffer,
uint64_t offset, uint64_t size)
{ {
char buffer[ALLOWLIST_FILE_LEN]; char buffer[ALLOWLIST_FILE_LEN];
unsigned long retval; unsigned long retval;
if (size == 0) {
return 0;
}
if (size > sizeof(buffer)) { if (size > sizeof(buffer)) {
pva_kmd_log_err_u64( pva_kmd_log_err_u64(
"Length of allowlist path is too long. It must be less than ", "Length of allowlist path is too long. It must be less than ",
@@ -338,7 +233,7 @@ int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
} }
//Replacing last character from new-line to null terminator //Replacing last character from new-line to null terminator
buffer[safe_subu64(size, 1u)] = '\0'; buffer[size - 1u] = '\0';
pva_kmd_mutex_lock(&(pva->pva_auth->allow_list_lock)); pva_kmd_mutex_lock(&(pva->pva_auth->allow_list_lock));
pva_kmd_update_allowlist_path(pva, buffer); pva_kmd_update_allowlist_path(pva, buffer);
@@ -347,9 +242,10 @@ int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
return size; return size;
} }
int64_t update_fw_debug_log_level(struct pva_kmd_device *pva, void *file_data, static int64_t update_fw_debug_log_level(struct pva_kmd_device *pva,
const uint8_t *in_buffer, uint64_t offset, void *file_data,
uint64_t size) const uint8_t *in_buffer,
uint64_t offset, uint64_t size)
{ {
uint32_t log_level; uint32_t log_level;
unsigned long retval; unsigned long retval;
@@ -387,10 +283,143 @@ int64_t update_fw_debug_log_level(struct pva_kmd_device *pva, void *file_data,
goto err_end; goto err_end;
} }
pva_kmd_notify_fw_set_debug_log_level(pva, log_level); err = pva_kmd_notify_fw_set_debug_log_level(pva, log_level);
pva_kmd_device_idle(pva); pva_kmd_device_idle(pva);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to notify FW about debug log level change");
}
} }
err_end: err_end:
return copy_size; return copy_size;
} }
static int64_t get_fw_debug_log_level(struct pva_kmd_device *dev,
void *file_data, uint8_t *out_buffer,
uint64_t offset, uint64_t size)
{
char print_buffer[64];
int formatted_len;
formatted_len = snprintf(print_buffer, sizeof(print_buffer), "%u\n",
dev->fw_debug_log_level);
if (formatted_len <= 0) {
return -1;
}
return read_from_buffer_to_user(out_buffer, size, offset, print_buffer,
(uint64_t)formatted_len);
}
enum pva_error pva_kmd_debugfs_create_nodes(struct pva_kmd_device *pva)
{
static const char *vpu_ocd_names[NUM_VPU_BLOCKS] = { "ocd_vpu0_v3",
"ocd_vpu1_v3" };
struct pva_kmd_file_ops *profiling_fops;
enum pva_error err;
pva_kmd_debugfs_create_bool(pva, "stats_enabled",
&pva->debugfs_context.stats_enable);
pva_kmd_debugfs_create_bool(pva, "vpu_debug",
&pva->debugfs_context.vpu_debug);
// Create profiling_level file operations
profiling_fops = &pva->debugfs_context.profiling_level_fops;
profiling_fops->read = profiling_level_read;
profiling_fops->write = profiling_level_write;
profiling_fops->open = NULL;
profiling_fops->release = NULL;
profiling_fops->pdev = pva;
err = pva_kmd_debugfs_create_file(pva, "profiling_level",
profiling_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create profiling_level debugfs file");
return err;
}
pva->debugfs_context.vpu_fops.read = &get_vpu_stats;
pva->debugfs_context.vpu_fops.write = NULL;
pva->debugfs_context.vpu_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(pva, "vpu_stats",
&pva->debugfs_context.vpu_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to create vpu_stats debugfs file");
return err;
}
for (uint32_t i = 0; i < NUM_VPU_BLOCKS; i++) {
pva->debugfs_context.vpu_ocd_fops[i].open =
&pva_kmd_vpu_ocd_open;
pva->debugfs_context.vpu_ocd_fops[i].release =
&pva_kmd_vpu_ocd_release;
pva->debugfs_context.vpu_ocd_fops[i].read =
&pva_kmd_vpu_ocd_read;
pva->debugfs_context.vpu_ocd_fops[i].write =
&pva_kmd_vpu_ocd_write;
pva->debugfs_context.vpu_ocd_fops[i].pdev = pva;
pva->debugfs_context.vpu_ocd_fops[i].file_data =
(void *)&pva->regspec.vpu_dbg_instr_reg_offset[i];
err = pva_kmd_debugfs_create_file(
pva, vpu_ocd_names[i],
&pva->debugfs_context.vpu_ocd_fops[i]);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create vpu_ocd debugfs file");
return err;
}
}
pva->debugfs_context.allowlist_ena_fops.read =
&get_vpu_allowlist_enabled;
pva->debugfs_context.allowlist_ena_fops.write = &update_vpu_allowlist;
pva->debugfs_context.allowlist_ena_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(
pva, "vpu_app_authentication",
&pva->debugfs_context.allowlist_ena_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create vpu_app_authentication debugfs file");
return err;
}
pva->debugfs_context.allowlist_path_fops.read = &get_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.write =
&update_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(
pva, "allowlist_path",
&pva->debugfs_context.allowlist_path_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to create allowlist_path debugfs file");
return err;
}
pva->debugfs_context.fw_debug_log_level_fops.write =
&update_fw_debug_log_level;
pva->debugfs_context.fw_debug_log_level_fops.read =
&get_fw_debug_log_level;
pva->debugfs_context.fw_debug_log_level_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(
pva, "fw_debug_log_level",
&pva->debugfs_context.fw_debug_log_level_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create fw_debug_log_level debugfs file");
return err;
}
pva_kmd_device_init_profiler(pva);
pva_kmd_device_init_tegra_stats(pva);
return PVA_SUCCESS;
}
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *pva)
{
pva_kmd_device_deinit_tegra_stats(pva);
pva_kmd_device_deinit_profiler(pva);
pva_kmd_debugfs_remove_nodes(pva);
}

View File

@@ -37,24 +37,7 @@ struct pva_kmd_debugfs_context {
struct pva_kmd_file_ops fw_debug_log_level_fops; struct pva_kmd_file_ops fw_debug_log_level_fops;
}; };
void pva_kmd_debugfs_create_nodes(struct pva_kmd_device *dev); enum pva_error pva_kmd_debugfs_create_nodes(struct pva_kmd_device *dev);
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *dev); void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *dev);
int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset, uint64_t size);
int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size);
int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size);
int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size);
int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size);
int64_t update_fw_debug_log_level(struct pva_kmd_device *dev, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size);
#endif //PVA_KMD_DEBUGFS_H #endif //PVA_KMD_DEBUGFS_H

View File

@@ -33,10 +33,11 @@
* Initialization through CCQ is only intended for KMD's own resource table (the * Initialization through CCQ is only intended for KMD's own resource table (the
* first resource table created). * first resource table created).
*/ */
void pva_kmd_send_resource_table_info_by_ccq( static enum pva_error pva_kmd_send_resource_table_info_by_ccq(
struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table) struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table)
{ {
enum pva_error err; enum pva_error err;
uint64_t addr = res_table->table_mem->iova; uint64_t addr = res_table->table_mem->iova;
uint32_t n_entries = res_table->n_entries; uint32_t n_entries = res_table->n_entries;
uint64_t ccq_entry = uint64_t ccq_entry =
@@ -51,8 +52,9 @@ void pva_kmd_send_resource_table_info_by_ccq(
err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry, err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
ASSERT(err == PVA_SUCCESS);
pva_kmd_mutex_unlock(&pva->ccq0_lock); pva_kmd_mutex_unlock(&pva->ccq0_lock);
return err;
} }
/** /**
@@ -61,7 +63,8 @@ void pva_kmd_send_resource_table_info_by_ccq(
* Initialization through CCQ is only intended for KMD's own queue (the first * Initialization through CCQ is only intended for KMD's own queue (the first
* queue created). * queue created).
*/ */
void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva, static enum pva_error
pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
struct pva_kmd_queue *queue) struct pva_kmd_queue *queue)
{ {
enum pva_error err; enum pva_error err;
@@ -78,8 +81,9 @@ void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry, err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
ASSERT(err == PVA_SUCCESS);
pva_kmd_mutex_unlock(&pva->ccq0_lock); pva_kmd_mutex_unlock(&pva->ccq0_lock);
return err;
} }
/** /**
@@ -113,13 +117,13 @@ static void pva_kmd_device_init_submission(struct pva_kmd_device *pva)
/* Init KMD's resource table */ /* Init KMD's resource table */
err = pva_kmd_resource_table_init(&pva->dev_resource_table, pva, err = pva_kmd_resource_table_init(&pva->dev_resource_table, pva,
PVA_R5_SMMU_CONTEXT_ID, PVA_R5_SMMU_CONTEXT_ID,
PVA_KMD_MAX_NUM_KMD_RESOURCES, PVA_KMD_MAX_NUM_KMD_RESOURCES);
PVA_KMD_MAX_NUM_KMD_DMA_CONFIGS);
ASSERT(err == PVA_SUCCESS); ASSERT(err == PVA_SUCCESS);
/* Allocate memory for submission*/ /* Allocate memory for submission*/
chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size( chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
PVA_MAX_CMDBUF_CHUNK_SIZE, PVA_KMD_MAX_NUM_KMD_CHUNKS); pva_kmd_get_max_cmdbuf_chunk_size(pva),
PVA_KMD_MAX_NUM_KMD_CHUNKS);
size = safe_addu64(chunk_mem_size, (uint64_t)sizeof(uint32_t)); size = safe_addu64(chunk_mem_size, (uint64_t)sizeof(uint32_t));
/* Allocate one post fence at the end. We don't need to free this memory /* Allocate one post fence at the end. We don't need to free this memory
@@ -138,7 +142,7 @@ static void pva_kmd_device_init_submission(struct pva_kmd_device *pva)
/* Init chunk pool */ /* Init chunk pool */
pva_kmd_cmdbuf_chunk_pool_init( pva_kmd_cmdbuf_chunk_pool_init(
&pva->chunk_pool, pva->submit_memory_resource_id, 0, &pva->chunk_pool, pva->submit_memory_resource_id, 0,
chunk_mem_size, PVA_MAX_CMDBUF_CHUNK_SIZE, chunk_mem_size, pva_kmd_get_max_cmdbuf_chunk_size(pva),
PVA_KMD_MAX_NUM_KMD_CHUNKS, pva->submit_memory->va); PVA_KMD_MAX_NUM_KMD_CHUNKS, pva->submit_memory->va);
/* Init fence */ /* Init fence */
@@ -167,21 +171,25 @@ static void pva_kmd_device_deinit_submission(struct pva_kmd_device *pva)
pva_kmd_drop_resource(&pva->dev_resource_table, pva_kmd_drop_resource(&pva->dev_resource_table,
pva->submit_memory_resource_id); pva->submit_memory_resource_id);
pva_kmd_resource_table_deinit(&pva->dev_resource_table); pva_kmd_resource_table_deinit(&pva->dev_resource_table);
pva_kmd_queue_deinit(&pva->dev_queue);
pva_kmd_device_memory_free(pva->queue_memory); pva_kmd_device_memory_free(pva->queue_memory);
} }
struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id, struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
uint32_t device_index, uint32_t device_index,
bool app_authenticate) bool app_authenticate,
bool test_mode)
{ {
struct pva_kmd_device *pva; struct pva_kmd_device *pva;
enum pva_error err; enum pva_error err;
uint32_t chunk_size;
uint32_t size; uint32_t size;
if (test_mode) {
pva_kmd_log_err("Test mode is enabled");
}
pva = pva_kmd_zalloc_nofail(sizeof(*pva)); pva = pva_kmd_zalloc_nofail(sizeof(*pva));
pva->test_mode = test_mode;
pva->device_index = device_index; pva->device_index = device_index;
pva->load_from_gsc = false; pva->load_from_gsc = false;
pva->is_hv_mode = true; pva->is_hv_mode = true;
@@ -211,13 +219,6 @@ struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
pva_kmd_device_plat_init(pva); pva_kmd_device_plat_init(pva);
chunk_size = safe_mulu32((uint32_t)sizeof(struct pva_syncpt_rw_info),
(uint32_t)PVA_NUM_RW_SYNCPTS_PER_CONTEXT);
err = pva_kmd_block_allocator_init(&pva->syncpt_allocator,
pva->syncpt_rw, 0, chunk_size,
PVA_MAX_NUM_USER_CONTEXTS);
ASSERT(err == PVA_SUCCESS);
pva_kmd_device_init_submission(pva); pva_kmd_device_init_submission(pva);
err = pva_kmd_init_vpu_app_auth(pva, app_authenticate); err = pva_kmd_init_vpu_app_auth(pva, app_authenticate);
@@ -257,7 +258,6 @@ void pva_kmd_device_destroy(struct pva_kmd_device *pva)
pva_kmd_wait_for_active_contexts(pva); pva_kmd_wait_for_active_contexts(pva);
pva_kmd_device_deinit_submission(pva); pva_kmd_device_deinit_submission(pva);
pva_kmd_device_plat_deinit(pva); pva_kmd_device_plat_deinit(pva);
pva_kmd_block_allocator_deinit(&pva->syncpt_allocator);
pva_kmd_block_allocator_deinit(&pva->context_allocator); pva_kmd_block_allocator_deinit(&pva->context_allocator);
pva_kmd_free(pva->context_mem); pva_kmd_free(pva->context_mem);
pva_kmd_mutex_deinit(&pva->ccq0_lock); pva_kmd_mutex_deinit(&pva->ccq0_lock);
@@ -266,44 +266,71 @@ void pva_kmd_device_destroy(struct pva_kmd_device *pva)
pva_kmd_free(pva); pva_kmd_free(pva);
} }
static enum pva_error static enum pva_error config_fw_by_cmds(struct pva_kmd_device *pva)
pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
uint32_t level)
{ {
struct pva_kmd_cmdbuf_builder builder; enum pva_error err = PVA_SUCCESS;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_set_profiling_level *cmd;
uint32_t fence_val;
enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder); err = pva_kmd_notify_fw_enable_profiling(pva);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto err_out;
} }
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd)); /* Set FW debug log level */
ASSERT(cmd != NULL); err = pva_kmd_notify_fw_set_debug_log_level(pva,
pva_kmd_set_cmd_set_profiling_level(cmd, level); pva->fw_debug_log_level);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto err_out;
} }
err = pva_kmd_submitter_wait(dev_submitter, fence_val, // If the user had set profiling level before power-on, send the update to FW
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, err = pva_kmd_notify_fw_set_profiling_level(
PVA_KMD_WAIT_FW_TIMEOUT_US); pva, pva->debugfs_context.profiling_level);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when setting profiling level");
goto err_out; goto err_out;
} }
return PVA_SUCCESS;
err_out: err_out:
return err; return err;
} }
enum pva_error pva_kmd_config_fw_after_boot(struct pva_kmd_device *pva)
{
enum pva_error err = PVA_SUCCESS;
/* Reset KMD queue */
pva->dev_queue.queue_header->cb_head = 0;
pva->dev_queue.queue_header->cb_tail = 0;
err = pva_kmd_send_resource_table_info_by_ccq(pva,
&pva->dev_resource_table);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_shared_buffer_init(pva, PVA_PRIV_CCQ_ID,
PVA_KMD_FW_BUF_ELEMENT_SIZE,
PVA_KMD_FW_PROFILING_BUF_NUM_ELEMENTS,
NULL, NULL);
if (err != PVA_SUCCESS) {
pva_kmd_log_err_u64(
"pva kmd buffer initialization failed for interface ",
PVA_PRIV_CCQ_ID);
goto err_out;
}
err = config_fw_by_cmds(pva);
if (err != PVA_SUCCESS) {
goto err_out;
}
err_out:
return err;
}
enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva) enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva)
{ {
enum pva_error err = PVA_SUCCESS; enum pva_error err = PVA_SUCCESS;
@@ -321,36 +348,26 @@ enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva)
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto poweroff; goto poweroff;
} }
/* Reset KMD queue */
pva->dev_queue.queue_header->cb_head = 0;
pva->dev_queue.queue_header->cb_tail = 0;
pva_kmd_send_resource_table_info_by_ccq( err = pva_kmd_config_fw_after_boot(pva);
pva, &pva->dev_resource_table);
pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
// TODO: need better error handling here
err = pva_kmd_shared_buffer_init(
pva, PVA_PRIV_CCQ_ID, PVA_KMD_FW_BUF_ELEMENT_SIZE,
PVA_KMD_FW_PROFILING_BUF_NUM_ELEMENTS, NULL, NULL);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err_u64(
"pva kmd buffer initialization failed for interface ",
PVA_PRIV_CCQ_ID);
goto deinit_fw; goto deinit_fw;
} }
pva_kmd_notify_fw_enable_profiling(pva); } else {
// Once firmware is aborted, we no longer allow incrementing PVA
/* Set FW debug log level */ // refcount. This makes sure refcount will eventually reach 0 and allow
pva_kmd_notify_fw_set_debug_log_level(pva, // device to be powered off.
pva->fw_debug_log_level); if (pva->recovery) {
pva_kmd_log_err_u64(
// If the user had set profiling level before power-on, send the update to FW "PVA firmware aborted. "
pva_kmd_notify_fw_set_profiling_level( "Waiting for active PVA uses to finish. Remaining",
pva, pva->debugfs_context.profiling_level); pva->refcount);
err = PVA_ERR_FW_ABORTED;
goto unlock;
}
} }
pva->refcount = safe_addu32(pva->refcount, 1U);
pva->refcount = safe_addu32(pva->refcount, 1U);
pva_kmd_mutex_unlock(&pva->powercycle_lock); pva_kmd_mutex_unlock(&pva->powercycle_lock);
return PVA_SUCCESS; return PVA_SUCCESS;
@@ -371,15 +388,15 @@ void pva_kmd_device_idle(struct pva_kmd_device *pva)
ASSERT(pva->refcount > 0); ASSERT(pva->refcount > 0);
pva->refcount--; pva->refcount--;
if (pva->refcount == 0) { if (pva->refcount == 0) {
if (!pva->recovery) { err = pva_kmd_notify_fw_disable_profiling(pva);
/* Disable FW profiling */ if (err != PVA_SUCCESS) {
/* TODO: once debugfs is up, move these calls */ pva_kmd_log_err(
pva_kmd_notify_fw_disable_profiling(pva); "pva_kmd_notify_fw_disable_profiling failed during device idle");
} }
// TOOD: need better error handling here
err = pva_kmd_shared_buffer_deinit(pva, PVA_PRIV_CCQ_ID); err = pva_kmd_shared_buffer_deinit(pva, PVA_PRIV_CCQ_ID);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err("pva_kmd_shared_buffer_deinit failed"); pva_kmd_log_err(
"pva_kmd_shared_buffer_deinit failed during device idle");
} }
pva_kmd_deinit_fw(pva); pva_kmd_deinit_fw(pva);
pva_kmd_power_off(pva); pva_kmd_power_off(pva);
@@ -397,9 +414,12 @@ enum pva_error pva_kmd_ccq_push_with_timeout(struct pva_kmd_device *pva,
if (timeout_us == 0) { if (timeout_us == 0) {
pva_kmd_log_err( pva_kmd_log_err(
"pva_kmd_ccq_push_with_timeout Timed out"); "pva_kmd_ccq_push_with_timeout Timed out");
pva_kmd_abort(pva); pva_kmd_abort_fw(pva);
return PVA_TIMEDOUT; return PVA_TIMEDOUT;
} }
if (pva->recovery) {
return PVA_ERR_FW_ABORTED;
}
pva_kmd_sleep_us(sleep_interval_us); pva_kmd_sleep_us(sleep_interval_us);
timeout_us = sat_sub64(timeout_us, sleep_interval_us); timeout_us = sat_sub64(timeout_us, sleep_interval_us);
} }


@@ -4,6 +4,7 @@
 #ifndef PVA_KMD_DEVICE_H
 #define PVA_KMD_DEVICE_H
 #include "pva_constants.h"
+#include "pva_fw.h"
 #include "pva_kmd_cmdbuf.h"
 #include "pva_kmd_utils.h"
 #include "pva_kmd_mutex.h"
@@ -26,9 +27,6 @@
 #define NV_PVA1_CLASS_ID 0xF2
 struct pva_syncpt_rw_info {
-	/** Dont switch order since syncpt_id and syncpt_iova is prefilled during kmd boot
-	 * and first field gets updated by pva_kmd_allocator everytime its freed */
-	uint32_t syncpt_value;
 	uint32_t syncpt_id;
 	uint64_t syncpt_iova;
 };
@@ -127,12 +125,13 @@ struct pva_kmd_device {
 	uint8_t bl_sector_pack_format;
 	/** Offset between 2 syncpoints */
-	uint32_t syncpt_offset;
-	uint64_t syncpt_ro_iova;
-	uint64_t syncpt_rw_iova;
-	uint32_t num_syncpts;
-	struct pva_syncpt_rw_info syncpt_rw[PVA_NUM_RW_SYNCPTS];
-	struct pva_kmd_block_allocator syncpt_allocator;
+	uint32_t syncpt_page_size;
+	uint64_t ro_syncpt_base_iova;
+	uint32_t num_ro_syncpts;
+	uint64_t rw_syncpt_base_iova;
+	uint32_t rw_syncpt_region_size;
+	struct pva_syncpt_rw_info rw_syncpts[PVA_NUM_RW_SYNCPTS];
 	struct vmem_region *vmem_regions_tab;
 	bool support_hwseq_frame_linking;
@@ -145,11 +144,14 @@ struct pva_kmd_device {
 	/** Carveout info for FW */
 	struct pva_co_info fw_carveout;
+	bool test_mode;
 };
 struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
 					     uint32_t device_index,
-					     bool app_authenticate);
+					     bool app_authenticate,
+					     bool test_mode);
 void pva_kmd_device_destroy(struct pva_kmd_device *pva);
@@ -161,11 +163,7 @@ enum pva_error pva_kmd_ccq_push_with_timeout(struct pva_kmd_device *pva,
 					     uint64_t sleep_interval_us,
 					     uint64_t timeout_us);
-void pva_kmd_send_resource_table_info_by_ccq(
-	struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table);
-void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
-				    struct pva_kmd_queue *queue);
+enum pva_error pva_kmd_config_fw_after_boot(struct pva_kmd_device *pva);
 bool pva_kmd_device_maybe_on(struct pva_kmd_device *pva);
@@ -177,4 +175,14 @@ static inline uint32_t pva_kmd_get_device_class_id(struct pva_kmd_device *pva)
 		return NV_PVA1_CLASS_ID;
 	}
 }
+static inline uint16_t
+pva_kmd_get_max_cmdbuf_chunk_size(struct pva_kmd_device *pva)
+{
+	if (pva->test_mode) {
+		return PVA_TEST_MODE_MAX_CMDBUF_CHUNK_SIZE;
+	} else {
+		return PVA_MAX_CMDBUF_CHUNK_SIZE;
+	}
+}
 #endif // PVA_KMD_DEVICE_H
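
For illustration only: a minimal sketch of how the test-mode-aware chunk size above is meant to feed the KMD submission pool sizing, using helpers that appear elsewhere in this diff (pva_kmd_cmdbuf_pool_get_required_mem_size, safe_addu64). The function name kmd_submit_mem_size is hypothetical and not part of the driver.

/* Hypothetical sketch: size the KMD command buffer pool with the
 * test-mode-aware chunk size (1K chunks in MODS/test mode) instead of a
 * fixed PVA_MAX_CMDBUF_CHUNK_SIZE. */
static uint64_t kmd_submit_mem_size(struct pva_kmd_device *pva)
{
	uint16_t chunk_size = pva_kmd_get_max_cmdbuf_chunk_size(pva);
	uint64_t chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
		chunk_size, PVA_KMD_MAX_NUM_KMD_CHUNKS);

	/* Room for one post fence at the end, as in
	 * pva_kmd_device_init_submission(). */
	return safe_addu64(chunk_mem_size, (uint64_t)sizeof(uint32_t));
}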


@@ -0,0 +1,266 @@
// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_devmem_pool.h"
#include "pva_kmd_utils.h"
#include "pva_api.h"
#include "pva_utils.h"
static uint64_t get_devmem_offset(struct pva_kmd_devmem_element const *devmem)
{
return (uint64_t)safe_mulu32(devmem->ele_idx,
devmem->segment->owner_pool->element_size);
}
uint64_t pva_kmd_get_devmem_iova(struct pva_kmd_devmem_element const *devmem)
{
return safe_addu64(devmem->segment->mem->iova,
get_devmem_offset(devmem));
}
void *pva_kmd_get_devmem_va(struct pva_kmd_devmem_element const *devmem)
{
return pva_offset_pointer(devmem->segment->mem->va,
get_devmem_offset(devmem));
}
static struct pva_kmd_devmem_pool_segment *
allocate_segment(struct pva_kmd_devmem_pool *pool)
{
struct pva_kmd_devmem_pool_segment *segment;
struct pva_kmd_device_memory *mem = NULL;
uint64_t segment_size = safe_mulu64((uint64_t)pool->element_size,
(uint64_t)pool->n_element_incr);
void *va;
enum pva_error err;
/* Allocate the segment structure */
segment = pva_kmd_zalloc(sizeof(*segment));
if (segment == NULL) {
goto err_out;
}
/* Allocate device memory */
mem = pva_kmd_device_memory_alloc_map(
segment_size, pool->pva, PVA_ACCESS_RW, pool->smmu_ctx_idx);
if (mem == NULL) {
goto free_segment;
}
segment->mem = mem;
segment->owner_pool = pool;
segment->n_free_ele =
pool->n_element_incr; /* Initialize all elements as free */
va = mem->va;
/* Initialize the segment allocator */
err = pva_kmd_block_allocator_init(&segment->elem_allocator, va, 0,
pool->element_size,
pool->n_element_incr);
if (err != PVA_SUCCESS) {
goto free_mem;
}
/* Add segment to the pool */
segment->next = pool->segment_list_head;
pool->segment_list_head = segment;
pool->n_free_element =
safe_addu32(pool->n_free_element, pool->n_element_incr);
return segment;
free_mem:
pva_kmd_device_memory_free(mem);
free_segment:
pva_kmd_free(segment);
err_out:
return NULL;
}
enum pva_error pva_kmd_devmem_pool_init(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_device *pva,
uint8_t smmu_ctx_idx,
uint32_t element_size,
uint32_t ele_incr_count)
{
struct pva_kmd_devmem_pool_segment *segment;
enum pva_error err = PVA_SUCCESS;
/* Initialize the pool structure */
memset(pool, 0, sizeof(*pool));
pool->smmu_ctx_idx = smmu_ctx_idx;
pool->element_size =
safe_pow2_roundup_u32(element_size, sizeof(uint64_t));
pool->n_element_incr = ele_incr_count;
pool->n_free_element = 0;
pool->segment_list_head = NULL;
pool->pva = pva;
err = pva_kmd_mutex_init(&pool->pool_lock);
if (err != PVA_SUCCESS) {
goto err_out;
}
/* Allocate the first segment */
segment = allocate_segment(pool);
if (segment == NULL) {
err = PVA_NOMEM;
goto deinit_mutex;
}
return PVA_SUCCESS;
deinit_mutex:
pva_kmd_mutex_deinit(&pool->pool_lock);
err_out:
return err;
}
static enum pva_error
pva_kmd_devmem_pool_alloc(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_element *devmem)
{
struct pva_kmd_devmem_pool_segment *segment = NULL;
struct pva_kmd_devmem_pool_segment *new_segment = NULL;
uint32_t ele_idx = (uint32_t)-1;
enum pva_error err = PVA_SUCCESS;
pva_kmd_mutex_lock(&pool->pool_lock);
/* Check if we have any free elements */
if (pool->n_free_element == 0) {
/* Need to allocate a new segment */
new_segment = allocate_segment(pool);
if (new_segment == NULL) {
err = PVA_NOMEM;
goto unlock;
}
}
/* Try to find a free element in the pool */
segment = pool->segment_list_head;
while (segment != NULL) {
void *va = NULL;
va = pva_kmd_alloc_block_unsafe(&segment->elem_allocator,
&ele_idx);
if (va != NULL) {
/* Found a free element */
break;
}
segment = segment->next;
}
ASSERT(segment != NULL);
devmem->segment = segment;
devmem->ele_idx = ele_idx;
pool->n_free_element = safe_subu32(pool->n_free_element, 1);
segment->n_free_ele = safe_subu32(segment->n_free_ele, 1);
unlock:
pva_kmd_mutex_unlock(&pool->pool_lock);
return err;
}
enum pva_error pva_kmd_devmem_pool_zalloc(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_element *devmem)
{
enum pva_error err = pva_kmd_devmem_pool_alloc(pool, devmem);
if (err != PVA_SUCCESS) {
return err;
}
memset(pva_kmd_get_devmem_va(devmem), 0, pool->element_size);
return PVA_SUCCESS;
}
static void free_segment(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_pool_segment *target_segment)
{
struct pva_kmd_devmem_pool_segment *segment;
struct pva_kmd_devmem_pool_segment *prev_segment = NULL;
/* Find previous segment to update the linked list */
segment = pool->segment_list_head;
while (segment != NULL && segment != target_segment) {
prev_segment = segment;
segment = segment->next;
}
/* Segment not found in the list */
ASSERT(segment != NULL);
/* Remove this segment from the list */
if (prev_segment == NULL) {
/* This is the head segment */
pool->segment_list_head = target_segment->next;
} else {
prev_segment->next = target_segment->next;
}
/* Free the segment allocator */
pva_kmd_block_allocator_deinit(&target_segment->elem_allocator);
/* Free the device memory */
pva_kmd_device_memory_free(target_segment->mem);
/* Free the segment structure */
pva_kmd_free(target_segment);
/* Update the free element count */
pool->n_free_element =
safe_subu32(pool->n_free_element, pool->n_element_incr);
}
void pva_kmd_devmem_pool_free(struct pva_kmd_devmem_element *devmem)
{
struct pva_kmd_devmem_pool *pool = devmem->segment->owner_pool;
struct pva_kmd_devmem_pool_segment *current_segment = devmem->segment;
uint32_t threshold;
pva_kmd_mutex_lock(&pool->pool_lock);
/* Free the element */
pva_kmd_free_block_unsafe(&current_segment->elem_allocator,
devmem->ele_idx);
pool->n_free_element = safe_addu32(pool->n_free_element, 1);
current_segment->n_free_ele =
safe_addu32(current_segment->n_free_ele, 1);
/* Check if the current segment is now empty using n_free_ele counter */
if (current_segment->n_free_ele ==
current_segment->elem_allocator.max_num_blocks) {
/* We only free the segment if we still have n_ele_incr free elements
after the free */
threshold = safe_mulu32(pool->n_element_incr, 2);
if (pool->n_free_element >= threshold) {
free_segment(pool, current_segment);
}
}
pva_kmd_mutex_unlock(&pool->pool_lock);
}
void pva_kmd_devmem_pool_deinit(struct pva_kmd_devmem_pool *pool)
{
struct pva_kmd_devmem_pool_segment *segment = pool->segment_list_head;
struct pva_kmd_devmem_pool_segment *next;
/* Free all segments */
while (segment != NULL) {
next = segment->next;
/* Free the segment allocator */
pva_kmd_block_allocator_deinit(&segment->elem_allocator);
/* Free the device memory */
pva_kmd_device_memory_free(segment->mem);
/* Free the segment structure */
pva_kmd_free(segment);
segment = next;
}
pool->segment_list_head = NULL;
pva_kmd_mutex_deinit(&pool->pool_lock);
}


@@ -0,0 +1,100 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#ifndef PVA_KMD_DEVMEM_POOL_H
#define PVA_KMD_DEVMEM_POOL_H
#include "pva_api.h"
#include "pva_kmd_block_allocator.h"
#include "pva_kmd_device_memory.h"
/** @brief A segment of a device memory pool.
*
* It holds a fixed size array of device memory blocks. A pool is a linked list
* of segments.
*/
struct pva_kmd_devmem_pool_segment {
/** The owner pool. */
struct pva_kmd_devmem_pool *owner_pool;
/** The next segment in the pool. */
struct pva_kmd_devmem_pool_segment *next;
/** The device memory for the segment. */
struct pva_kmd_device_memory *mem;
/** The allocator for the elements in the segment. */
struct pva_kmd_block_allocator elem_allocator;
/** The number of free elements in the segment. */
uint32_t n_free_ele;
};
/** @brief A device memory pool that holds fixed size elements.
*
* It allocates memory in segments, each segment contains n_element_incr
* elements.
* - element_size will be rounded up to the nearest 8 bytes for alignment.
* - The pool is initialized with element_size * n_element_incr capacity.
* - Once exhausted, the pool will allocate a new segment of memory and increase
* the capacity by n_element_incr.
* - When an element is freed, the pool does not immediately release the whole
* segment even if the whole segment is empty. However, if there are 2 *
* n_element_incr free elements, the pool will release a whole segment, so
* that there's still at least n_element_incr free elements.
* - The pool is thread safe.
*/
struct pva_kmd_devmem_pool {
/** The SMMU context index for the pool. */
uint8_t smmu_ctx_idx;
/** The size of each element in the pool. */
uint32_t element_size;
/** The number of elements to allocate in each segment. */
uint32_t n_element_incr;
/** The total number of free elements in the pool, across all segments. */
uint32_t n_free_element;
/** The head of the segment list. */
struct pva_kmd_devmem_pool_segment *segment_list_head;
/** The PVA device. */
struct pva_kmd_device *pva;
/** The mutex for the pool. */
pva_kmd_mutex_t pool_lock;
};
/** @brief Device memory from a pool.
*
* It is an element in a segment of a pool.
*/
struct pva_kmd_devmem_element {
/** The segment that contains the element. */
struct pva_kmd_devmem_pool_segment *segment;
/** The index of the element in the segment. */
uint32_t ele_idx;
};
/** @brief Get the IOVA of a device memory element. */
uint64_t pva_kmd_get_devmem_iova(struct pva_kmd_devmem_element const *devmem);
/** @brief Get the virtual address of a device memory element. */
void *pva_kmd_get_devmem_va(struct pva_kmd_devmem_element const *devmem);
/** @brief Initialize a device memory pool.
*
* @param pool The device memory pool to initialize.
* @param pva The PVA device.
* @param smmu_ctx_idx The SMMU context index for the pool.
* @param element_size The size of each element in the pool.
* @param ele_incr_count The number of elements to allocate in each segment.
*/
enum pva_error pva_kmd_devmem_pool_init(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_device *pva,
uint8_t smmu_ctx_idx,
uint32_t element_size,
uint32_t ele_incr_count);
/** @brief Allocate a device memory element from a pool and zero-initialize it. */
enum pva_error
pva_kmd_devmem_pool_zalloc(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_element *devmem);
/** @brief Free a device memory element from a pool. */
void pva_kmd_devmem_pool_free(struct pva_kmd_devmem_element *devmem);
/** @brief Deinitialize a device memory pool. */
void pva_kmd_devmem_pool_deinit(struct pva_kmd_devmem_pool *pool);
#endif
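
A hypothetical usage sketch of the pool API declared above, assuming a valid struct pva_kmd_device and illustrative element/batch sizes; it is not taken from the driver sources.

/* Hypothetical example: create a pool of 64-byte elements that grows and
 * shrinks in batches of 32, allocate one zeroed element, then tear down. */
static enum pva_error devmem_pool_example(struct pva_kmd_device *pva)
{
	struct pva_kmd_devmem_pool pool;
	struct pva_kmd_devmem_element elem;
	enum pva_error err;

	err = pva_kmd_devmem_pool_init(&pool, pva, PVA_R5_SMMU_CONTEXT_ID,
				       64U, 32U);
	if (err != PVA_SUCCESS) {
		return err;
	}

	/* Grows by one segment automatically when no free element is left. */
	err = pva_kmd_devmem_pool_zalloc(&pool, &elem);
	if (err == PVA_SUCCESS) {
		/* Program hardware with the IOVA, fill the element via the VA. */
		uint64_t iova = pva_kmd_get_devmem_iova(&elem);
		void *va = pva_kmd_get_devmem_va(&elem);

		(void)iova;
		(void)va;
		pva_kmd_devmem_pool_free(&elem);
	}

	pva_kmd_devmem_pool_deinit(&pool);
	return err;
}

With ele_incr_count == 32, a fully free segment is only released once at least 64 elements are free pool-wide, so one segment's worth of free elements always survives the release.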


@@ -62,42 +62,41 @@ pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
struct pva_dma_config dma_config; struct pva_dma_config dma_config;
struct pva_fw_dma_slot *dyn_slots; struct pva_fw_dma_slot *dyn_slots;
struct pva_fw_dma_reloc *dyn_relocs; struct pva_fw_dma_reloc *dyn_relocs;
struct pva_fw_dma_slot *static_slots = dma_aux->static_slots; struct pva_kmd_dma_scratch_buffer *scratch_buf;
struct pva_fw_dma_reloc *static_relocs = dma_aux->static_relocs;
struct pva_kmd_dma_access *access_sizes = dma_aux->access_sizes;
// Mapping descriptor index to channel index // Mapping descriptor index to channel index
uint8_t desc_to_ch[PVA_MAX_NUM_DMA_DESC]; uint8_t desc_to_ch[PVA_MAX_NUM_DMA_DESC];
scratch_buf = pva_kmd_zalloc(sizeof(*scratch_buf));
if (scratch_buf == NULL) {
err = PVA_NOMEM;
goto err_out;
}
for (uint32_t i = 0; i < PVA_MAX_NUM_DMA_DESC; i++) { for (uint32_t i = 0; i < PVA_MAX_NUM_DMA_DESC; i++) {
desc_to_ch[i] = PVA_KMD_INVALID_CH_IDX; desc_to_ch[i] = PVA_KMD_INVALID_CH_IDX;
} }
//set access_sizes to 0 by default
(void)memset(
access_sizes, 0,
(PVA_MAX_NUM_DMA_DESC * sizeof(struct pva_kmd_dma_access)));
err = pva_kmd_parse_dma_config(dma_cfg_hdr, dma_config_size, err = pva_kmd_parse_dma_config(dma_cfg_hdr, dma_config_size,
&dma_config, &dma_config,
&resource_table->pva->hw_consts); &resource_table->pva->hw_consts);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto free_scratch_buf;
} }
err = pva_kmd_validate_dma_config(&dma_config, err = pva_kmd_validate_dma_config(&dma_config,
&resource_table->pva->hw_consts, &resource_table->pva->hw_consts,
access_sizes, scratch_buf->access_sizes,
dma_aux->hw_dma_descs_mask); scratch_buf->hw_dma_descs_mask);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto free_scratch_buf;
} }
trace_dma_channels(&dma_config, desc_to_ch); trace_dma_channels(&dma_config, desc_to_ch);
err = pva_kmd_compute_dma_access(&dma_config, access_sizes, err = pva_kmd_compute_dma_access(&dma_config, scratch_buf->access_sizes,
dma_aux->hw_dma_descs_mask); scratch_buf->hw_dma_descs_mask);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto free_scratch_buf;
} }
dyn_slots = pva_offset_pointer(fw_dma_cfg, dyn_slots = pva_offset_pointer(fw_dma_cfg,
@@ -107,9 +106,10 @@ pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
dma_config.header.num_dynamic_slots * dma_config.header.num_dynamic_slots *
sizeof(*dyn_slots)); sizeof(*dyn_slots));
pva_kmd_collect_relocs(&dma_config, access_sizes, static_slots, pva_kmd_collect_relocs(&dma_config, scratch_buf->access_sizes,
scratch_buf->static_slots,
dma_config.header.num_static_slots, dma_config.header.num_static_slots,
static_relocs, dyn_slots, scratch_buf->static_relocs, dyn_slots,
dma_config.header.num_dynamic_slots, dyn_relocs, dma_config.header.num_dynamic_slots, dyn_relocs,
desc_to_ch); desc_to_ch);
@@ -117,26 +117,27 @@ pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
&dma_config, fw_dma_cfg, &fw_fetch_size, &dma_config, fw_dma_cfg, &fw_fetch_size,
resource_table->pva->support_hwseq_frame_linking); resource_table->pva->support_hwseq_frame_linking);
dma_aux->res_table = resource_table;
err = pva_kmd_dma_use_resources(&dma_config, dma_aux); err = pva_kmd_dma_use_resources(&dma_config, dma_aux);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto free_scratch_buf;
} }
err = pva_kmd_bind_static_buffers(fw_dma_cfg, dma_aux, static_slots, err = pva_kmd_bind_static_buffers(
dma_config.header.num_static_slots, fw_dma_cfg, dma_aux, scratch_buf->static_slots,
static_relocs, dma_config.header.num_static_slots, scratch_buf->static_relocs,
dma_config.static_bindings, dma_config.static_bindings, dma_config.header.num_static_slots);
dma_config.header.num_static_slots);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto drop_res; goto drop_res;
} }
*out_fw_fetch_size = fw_fetch_size; *out_fw_fetch_size = fw_fetch_size;
pva_kmd_free(scratch_buf);
return PVA_SUCCESS; return PVA_SUCCESS;
drop_res: drop_res:
pva_kmd_unload_dma_config_unsafe(dma_aux); pva_kmd_unload_dma_config_unsafe(dma_aux);
free_scratch_buf:
pva_kmd_free(scratch_buf);
err_out: err_out:
return err; return err;
} }


@@ -50,9 +50,10 @@ struct pva_kmd_dma_resource_aux {
 	uint32_t dram_res_count;
 	/** DRAM buffers statically referenced by the DMA configuration */
 	uint32_t static_dram_res_ids[PVA_KMD_MAX_NUM_DMA_DRAM_SLOTS];
+};

-	/* Below are work buffers need during DMA configuration loading. They
-	 * don't fit on stack. */
+/* Scratch buffers needed during DMA configuration loading. They don't fit on stack. */
+struct pva_kmd_dma_scratch_buffer {
 	struct pva_fw_dma_slot static_slots[PVA_KMD_MAX_NUM_DMA_SLOTS];
 	struct pva_fw_dma_reloc static_relocs[PVA_KMD_MAX_NUM_DMA_SLOTS];
 	struct pva_kmd_dma_access access_sizes[PVA_MAX_NUM_DMA_DESC];


@@ -94,6 +94,9 @@ bind_static_dram_slot(struct pva_dma_config_resource *dma_config,
 	int64_t slot_access_end_addr = 0LL;
 	uint64_t slot_surface_combined_offset = 0ULL;
 	pva_math_error math_error = MATH_OP_SUCCESS;
+	uint8_t slot_access_flags =
+		PVA_EXTRACT16(slot->flags, PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
+			      PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB, uint8_t);
 	if ((slot->flags & PVA_FW_DMA_SLOT_FLAG_DRAM) == 0) {
 		pva_kmd_log_err("Binding DRAM buffer to incompatible slot");
@@ -101,6 +104,14 @@ bind_static_dram_slot(struct pva_dma_config_resource *dma_config,
 		goto out;
 	}
+	if ((slot_access_flags & dram_res->mem->iova_access_flags) !=
+	    slot_access_flags) {
+		pva_kmd_log_err(
+			"DRAM buffer does not have the required access permissions");
+		err = PVA_INVALID_BINDING;
+		goto out;
+	}
 	if (is_block_linear) {
 		if (slot->flags & PVA_FW_DMA_SLOT_FLAG_CB) {
 			pva_kmd_log_err(
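
A condensed, hypothetical restatement of the access handshake above, assuming PVA_ACCESS_* values are bit flags and that the slot access bits are packed with PVA_INSERT/PVA_EXTRACT16 exactly as in this diff; check_slot_access is not a real driver function.

/* Hypothetical helper mirroring the check added above: a binding is rejected
 * when the buffer's IOVA mapping lacks a permission the slot requires. */
static enum pva_error check_slot_access(uint16_t slot_flags,
					uint8_t buffer_access_flags)
{
	uint8_t required = PVA_EXTRACT16(slot_flags,
					 PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
					 PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB,
					 uint8_t);

	if ((required & buffer_access_flags) != required) {
		return PVA_INVALID_BINDING;
	}
	return PVA_SUCCESS;
}

Source slots carry PVA_ACCESS_RO and destination slots PVA_ACCESS_WO (see get_slot_flag further down), so a buffer registered as read-only can no longer be bound as a DMA destination.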


@@ -218,7 +218,7 @@ validate_descriptor(const struct pva_dma_descriptor *desc,
 	/* DMA_DESC_LDID */
 	if ((desc->link_desc_id > cfg_hdr->num_descriptors) ||
 	    ((desc->link_desc_id != 0) &&
-	     pva_is_reserved_desc(desc->link_desc_id - PVA_DMA_DESC0))) {
+	     pva_is_reserved_desc(desc->link_desc_id - PVA_DMA_DESC_ID_BASE))) {
 		pva_kmd_log_err("ERR: Invalid linker Desc ID");
 		return PVA_INVAL;
 	}
@@ -423,6 +423,8 @@ pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
err = PVA_INVAL; err = PVA_INVAL;
goto err_out; goto err_out;
} }
dma_aux->vpu_bin_res_id = dma_cfg->header.vpu_exec_resource_id;
if (vpu_bin_rec->type != PVA_RESOURCE_TYPE_EXEC_BIN) { if (vpu_bin_rec->type != PVA_RESOURCE_TYPE_EXEC_BIN) {
pva_kmd_log_err( pva_kmd_log_err(
"Invalid VPU exec resource id used by DMA config"); "Invalid VPU exec resource id used by DMA config");
@@ -432,9 +434,6 @@ pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
vpu_bin = &vpu_bin_rec->vpu_bin; vpu_bin = &vpu_bin_rec->vpu_bin;
} }
dma_aux->vpu_bin_res_id = dma_cfg->header.vpu_exec_resource_id;
dma_aux->dram_res_count = 0;
/* Increment reference count for all static DRAM buffers; For static /* Increment reference count for all static DRAM buffers; For static
* VMEM buffers, check that symbol ID is valid. */ * VMEM buffers, check that symbol ID is valid. */
for (i = 0; i < dma_cfg->header.num_static_slots; i++) { for (i = 0; i < dma_cfg->header.num_static_slots; i++) {
@@ -455,7 +454,8 @@ pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
dma_aux->static_dram_res_ids[dma_aux->dram_res_count] = dma_aux->static_dram_res_ids[dma_aux->dram_res_count] =
slot_buf->dram.resource_id; slot_buf->dram.resource_id;
dma_aux->dram_res_count += 1; dma_aux->dram_res_count =
safe_addu32(dma_aux->dram_res_count, 1U);
if (rec->type != PVA_RESOURCE_TYPE_DRAM) { if (rec->type != PVA_RESOURCE_TYPE_DRAM) {
pva_kmd_log_err( pva_kmd_log_err(
@@ -505,9 +505,10 @@ static uint16_t get_slot_id(uint16_t slot)
return slot & PVA_DMA_SLOT_ID_MASK; return slot & PVA_DMA_SLOT_ID_MASK;
} }
static uint8_t get_slot_flag(uint8_t transfer_mode, bool cb_enable) static uint16_t get_slot_flag(uint8_t transfer_mode, bool cb_enable,
bool is_dst)
{ {
uint8_t flags = 0; uint16_t flags = 0;
if (transfer_mode == PVA_DMA_TRANS_MODE_VMEM) { if (transfer_mode == PVA_DMA_TRANS_MODE_VMEM) {
flags |= PVA_FW_DMA_SLOT_FLAG_VMEM_DATA; flags |= PVA_FW_DMA_SLOT_FLAG_VMEM_DATA;
} else if (transfer_mode == PVA_DMA_TRANS_MODE_L2SRAM) { } else if (transfer_mode == PVA_DMA_TRANS_MODE_L2SRAM) {
@@ -521,6 +522,15 @@ static uint8_t get_slot_flag(uint8_t transfer_mode, bool cb_enable)
if (cb_enable) { if (cb_enable) {
flags |= PVA_FW_DMA_SLOT_FLAG_CB; flags |= PVA_FW_DMA_SLOT_FLAG_CB;
} }
if (is_dst) {
flags |= PVA_INSERT(PVA_ACCESS_WO,
PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);
} else {
flags |= PVA_INSERT(PVA_ACCESS_RO,
PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);
}
return flags; return flags;
} }
@@ -529,7 +539,7 @@ static void update_reloc_count(uint16_t slot, uint8_t transfer_mode,
struct pva_fw_dma_slot *out_static_slots, struct pva_fw_dma_slot *out_static_slots,
uint16_t num_static_slots, uint16_t num_static_slots,
struct pva_fw_dma_slot *out_dyn_slots, struct pva_fw_dma_slot *out_dyn_slots,
uint16_t num_dyn_slots) uint16_t num_dyn_slots, bool is_dst)
{ {
uint8_t slot_id = get_slot_id(slot); uint8_t slot_id = get_slot_id(slot);
@@ -537,13 +547,12 @@ static void update_reloc_count(uint16_t slot, uint8_t transfer_mode,
out_dyn_slots[slot_id].reloc_count = out_dyn_slots[slot_id].reloc_count =
safe_addu16(out_dyn_slots[slot_id].reloc_count, 1U); safe_addu16(out_dyn_slots[slot_id].reloc_count, 1U);
out_dyn_slots[slot_id].flags |= out_dyn_slots[slot_id].flags |=
get_slot_flag(transfer_mode, cb_enable); get_slot_flag(transfer_mode, cb_enable, is_dst);
} else if (slot & PVA_DMA_STATIC_SLOT) { } else if (slot & PVA_DMA_STATIC_SLOT) {
out_static_slots[slot_id].reloc_count = out_static_slots[slot_id].reloc_count =
safe_addu16(out_static_slots[slot_id].reloc_count, 1U); safe_addu16(out_static_slots[slot_id].reloc_count, 1U);
;
out_static_slots[slot_id].flags |= out_static_slots[slot_id].flags |=
get_slot_flag(transfer_mode, cb_enable); get_slot_flag(transfer_mode, cb_enable, is_dst);
} }
} }
@@ -567,17 +576,17 @@ static void count_relocs(struct pva_dma_config const *dma_cfg,
update_reloc_count(desc->src.slot, desc->src.transfer_mode, update_reloc_count(desc->src.slot, desc->src.transfer_mode,
desc->src.cb_enable, out_static_slots, desc->src.cb_enable, out_static_slots,
num_static_slots, out_dyn_slots, num_static_slots, out_dyn_slots,
num_dyn_slots); num_dyn_slots, false);
update_reloc_count(desc->dst.slot, desc->dst.transfer_mode, update_reloc_count(desc->dst.slot, desc->dst.transfer_mode,
desc->dst.cb_enable, out_static_slots, desc->dst.cb_enable, out_static_slots,
num_static_slots, out_dyn_slots, num_static_slots, out_dyn_slots,
num_dyn_slots); num_dyn_slots, true);
update_reloc_count(desc->dst2_slot, desc->dst.transfer_mode, update_reloc_count(desc->dst2_slot, desc->dst.transfer_mode,
desc->dst.cb_enable, out_static_slots, desc->dst.cb_enable, out_static_slots,
num_static_slots, out_dyn_slots, num_static_slots, out_dyn_slots,
num_dyn_slots); num_dyn_slots, true);
} }
} }
@@ -867,10 +876,6 @@ void pva_kmd_collect_relocs(struct pva_dma_config const *dma_cfg,
uint8_t static_reloc_off[PVA_MAX_NUM_DMA_DESC * 3]; uint8_t static_reloc_off[PVA_MAX_NUM_DMA_DESC * 3];
uint8_t dyn_reloc_off[PVA_MAX_NUM_DMA_DESC * 3]; uint8_t dyn_reloc_off[PVA_MAX_NUM_DMA_DESC * 3];
memset(out_static_slots, 0,
num_static_slots * sizeof(*out_static_slots));
memset(out_dyn_slots, 0, num_dyn_slots * sizeof(*out_dyn_slots));
/* First pass: count the number of relocates for each slot */ /* First pass: count the number of relocates for each slot */
count_relocs(dma_cfg, out_static_slots, num_static_slots, out_dyn_slots, count_relocs(dma_cfg, out_static_slots, num_static_slots, out_dyn_slots,
num_dyn_slots); num_dyn_slots);


@@ -16,42 +16,23 @@
enum pva_error pva_kmd_notify_fw_set_debug_log_level(struct pva_kmd_device *pva, enum pva_error pva_kmd_notify_fw_set_debug_log_level(struct pva_kmd_device *pva,
uint32_t log_level) uint32_t log_level)
{ {
struct pva_kmd_submitter *submitter = &pva->submitter; struct pva_cmd_set_debug_log_level cmd = { 0 };
struct pva_kmd_cmdbuf_builder builder; pva_kmd_set_cmd_set_debug_log_level(&cmd, log_level);
struct pva_cmd_set_debug_log_level *cmd;
uint32_t fence_val;
enum pva_error err;
err = pva_kmd_submitter_prepare(submitter, &builder); return pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_set_debug_log_level(cmd, log_level);
err = pva_kmd_submitter_submit(submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("set debug log level cmd submission failed");
goto cancel_builder;
}
err = pva_kmd_submitter_wait(submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when setting debug log level");
goto err_out;
} }
cancel_builder: enum pva_error pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
pva_kmd_cmdbuf_builder_cancel(&builder); uint32_t level)
{
struct pva_cmd_set_profiling_level cmd = { 0 };
pva_kmd_set_cmd_set_profiling_level(&cmd, level);
err_out: return pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
return err; PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
} }
void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer) void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer)
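
Several hunks in this commit replace the prepare/reserve/submit/wait boilerplate with a single pva_kmd_submit_cmd_sync() call. Its implementation is not part of this excerpt; the sketch below is only an assumption of its shape, inferred from the call sites and from the submitter APIs visible elsewhere in the diff.

/* Assumed sketch of pva_kmd_submit_cmd_sync(); the real helper lives
 * elsewhere in the tree and may differ. It copies one fixed-size command
 * into a fresh command buffer, submits it, and waits on the fence. */
enum pva_error pva_kmd_submit_cmd_sync(struct pva_kmd_submitter *submitter,
				       const void *cmd, uint32_t cmd_size,
				       uint64_t poll_interval_us,
				       uint64_t timeout_us)
{
	struct pva_kmd_cmdbuf_builder builder;
	void *space;
	uint32_t fence_val;
	enum pva_error err;

	err = pva_kmd_submitter_prepare(submitter, &builder);
	if (err != PVA_SUCCESS) {
		return err;
	}

	space = pva_kmd_reserve_cmd_space(&builder, cmd_size);
	if (space == NULL) {
		pva_kmd_cmdbuf_builder_cancel(&builder);
		return PVA_NOMEM;
	}
	memcpy(space, cmd, cmd_size);

	err = pva_kmd_submitter_submit(submitter, &builder, &fence_val);
	if (err != PVA_SUCCESS) {
		pva_kmd_cmdbuf_builder_cancel(&builder);
		return err;
	}

	return pva_kmd_submitter_wait(submitter, fence_val, poll_interval_us,
				      timeout_us);
}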


@@ -15,6 +15,9 @@ struct pva_kmd_fw_print_buffer {
 enum pva_error pva_kmd_notify_fw_set_debug_log_level(struct pva_kmd_device *pva,
 						      uint32_t log_level);
+enum pva_error pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
+						     uint32_t level);
 void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer);
 #endif // PVA_KMD_FW_DEBUG_H


@@ -10,6 +10,7 @@
 #include "pva_utils.h"
 #include "pva_kmd_fw_profiler.h"
 #include "pva_kmd_shared_buffer.h"
+#include "pva_api_private.h"
 // TODO: This is here temporarily just for testing. Should be moved to a common header
 #define CMD_ID(x) PVA_EXTRACT(x, 6, 0, uint8_t)
@@ -101,13 +102,11 @@ void pva_kmd_device_deinit_profiler(struct pva_kmd_device *pva)
enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva) enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
{ {
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter; struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_enable_fw_profiling *cmd; struct pva_cmd_enable_fw_profiling cmd = { 0 };
uint32_t filter = 0U; uint32_t filter = 0U;
uint8_t timestamp_type = TIMESTAMP_TYPE_CYCLE_COUNT; uint8_t timestamp_type = TIMESTAMP_TYPE_CYCLE_COUNT;
uint32_t fence_val; enum pva_error err = PVA_SUCCESS;
enum pva_error err;
struct pva_kmd_shared_buffer *profiling_buffer = struct pva_kmd_shared_buffer *profiling_buffer =
&pva->kmd_fw_buffers[PVA_PRIV_CCQ_ID]; &pva->kmd_fw_buffers[PVA_PRIV_CCQ_ID];
@@ -123,26 +122,14 @@ enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
return PVA_SUCCESS; return PVA_SUCCESS;
} }
err = pva_kmd_submitter_prepare(dev_submitter, &builder); pva_kmd_set_cmd_enable_fw_profiling(&cmd, filter, timestamp_type);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_enable_fw_profiling(cmd, filter, timestamp_type);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val); err = pva_kmd_submit_cmd_sync(dev_submitter, &cmd, sizeof(cmd),
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err( pva_kmd_log_err("Failed to submit command");
"Waiting for FW timed out when initializing context"); goto out;
goto err_out;
} }
pva->debugfs_context.g_fw_profiling_config.enabled = true; pva->debugfs_context.g_fw_profiling_config.enabled = true;
@@ -155,38 +142,22 @@ enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
8 : 8 :
4; 4;
return PVA_SUCCESS; out:
err_out:
return err; return err;
} }
enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva) enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva)
{ {
struct pva_kmd_cmdbuf_builder builder; struct pva_cmd_disable_fw_profiling cmd = { 0 };
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_disable_fw_profiling *cmd;
uint32_t fence_val;
enum pva_error err; enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder); pva_kmd_set_cmd_disable_fw_profiling(&cmd);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_disable_fw_profiling(cmd);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val); err = pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err( pva_kmd_log_err("Failed to submit command");
"Waiting for FW timed out when initializing context");
goto err_out; goto err_out;
} }
@@ -194,6 +165,7 @@ enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva)
pva->debugfs_context.g_fw_profiling_config.filter = 0x0; pva->debugfs_context.g_fw_profiling_config.filter = 0x0;
return PVA_SUCCESS; return PVA_SUCCESS;
err_out: err_out:
return err; return err;
} }


@@ -50,7 +50,7 @@ void pva_kmd_handle_hyp_msg(void *pva_dev, uint32_t const *data, uint8_t len)
 		memcpy(abort_msg + 2, &data[1], size);
 		abort_msg[PVA_FW_MSG_ABORT_STR_MAX_LEN] = '\0';
 		pva_kmd_log_err(abort_msg);
-		pva_kmd_abort(pva);
+		pva_kmd_abort_fw(pva);
 	} break;
 	case PVA_FW_MSG_TYPE_FLUSH_PRINT:
 		pva_kmd_drain_fw_print(&pva->fw_print_buffer);


@@ -86,7 +86,6 @@ pva_kmd_op_memory_register_async(struct pva_kmd_context *ctx,
err = PVA_NOMEM; err = PVA_NOMEM;
goto err_out; goto err_out;
} }
if (args->segment == PVA_MEMORY_SEGMENT_R5) { if (args->segment == PVA_MEMORY_SEGMENT_R5) {
smmu_ctx_id = PVA_R5_SMMU_CONTEXT_ID; smmu_ctx_id = PVA_R5_SMMU_CONTEXT_ID;
} else { } else {
@@ -168,8 +167,8 @@ static enum pva_error pva_kmd_op_executable_register_async(
 	}
 	args = (struct pva_ops_executable_register *)input_buffer;
-	if (args->exec_size + sizeof(struct pva_ops_executable_register) >
-	    size) {
+	if (args->exec_size >
+	    (size - sizeof(struct pva_ops_executable_register))) {
 		pva_kmd_log_err("Executable register payload size too small");
 		return PVA_INVAL;
 	}
@@ -404,8 +403,10 @@ exit_loop:
 	post_fence->flags |= PVA_FW_POSTFENCE_FLAGS_USER_FENCE;
 	submit_error = pva_kmd_submitter_submit_with_fence(
 		&ctx->submitter, &cmdbuf_builder, post_fence);
-	ASSERT(submit_error == PVA_SUCCESS);
+	if (err == PVA_SUCCESS) {
+		err = submit_error;
+	}
 out:
 	return err;
 }
@@ -434,97 +435,14 @@ pva_kmd_op_context_init(struct pva_kmd_context *ctx, const void *input_buffer,
err = pva_kmd_context_init(ctx, ctx_init_args->resource_table_capacity); err = pva_kmd_context_init(ctx, ctx_init_args->resource_table_capacity);
ctx_init_out.error = err; ctx_init_out.error = err;
ctx_init_out.ccq_shm_hdl = (uint64_t)ctx->ccq_shm_handle; ctx_init_out.ccq_shm_hdl = (uint64_t)ctx->ccq_shm_handle;
ctx_init_out.max_cmdbuf_chunk_size =
pva_kmd_get_max_cmdbuf_chunk_size(ctx->pva);
produce_data(out_buffer, &ctx_init_out, sizeof(ctx_init_out)); produce_data(out_buffer, &ctx_init_out, sizeof(ctx_init_out));
return PVA_SUCCESS; return PVA_SUCCESS;
} }
static enum pva_error pva_kmd_op_syncpt_register_async(
struct pva_kmd_context *ctx, const void *input_buffer,
uint32_t input_buffer_size, struct pva_kmd_ops_buffer *out_buffer,
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
{
enum pva_error err;
struct pva_syncpt_rw_info *syncpts;
struct pva_kmd_device_memory dev_mem;
uint32_t resource_id = 0;
struct pva_cmd_update_resource_table *update_cmd;
struct pva_resource_entry entry = { 0 };
struct pva_ops_response_syncpt_register syncpt_register_out = { 0 };
if (input_buffer_size != sizeof(struct pva_ops_syncpt_register)) {
pva_kmd_log_err("Syncpt register size is not correct");
return PVA_INVAL;
}
if (!access_ok(out_buffer,
sizeof(struct pva_ops_response_syncpt_register))) {
return PVA_INVAL;
}
/* Register RO syncpts */
dev_mem.iova = ctx->pva->syncpt_ro_iova;
dev_mem.va = 0;
dev_mem.size = ctx->pva->syncpt_offset * ctx->pva->num_syncpts;
dev_mem.pva = ctx->pva;
dev_mem.smmu_ctx_idx = PVA_R5_SMMU_CONTEXT_ID;
err = pva_kmd_add_syncpt_resource(&ctx->ctx_resource_table, &dev_mem,
&resource_id);
if (err != PVA_SUCCESS) {
goto err_out;
}
syncpt_register_out.syncpt_ro_res_id = resource_id;
syncpt_register_out.num_ro_syncpoints = ctx->pva->num_syncpts;
update_cmd =
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
ASSERT(update_cmd != NULL);
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
&entry);
ASSERT(err == PVA_SUCCESS);
pva_kmd_set_cmd_update_resource_table(
update_cmd, ctx->resource_table_id, resource_id, &entry);
/* Register RW syncpts */
pva_kmd_mutex_lock(&ctx->pva->syncpt_allocator.allocator_lock);
syncpts = (struct pva_syncpt_rw_info *)pva_kmd_get_block_unsafe(
&ctx->pva->syncpt_allocator, ctx->syncpt_block_index);
ASSERT(syncpts != NULL);
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS_PER_CONTEXT; i++) {
ctx->syncpt_ids[i] = syncpts[i].syncpt_id;
syncpt_register_out.synpt_ids[i] = syncpts[i].syncpt_id;
}
dev_mem.iova = syncpts[0].syncpt_iova;
pva_kmd_mutex_unlock(&ctx->pva->syncpt_allocator.allocator_lock);
dev_mem.va = 0;
dev_mem.size = ctx->pva->syncpt_offset * PVA_NUM_RW_SYNCPTS_PER_CONTEXT;
dev_mem.pva = ctx->pva;
dev_mem.smmu_ctx_idx = PVA_R5_SMMU_CONTEXT_ID;
err = pva_kmd_add_syncpt_resource(&ctx->ctx_resource_table, &dev_mem,
&resource_id);
if (err != PVA_SUCCESS) {
goto err_out;
}
syncpt_register_out.syncpt_rw_res_id = resource_id;
syncpt_register_out.synpt_size = ctx->pva->syncpt_offset;
update_cmd =
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
ASSERT(update_cmd != NULL);
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
&entry);
ASSERT(err == PVA_SUCCESS);
pva_kmd_set_cmd_update_resource_table(
update_cmd, ctx->resource_table_id, resource_id, &entry);
err_out:
syncpt_register_out.error = err;
produce_data(out_buffer, &syncpt_register_out,
sizeof(syncpt_register_out));
return PVA_SUCCESS;
}
static enum pva_error static enum pva_error
pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer, pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer,
uint32_t input_buffer_size, uint32_t input_buffer_size,
@@ -532,6 +450,7 @@ pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer,
{ {
const struct pva_ops_queue_create *queue_create_args; const struct pva_ops_queue_create *queue_create_args;
struct pva_ops_response_queue_create queue_out_args = { 0 }; struct pva_ops_response_queue_create queue_out_args = { 0 };
const struct pva_syncpt_rw_info *syncpt_info;
uint32_t queue_id = PVA_INVALID_QUEUE_ID; uint32_t queue_id = PVA_INVALID_QUEUE_ID;
enum pva_error err = PVA_SUCCESS; enum pva_error err = PVA_SUCCESS;
@@ -553,10 +472,12 @@ pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer,
goto out; goto out;
} }
syncpt_info = pva_kmd_queue_get_rw_syncpt_info(ctx, queue_id);
queue_out_args.error = err; queue_out_args.error = err;
queue_out_args.queue_id = queue_id; queue_out_args.queue_id = queue_id;
pva_kmd_read_syncpt_val(ctx->pva, ctx->syncpt_ids[queue_id], queue_out_args.syncpt_id = syncpt_info->syncpt_id;
&queue_out_args.syncpt_fence_counter); pva_kmd_read_syncpt_val(ctx->pva, syncpt_info->syncpt_id,
&queue_out_args.syncpt_current_value);
out: out:
produce_data(out_buffer, &queue_out_args, produce_data(out_buffer, &queue_out_args,
@@ -687,15 +608,16 @@ pva_kmd_op_synced_submit(struct pva_kmd_context *ctx, const void *input_buffer,
err = pva_kmd_submitter_submit(&ctx->submitter, &cmdbuf_builder, err = pva_kmd_submitter_submit(&ctx->submitter, &cmdbuf_builder,
&fence_val); &fence_val);
/* TODO: handle this error */ if (err != PVA_SUCCESS) {
ASSERT(err == PVA_SUCCESS); goto cancel_submit;
}
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val, err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto err_out; goto cancel_submit;
} }
return PVA_SUCCESS; return PVA_SUCCESS;
@@ -758,11 +680,6 @@ pva_kmd_sync_ops_handler(struct pva_kmd_context *ctx,
ctx, input_buffer, input_buffer_size, out_arg, ctx, input_buffer, input_buffer_size, out_arg,
pva_kmd_op_memory_register_async); pva_kmd_op_memory_register_async);
break; break;
case PVA_OPS_OPCODE_SYNCPT_REGISTER:
err = pva_kmd_op_synced_submit(
ctx, input_buffer, input_buffer_size, out_arg,
pva_kmd_op_syncpt_register_async);
break;
case PVA_OPS_OPCODE_EXECUTABLE_REGISTER: case PVA_OPS_OPCODE_EXECUTABLE_REGISTER:
err = pva_kmd_op_synced_submit( err = pva_kmd_op_synced_submit(
ctx, input_buffer, input_buffer_size, out_arg, ctx, input_buffer, input_buffer_size, out_arg,
@@ -798,11 +715,6 @@ enum pva_error pva_kmd_ops_handler(struct pva_kmd_context *ctx,
struct pva_kmd_ops_buffer in_buffer = { 0 }, out_buffer = { 0 }; struct pva_kmd_ops_buffer in_buffer = { 0 }, out_buffer = { 0 };
enum pva_error err = PVA_SUCCESS; enum pva_error err = PVA_SUCCESS;
if (ctx->pva->recovery) {
pva_kmd_log_err("PVA firmware aborted. No KMD ops allowed.");
return PVA_ERR_FW_ABORTED;
}
in_buffer.base = ops_buffer; in_buffer.base = ops_buffer;
in_buffer.size = ops_size; in_buffer.size = ops_size;


@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only // SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_utils.h" #include "pva_kmd_utils.h"
#include "pva_fw.h" #include "pva_fw.h"
#include "pva_kmd_device_memory.h" #include "pva_kmd_device_memory.h"
@@ -14,11 +15,8 @@
enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva) enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva)
{ {
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
enum pva_error err = PVA_SUCCESS; enum pva_error err = PVA_SUCCESS;
struct pva_cmd_suspend_fw *fw_suspend; struct pva_cmd_suspend_fw cmd = { 0 };
uint32_t fence_val;
pva_kmd_mutex_lock(&pva->powercycle_lock); pva_kmd_mutex_lock(&pva->powercycle_lock);
if (pva->refcount == 0u) { if (pva->refcount == 0u) {
@@ -27,44 +25,16 @@ enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva)
goto err_out; goto err_out;
} }
err = pva_kmd_submitter_prepare(dev_submitter, &builder); pva_kmd_set_cmd_suspend_fw(&cmd);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"PVA: Prepare submitter for FW suspend command failed\n");
goto err_out;
}
//Build args err = pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
fw_suspend = pva_kmd_reserve_cmd_space(&builder, sizeof(*fw_suspend));
if (fw_suspend == NULL) {
pva_kmd_log_err(
"PVA: Memory alloc for FW suspend command failed\n");
err = PVA_NOMEM;
goto cancel_submit;
}
pva_kmd_set_cmd_suspend_fw(fw_suspend);
//Submit
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"PVA: Submission for FW suspend command failed\n");
goto cancel_submit;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err( pva_kmd_log_err("PVA: Failed to submit FW suspend command\n");
"PVA: Waiting for FW timed out when preparing for suspend state\n");
goto err_out; goto err_out;
} }
cancel_submit:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out: err_out:
pva_kmd_mutex_unlock(&pva->powercycle_lock); pva_kmd_mutex_unlock(&pva->powercycle_lock);
return err; return err;
@@ -77,9 +47,11 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
struct pva_cmd_init_resource_table *res_cmd; struct pva_cmd_init_resource_table *res_cmd;
struct pva_cmd_init_queue *queue_cmd; struct pva_cmd_init_queue *queue_cmd;
struct pva_cmd_resume_fw *fw_resume; struct pva_cmd_resume_fw *fw_resume;
struct pva_cmd_init_shared_dram_buffer *shared_buf_cmd;
enum pva_error err; enum pva_error err;
uint32_t fence_val; uint32_t fence_val;
struct pva_kmd_queue *queue; struct pva_kmd_queue *queue;
const struct pva_syncpt_rw_info *syncpt_info;
pva_kmd_mutex_lock(&pva->powercycle_lock); pva_kmd_mutex_lock(&pva->powercycle_lock);
if (pva->refcount == 0u) { if (pva->refcount == 0u) {
@@ -89,8 +61,10 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
goto err_out; goto err_out;
} }
pva_kmd_send_resource_table_info_by_ccq(pva, &pva->dev_resource_table); err = pva_kmd_config_fw_after_boot(pva);
pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue); if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_prepare(dev_submitter, &builder); err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
@@ -140,14 +114,38 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
goto cancel_builder; goto cancel_builder;
} }
/* Initialize shared buffer */
shared_buf_cmd = pva_kmd_reserve_cmd_space(
&builder, sizeof(*shared_buf_cmd));
if (shared_buf_cmd == NULL) {
pva_kmd_log_err(
"PVA: Memory alloc for shared buffer registration in FW resume command failed\n");
err = PVA_NOMEM;
goto cancel_builder;
}
pva_dbg_printf(
"PVA: Resume shared buffer for context %d\n",
ctx->ccq_id);
pva_kmd_set_cmd_init_shared_dram_buffer(
shared_buf_cmd, ctx->ccq_id,
pva->kmd_fw_buffers[ctx->ccq_id]
.resource_memory->iova,
pva->kmd_fw_buffers[ctx->ccq_id]
.resource_memory->size);
pva_dbg_printf( pva_dbg_printf(
"PVA: Resume priv queue for context %d\n", "PVA: Resume priv queue for context %d\n",
ctx->ccq_id); ctx->ccq_id);
syncpt_info = pva_kmd_queue_get_rw_syncpt_info(
PVA_PRIV_CCQ_ID, ctx->ccq_id);
pva_kmd_set_cmd_init_queue( pva_kmd_set_cmd_init_queue(
queue_cmd, PVA_PRIV_CCQ_ID, queue_cmd, PVA_PRIV_CCQ_ID,
ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/ ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/
ctx->ctx_queue.queue_memory->iova, ctx->ctx_queue.queue_memory->iova,
ctx->ctx_queue.max_num_submit); ctx->ctx_queue.max_num_submit,
syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
/**Initialize resource table */ /**Initialize resource table */
for (uint32_t j = 0; j < ctx->max_n_queues; j++) { for (uint32_t j = 0; j < ctx->max_n_queues; j++) {
@@ -168,11 +166,16 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
goto cancel_builder; goto cancel_builder;
} }
syncpt_info =
pva_kmd_queue_get_rw_syncpt_info(
ctx, queue->queue_id);
pva_kmd_set_cmd_init_queue( pva_kmd_set_cmd_init_queue(
queue_cmd, queue->ccq_id, queue_cmd, queue->ccq_id,
queue->queue_id, queue->queue_id,
queue->queue_memory->iova, queue->queue_memory->iova,
queue->max_num_submit); queue->max_num_submit,
syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
} }
pva_kmd_mutex_unlock( pva_kmd_mutex_unlock(
&ctx->queue_allocator.allocator_lock); &ctx->queue_allocator.allocator_lock);
@@ -194,9 +197,12 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_log_err( pva_kmd_log_err(
"Waiting for FW timed out when resuming from suspend state"); "Waiting for FW timed out when resuming from suspend state");
goto err_out; goto cancel_builder;
} }
pva_kmd_mutex_unlock(&pva->powercycle_lock);
return PVA_SUCCESS;
cancel_builder: cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder); pva_kmd_cmdbuf_builder_cancel(&builder);


@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#include "pva_constants.h"
 #include "pva_kmd.h"
 #include "pva_kmd_utils.h"
 #include "pva_fw.h"
@@ -74,48 +75,23 @@ pva_kmd_queue_submit(struct pva_kmd_queue *queue,
return err; return err;
} }
void pva_kmd_queue_deinit(struct pva_kmd_queue *queue)
{
queue->queue_memory = NULL;
queue->ccq_id = PVA_INVALID_QUEUE_ID;
queue->max_num_submit = 0;
}
static enum pva_error notify_fw_queue_deinit(struct pva_kmd_context *ctx, static enum pva_error notify_fw_queue_deinit(struct pva_kmd_context *ctx,
struct pva_kmd_queue *queue) struct pva_kmd_queue *queue)
{ {
enum pva_error err = PVA_SUCCESS; struct pva_cmd_deinit_queue cmd = { 0 };
struct pva_kmd_cmdbuf_builder builder; enum pva_error err;
struct pva_cmd_deinit_queue *queue_cmd;
uint32_t fence_val;
err = pva_kmd_submitter_prepare(&ctx->submitter, &builder); pva_kmd_set_cmd_deinit_queue(&cmd, queue->ccq_id, queue->queue_id);
if (err != PVA_SUCCESS) {
goto end;
}
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd)); err = pva_kmd_submit_cmd_sync(&ctx->submitter, &cmd, sizeof(cmd),
if (queue_cmd == NULL) {
err = PVA_NOMEM;
goto cancel_submitter;
}
pva_kmd_set_cmd_deinit_queue(queue_cmd, queue->ccq_id, queue->queue_id);
err = pva_kmd_submitter_submit(&ctx->submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto cancel_submitter;
}
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US, PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US); PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto end; goto end;
} }
return PVA_SUCCESS; return PVA_SUCCESS;
cancel_submitter:
pva_kmd_cmdbuf_builder_cancel(&builder);
end: end:
return err; return err;
} }
@@ -126,10 +102,9 @@ enum pva_error pva_kmd_queue_create(struct pva_kmd_context *ctx,
{ {
struct pva_kmd_device_memory *submission_mem_kmd = NULL; struct pva_kmd_device_memory *submission_mem_kmd = NULL;
struct pva_kmd_queue *queue = NULL; struct pva_kmd_queue *queue = NULL;
struct pva_kmd_cmdbuf_builder builder; struct pva_cmd_init_queue cmd = { 0 };
struct pva_cmd_init_queue *queue_cmd;
uint32_t fence_val;
enum pva_error err, tmperr; enum pva_error err, tmperr;
const struct pva_syncpt_rw_info *syncpt_info;
queue = pva_kmd_zalloc_block(&ctx->queue_allocator, queue_id); queue = pva_kmd_zalloc_block(&ctx->queue_allocator, queue_id);
if (queue == NULL) { if (queue == NULL) {
@@ -160,42 +135,26 @@ enum pva_error pva_kmd_queue_create(struct pva_kmd_context *ctx,
goto err_free_kmd_memory; goto err_free_kmd_memory;
} }
err = pva_kmd_submitter_prepare(&ctx->submitter, &builder); syncpt_info = pva_kmd_queue_get_rw_syncpt_info(ctx, queue->queue_id);
pva_kmd_set_cmd_init_queue(&cmd, queue->ccq_id, queue->queue_id,
queue->queue_memory->iova,
queue->max_num_submit,
syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
err = pva_kmd_submit_cmd_sync(&ctx->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
goto unmap_iova; goto unmap_iova;
} }
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
if (queue_cmd == NULL) {
err = PVA_NOMEM;
goto cancel_submitter;
}
ASSERT(queue_cmd != NULL);
pva_kmd_set_cmd_init_queue(queue_cmd, queue->ccq_id, queue->queue_id,
queue->queue_memory->iova,
queue->max_num_submit);
err = pva_kmd_submitter_submit(&ctx->submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto cancel_submitter;
}
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
goto cancel_submitter;
}
return PVA_SUCCESS; return PVA_SUCCESS;
cancel_submitter:
pva_kmd_cmdbuf_builder_cancel(&builder);
unmap_iova: unmap_iova:
pva_kmd_device_memory_iova_unmap(submission_mem_kmd); pva_kmd_device_memory_iova_unmap(submission_mem_kmd);
err_free_kmd_memory: err_free_kmd_memory:
pva_kmd_device_memory_free(queue->queue_memory); pva_kmd_device_memory_free(queue->queue_memory);
pva_kmd_queue_deinit(queue);
err_free_queue: err_free_queue:
tmperr = pva_kmd_free_block(&ctx->queue_allocator, *queue_id); tmperr = pva_kmd_free_block(&ctx->queue_allocator, *queue_id);
ASSERT(tmperr == PVA_SUCCESS); ASSERT(tmperr == PVA_SUCCESS);
@@ -210,35 +169,40 @@ enum pva_error pva_kmd_queue_destroy(struct pva_kmd_context *ctx,
{ {
struct pva_kmd_queue *queue; struct pva_kmd_queue *queue;
enum pva_error err = PVA_SUCCESS; enum pva_error err = PVA_SUCCESS;
enum pva_error tmp_err;
/*
* TODO :
* Send command to FW to stop queue usage. Wait for ack.
* This call needs to be added after syncpoint and ccq functions are ready.
*/
pva_kmd_mutex_lock(&ctx->queue_allocator.allocator_lock); pva_kmd_mutex_lock(&ctx->queue_allocator.allocator_lock);
queue = pva_kmd_get_block_unsafe(&ctx->queue_allocator, queue_id); queue = pva_kmd_get_block_unsafe(&ctx->queue_allocator, queue_id);
if (queue == NULL) { if (queue == NULL) {
pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock); pva_kmd_log_err("Destroying non-existent queue");
return PVA_INVAL; err = PVA_INVAL;
goto unlock;
} }
if (!ctx->pva->recovery) {
err = notify_fw_queue_deinit(ctx, queue); err = notify_fw_queue_deinit(ctx, queue);
if (err != PVA_SUCCESS) { if (err != PVA_SUCCESS) {
pva_kmd_mutex_unlock( //Might happen if FW is aborted. It's safe to keep going.
&ctx->queue_allocator.allocator_lock); pva_kmd_log_err("Failed to notify FW to destroy queue");
return err;
}
} }
pva_kmd_device_memory_iova_unmap(queue->queue_memory); pva_kmd_device_memory_iova_unmap(queue->queue_memory);
pva_kmd_device_memory_free(queue->queue_memory); pva_kmd_device_memory_free(queue->queue_memory);
tmp_err = pva_kmd_free_block_unsafe(&ctx->queue_allocator, queue_id);
pva_kmd_queue_deinit(queue); // This cannot fail as we have already checked for queue existence and we
// are still holding the lock
ASSERT(tmp_err == PVA_SUCCESS);
unlock:
pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock); pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock);
return err;
err = pva_kmd_free_block(&ctx->queue_allocator, queue_id); }
ASSERT(err == PVA_SUCCESS);
return PVA_SUCCESS; const struct pva_syncpt_rw_info *
pva_kmd_queue_get_rw_syncpt_info(struct pva_kmd_context *ctx, uint8_t queue_id)
{
uint8_t ctx_offset =
safe_mulu32(ctx->ccq_id, PVA_NUM_RW_SYNCPTS_PER_CONTEXT);
uint32_t syncpt_index = safe_addu32(ctx_offset, queue_id);
ASSERT(syncpt_index < PVA_NUM_RW_SYNCPTS);
return &ctx->pva->rw_syncpts[syncpt_index];
} }
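
The new lookup helper is just index arithmetic over a per-context block of read/write syncpoints. A minimal sketch of that math, with the two bounds chosen as assumed example values rather than the real constants:

/* Illustrative only: mirrors the index computation in
 * pva_kmd_queue_get_rw_syncpt_info. The constant values are assumptions. */
#include <assert.h>
#include <stdint.h>

#define PVA_NUM_RW_SYNCPTS_PER_CONTEXT 8U   /* assumed value */
#define PVA_NUM_RW_SYNCPTS             64U  /* assumed value */

static uint32_t rw_syncpt_index(uint8_t ccq_id, uint8_t queue_id)
{
        /* Each context owns a contiguous block of RW syncpoints; the queue
         * ID selects one entry inside that block. */
        uint32_t idx = (uint32_t)ccq_id * PVA_NUM_RW_SYNCPTS_PER_CONTEXT +
                       (uint32_t)queue_id;
        assert(idx < PVA_NUM_RW_SYNCPTS);
        return idx;
}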


@@ -33,6 +33,8 @@ enum pva_error
  pva_kmd_queue_submit(struct pva_kmd_queue *queue,
  struct pva_fw_cmdbuf_submit_info const *submit_info);
  uint32_t pva_kmd_queue_space(struct pva_kmd_queue *queue);
- void pva_kmd_queue_deinit(struct pva_kmd_queue *queue);
+ const struct pva_syncpt_rw_info *
+ pva_kmd_queue_get_rw_syncpt_info(struct pva_kmd_context *ctx, uint8_t queue_id);
  #endif // PVA_KMD_QUEUE_H


@@ -46,8 +46,7 @@ static uint32_t get_max_dma_config_size(struct pva_kmd_device *pva)
  enum pva_error
  pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
  struct pva_kmd_device *pva,
- uint8_t user_smmu_ctx_id, uint32_t n_entries,
+ uint8_t user_smmu_ctx_id, uint32_t n_entries)
- uint32_t max_num_dma_configs)
  {
  uint32_t max_dma_config_size = get_max_dma_config_size(pva);
  enum pva_error err;
@@ -56,45 +55,55 @@ pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
  res_table->pva = pva;
  res_table->n_entries = n_entries;
  res_table->user_smmu_ctx_id = user_smmu_ctx_id;
+ pva_kmd_sema_init(&res_table->resource_semaphore, n_entries);
+ pva_kmd_mutex_init(&res_table->resource_table_lock);
  size = (uint64_t)safe_mulu32(
  n_entries, (uint32_t)sizeof(struct pva_resource_entry));
  res_table->table_mem = pva_kmd_device_memory_alloc_map(
  size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
- ASSERT(res_table->table_mem != NULL);
+ if (res_table->table_mem == NULL) {
+ err = PVA_NOMEM;
- pva_kmd_sema_init(&res_table->resource_semaphore, n_entries);
+ goto deinit_locks;
- pva_kmd_mutex_init(&res_table->resource_table_lock);
+ }
  size = (uint64_t)safe_mulu32(sizeof(struct pva_kmd_resource_record),
  n_entries);
  res_table->records_mem = pva_kmd_zalloc(size);
- ASSERT(res_table->records_mem != NULL);
+ if (res_table->records_mem == NULL) {
+ err = PVA_NOMEM;
+ goto free_table_mem;
+ }
  err = pva_kmd_block_allocator_init(
  &res_table->resource_record_allocator, res_table->records_mem,
  PVA_RESOURCE_ID_BASE, sizeof(struct pva_kmd_resource_record),
  n_entries);
- ASSERT(err == PVA_SUCCESS);
+ if (err != PVA_SUCCESS) {
+ goto free_records_mem;
+ }
- size = (uint64_t)safe_mulu32(max_num_dma_configs, max_dma_config_size);
+ err = pva_kmd_devmem_pool_init(&res_table->dma_config_pool, pva,
- res_table->dma_config_mem = pva_kmd_device_memory_alloc_map(
+ PVA_R5_SMMU_CONTEXT_ID,
- size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
- ASSERT(res_table->dma_config_mem != NULL);
- err = pva_kmd_block_allocator_init(&res_table->dma_config_allocator,
- res_table->dma_config_mem->va, 0,
  max_dma_config_size,
- max_num_dma_configs);
+ PVA_KMD_DMA_CONFIG_POOL_INCR);
- ASSERT(err == PVA_SUCCESS);
+ if (err != PVA_SUCCESS) {
+ goto free_resource_record_allocator;
- res_table->dma_aux = pva_kmd_zalloc(
+ }
- safe_mulu32((uint32_t)sizeof(struct pva_kmd_dma_resource_aux),
- max_num_dma_configs));
- ASSERT(res_table->dma_aux != NULL);
  return PVA_SUCCESS;
+ free_resource_record_allocator:
+ pva_kmd_block_allocator_deinit(&res_table->resource_record_allocator);
+ free_records_mem:
+ pva_kmd_free(res_table->records_mem);
+ free_table_mem:
+ pva_kmd_device_memory_free(res_table->table_mem);
+ deinit_locks:
+ pva_kmd_mutex_deinit(&res_table->resource_table_lock);
+ pva_kmd_sema_deinit(&res_table->resource_semaphore);
+ return err;
  }
  static struct pva_kmd_resource_record *
@@ -118,7 +127,7 @@ pva_kmd_alloc_resource_id(struct pva_kmd_resource_table *resource_table,
  goto out;
  }
- rec = (struct pva_kmd_resource_record *)pva_kmd_alloc_block(
+ rec = (struct pva_kmd_resource_record *)pva_kmd_zalloc_block(
  &resource_table->resource_record_allocator, out_resource_id);
  ASSERT(rec != NULL);
@@ -141,9 +150,8 @@ pva_kmd_free_resource_id(struct pva_kmd_resource_table *resource_table,
  static void
  pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
- uint32_t resource_id)
+ uint32_t resource_id, bool drop_dma_reference)
  {
- enum pva_error err;
  struct pva_kmd_resource_record *rec = pva_kmd_get_block_unsafe(
  &resource_table->resource_record_allocator, resource_id);
@@ -151,9 +159,7 @@ pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
  switch (rec->type) {
  case PVA_RESOURCE_TYPE_DRAM:
- if (rec->dram.syncpt != true) {
  pva_kmd_device_memory_free(rec->dram.mem);
- }
  break;
  case PVA_RESOURCE_TYPE_EXEC_BIN:
  pva_kmd_unload_executable(&rec->vpu_bin.symbol_table,
@@ -161,12 +167,12 @@ pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
  rec->vpu_bin.sections_mem);
  break;
  case PVA_RESOURCE_TYPE_DMA_CONFIG: {
- struct pva_kmd_dma_resource_aux *dma_aux;
+ if (drop_dma_reference) {
- dma_aux = &resource_table->dma_aux[rec->dma_config.block_index];
+ pva_kmd_unload_dma_config_unsafe(
- pva_kmd_unload_dma_config_unsafe(dma_aux);
+ rec->dma_config.aux_mem);
- err = pva_kmd_free_block(&resource_table->dma_config_allocator,
+ }
- rec->dma_config.block_index);
+ pva_kmd_free(rec->dma_config.aux_mem);
- ASSERT(err == PVA_SUCCESS);
+ pva_kmd_devmem_pool_free(&rec->dma_config.devmem);
  break;
  }
@@ -177,33 +183,6 @@ pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
  pva_kmd_free_resource_id(resource_table, resource_id);
  }
- enum pva_error
- pva_kmd_add_syncpt_resource(struct pva_kmd_resource_table *resource_table,
- struct pva_kmd_device_memory *dev_mem,
- uint32_t *out_resource_id)
- {
- struct pva_kmd_resource_record *rec =
- pva_kmd_alloc_resource_id(resource_table, out_resource_id);
- if (rec == NULL) {
- pva_kmd_log_err("No more resource id");
- return PVA_NO_RESOURCE_ID;
- }
- pva_kmd_mutex_lock(&resource_table->resource_table_lock);
- if (*out_resource_id > resource_table->curr_max_resource_id) {
- resource_table->curr_max_resource_id = *out_resource_id;
- }
- pva_kmd_mutex_unlock(&resource_table->resource_table_lock);
- rec->type = PVA_RESOURCE_TYPE_DRAM;
- rec->dram.mem = dev_mem;
- rec->dram.syncpt = true;
- rec->ref_count = 1;
- return PVA_SUCCESS;
- }
  enum pva_error
  pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
  struct pva_kmd_device_memory *dev_mem,
@@ -225,7 +204,6 @@ pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
  rec->type = PVA_RESOURCE_TYPE_DRAM;
  rec->dram.mem = dev_mem;
- rec->dram.syncpt = false;
  rec->ref_count = 1;
  return PVA_SUCCESS;
@@ -271,6 +249,7 @@ void pva_kmd_update_fw_resource_table(struct pva_kmd_resource_table *res_table)
  entry->size_lo = iova_lo(rec->dram.mem->size);
  entry->size_hi = iova_hi(rec->dram.mem->size);
  entry->smmu_context_id = rec->dram.mem->smmu_ctx_idx;
+ entry->access_flags = rec->dram.mem->iova_access_flags;
  break;
  case PVA_RESOURCE_TYPE_INVALID:
  break;
@@ -349,7 +328,7 @@ void pva_kmd_drop_resource_unsafe(struct pva_kmd_resource_table *resource_table,
  rec->ref_count = safe_subu32(rec->ref_count, 1U);
  if (rec->ref_count == 0) {
- pva_kmd_release_resource(resource_table, resource_id);
+ pva_kmd_release_resource(resource_table, resource_id, true);
  }
  }
@@ -414,6 +393,7 @@ pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
  entry->size_lo = iova_lo(rec->dram.mem->size);
  entry->size_hi = iova_hi(rec->dram.mem->size);
  entry->smmu_context_id = rec->dram.mem->smmu_ctx_idx;
+ entry->access_flags = rec->dram.mem->iova_access_flags;
  break;
  case PVA_RESOURCE_TYPE_EXEC_BIN:
  entry->type = rec->type;
@@ -423,6 +403,7 @@ pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
  entry->size_hi = iova_hi(rec->vpu_bin.metainfo_mem->size);
  entry->smmu_context_id =
  rec->vpu_bin.metainfo_mem->smmu_ctx_idx;
+ entry->access_flags = PVA_ACCESS_RO;
  break;
  case PVA_RESOURCE_TYPE_DMA_CONFIG:
  entry->type = rec->type;
@@ -431,6 +412,7 @@ pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
  entry->size_lo = iova_lo(rec->dma_config.size);
  entry->size_hi = iova_hi(rec->dma_config.size);
  entry->smmu_context_id = PVA_R5_SMMU_CONTEXT_ID;
+ entry->access_flags = PVA_ACCESS_RO;
  break;
  default:
  pva_kmd_log_err("Unsupported resource type");
@@ -447,24 +429,30 @@ enum pva_error pva_kmd_add_dma_config_resource(
  uint32_t dma_config_size, uint32_t *out_resource_id)
  {
  enum pva_error err = PVA_SUCCESS;
- uint32_t block_idx, fw_fetch_size;
+ uint32_t fw_fetch_size;
  void *fw_dma_cfg;
  struct pva_kmd_dma_resource_aux *dma_aux;
  struct pva_kmd_resource_record *rec;
  uint32_t res_id;
+ struct pva_kmd_devmem_element dma_cfg_mem = { 0 };
- fw_dma_cfg = pva_kmd_zalloc_block(&resource_table->dma_config_allocator,
+ err = pva_kmd_devmem_pool_zalloc(&resource_table->dma_config_pool,
- &block_idx);
+ &dma_cfg_mem);
- if (fw_dma_cfg == NULL) {
+ if (err != PVA_SUCCESS) {
- err = PVA_NOMEM;
  goto err_out;
  }
+ fw_dma_cfg = pva_kmd_get_devmem_va(&dma_cfg_mem);
  // Must satisfy alignment requirement for converting to struct
  // pva_dma_config_resource*
  ASSERT(((uintptr_t)fw_dma_cfg) % sizeof(uint64_t) == 0);
- dma_aux = &resource_table->dma_aux[block_idx];
+ dma_aux = pva_kmd_zalloc(sizeof(struct pva_kmd_dma_resource_aux));
+ if (dma_aux == NULL) {
+ err = PVA_NOMEM;
+ goto free_dma_cfg_mem;
+ }
+ dma_aux->res_table = resource_table;
  pva_kmd_mutex_lock(&resource_table->resource_table_lock);
  err = pva_kmd_load_dma_config(resource_table, dma_cfg_hdr,
@@ -472,7 +460,7 @@ enum pva_error pva_kmd_add_dma_config_resource(
  &fw_fetch_size);
  pva_kmd_mutex_unlock(&resource_table->resource_table_lock);
  if (err != PVA_SUCCESS) {
- goto free_block;
+ goto free_dma_aux;
  }
  rec = pva_kmd_alloc_resource_id(resource_table, &res_id);
@@ -489,12 +477,9 @@ enum pva_error pva_kmd_add_dma_config_resource(
  rec->type = PVA_RESOURCE_TYPE_DMA_CONFIG;
  rec->ref_count = 1;
- rec->dma_config.block_index = block_idx;
+ rec->dma_config.devmem = dma_cfg_mem;
- rec->dma_config.iova_addr = safe_addu64(
+ rec->dma_config.aux_mem = dma_aux;
- resource_table->dma_config_mem->iova,
+ rec->dma_config.iova_addr = pva_kmd_get_devmem_iova(&dma_cfg_mem);
- (uint64_t)safe_mulu32(
- block_idx,
- resource_table->dma_config_allocator.block_size));
  rec->dma_config.size = fw_fetch_size;
  *out_resource_id = res_id;
@@ -504,8 +489,10 @@ unload_dma:
  pva_kmd_mutex_lock(&resource_table->resource_table_lock);
  pva_kmd_unload_dma_config_unsafe(dma_aux);
  pva_kmd_mutex_unlock(&resource_table->resource_table_lock);
- free_block:
+ free_dma_aux:
- pva_kmd_free_block(&resource_table->dma_config_allocator, block_idx);
+ pva_kmd_free(dma_aux);
+ free_dma_cfg_mem:
+ pva_kmd_devmem_pool_free(&dma_cfg_mem);
  err_out:
  return err;
  }
@@ -523,7 +510,7 @@ pva_kmd_release_all_resources(struct pva_kmd_resource_table *res_table)
  struct pva_kmd_resource_record *rec =
  pva_kmd_peek_resource(res_table, id);
  if (rec != NULL) {
- pva_kmd_release_resource(res_table, id);
+ pva_kmd_release_resource(res_table, id, false);
  }
  }
  pva_kmd_mutex_unlock(&res_table->resource_table_lock);
@@ -533,11 +520,9 @@ pva_kmd_release_all_resources(struct pva_kmd_resource_table *res_table)
  void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table)
  {
  pva_kmd_release_all_resources(res_table);
- pva_kmd_free(res_table->dma_aux);
- pva_kmd_block_allocator_deinit(&res_table->dma_config_allocator);
- pva_kmd_device_memory_free(res_table->dma_config_mem);
  pva_kmd_block_allocator_deinit(&res_table->resource_record_allocator);
  pva_kmd_free(res_table->records_mem);
+ pva_kmd_devmem_pool_deinit(&res_table->dma_config_pool);
  pva_kmd_mutex_deinit(&res_table->resource_table_lock);
  pva_kmd_sema_deinit(&res_table->resource_semaphore);
  pva_kmd_device_memory_free(res_table->table_mem);
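
With the per-resource block allocator gone, the DMA-config lifecycle reduces to a zalloc/free pair on the new devmem pool plus a separately allocated aux record. A condensed sketch of that flow, using only names that appear in the diff above; signatures not visible there are assumptions:

/* Sketch only, not a verbatim excerpt of pva_kmd_add_dma_config_resource. */
static enum pva_error alloc_dma_cfg(struct pva_kmd_resource_table *res_table)
{
        struct pva_kmd_devmem_element dma_cfg_mem = { 0 };
        struct pva_kmd_dma_resource_aux *dma_aux;
        enum pva_error err;

        err = pva_kmd_devmem_pool_zalloc(&res_table->dma_config_pool,
                                         &dma_cfg_mem);
        if (err != PVA_SUCCESS)
                return err;

        dma_aux = pva_kmd_zalloc(sizeof(*dma_aux));
        if (dma_aux == NULL) {
                /* Unwind in reverse order on failure. */
                pva_kmd_devmem_pool_free(&dma_cfg_mem);
                return PVA_NOMEM;
        }
        dma_aux->res_table = res_table;

        /* ... fill the config via pva_kmd_get_devmem_va(&dma_cfg_mem) and
         * record pva_kmd_get_devmem_iova(&dma_cfg_mem) in the resource ... */

        /* Release path mirrors the allocation: */
        pva_kmd_free(dma_aux);
        pva_kmd_devmem_pool_free(&dma_cfg_mem);
        return PVA_SUCCESS;
}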


@@ -14,12 +14,12 @@
  #include "pva_kmd_dma_cfg.h"
  #include "pva_kmd_mutex.h"
  #include "pva_kmd_thread_sema.h"
+ #include "pva_kmd_devmem_pool.h"
  struct pva_kmd_device;
  struct pva_kmd_dram_resource {
  struct pva_kmd_device_memory *mem;
- bool syncpt;
  };
  struct pva_kmd_vpu_bin_resource {
@@ -29,7 +29,8 @@ struct pva_kmd_vpu_bin_resource {
  };
  struct pva_kmd_dma_config_resource {
- uint32_t block_index;
+ struct pva_kmd_devmem_element devmem;
+ struct pva_kmd_dma_resource_aux *aux_mem;
  uint64_t size;
  uint64_t iova_addr;
  };
@@ -70,13 +71,8 @@ struct pva_kmd_resource_table {
  /** Memory for resource table entries, in R5 segment */
  struct pva_kmd_device_memory *table_mem;
- /** Memory for fw dma configs, in DMA segment */
+ /** Pool for FW DMA configurations */
- struct pva_kmd_device_memory *dma_config_mem;
+ struct pva_kmd_devmem_pool dma_config_pool;
- struct pva_kmd_block_allocator dma_config_allocator;
- /** Memory for tracking resources used by DMA configuration. Single
- * allocation shared by all DMA configs */
- struct pva_kmd_dma_resource_aux *dma_aux;
  /** Memory for resource records */
  void *records_mem;
@@ -88,8 +84,7 @@ struct pva_kmd_resource_table {
  enum pva_error
  pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
  struct pva_kmd_device *pva,
- uint8_t user_smmu_ctx_id, uint32_t n_entries,
+ uint8_t user_smmu_ctx_id, uint32_t n_entries);
- uint32_t max_num_dma_configs);
  void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table);
  /** KMD only writes to FW resource table during init time. Once the address of
@@ -97,11 +92,6 @@ void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table);
  */
  void pva_kmd_update_fw_resource_table(struct pva_kmd_resource_table *res_table);
- enum pva_error
- pva_kmd_add_syncpt_resource(struct pva_kmd_resource_table *resource_table,
- struct pva_kmd_device_memory *dev_mem,
- uint32_t *out_resource_id);
  enum pva_error
  pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
  struct pva_kmd_device_memory *memory,


@@ -2,6 +2,7 @@
  // SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  #include "pva_kmd_sha256.h"
+ #include "pva_math_utils.h"
  #define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
  #define ROTRIGHT(a, b) (((a) >> (b)) | ((a) << (32 - (b))))
@@ -58,9 +59,11 @@ static void sha256_transform(struct sha256_ctx *ctx, const void *data_in)
  m[i] = SWAP32(data[i]);
  }
  for (i = 0; i < U32(64) - U32(16); ++i) {
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ m[i + U32(16)] = safe_wrap_add_u32(
- m[i + U32(16)] = SIG1(m[U32(14) + i]) + m[U32(9) + i] +
+ safe_wrap_add_u32(safe_wrap_add_u32(SIG1(m[U32(14) + i]),
- SIG0(m[U32(1) + i]) + m[i];
+ m[U32(9) + i]),
+ SIG0(m[U32(1) + i])),
+ m[i]);
  }
  a = ctx->state[0];
@@ -73,38 +76,32 @@ static void sha256_transform(struct sha256_ctx *ctx, const void *data_in)
  h = ctx->state[7];
  for (i = 0; i < U32(64); ++i) {
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ t1 = safe_wrap_add_u32(
- t1 = h + SHA_EP1(e) + CH(e, f, g) + k[i] + m[i];
+ safe_wrap_add_u32(
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ safe_wrap_add_u32(safe_wrap_add_u32(h,
- t2 = SHA_EP0(a) + MAJ(a, b, c);
+ SHA_EP1(e)),
+ CH(e, f, g)),
+ k[i]),
+ m[i]);
+ t2 = safe_wrap_add_u32(SHA_EP0(a), MAJ(a, b, c));
  h = g;
  g = f;
  f = e;
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ e = safe_wrap_add_u32(d, t1);
- e = d + t1;
  d = c;
  c = b;
  b = a;
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ a = safe_wrap_add_u32(t1, t2);
- a = t1 + t2;
  }
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ ctx->state[0] = safe_wrap_add_u32(ctx->state[0], a);
- ctx->state[0] += a;
+ ctx->state[1] = safe_wrap_add_u32(ctx->state[1], b);
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ ctx->state[2] = safe_wrap_add_u32(ctx->state[2], c);
- ctx->state[1] += b;
+ ctx->state[3] = safe_wrap_add_u32(ctx->state[3], d);
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ ctx->state[4] = safe_wrap_add_u32(ctx->state[4], e);
- ctx->state[2] += c;
+ ctx->state[5] = safe_wrap_add_u32(ctx->state[5], f);
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
+ ctx->state[6] = safe_wrap_add_u32(ctx->state[6], g);
- ctx->state[3] += d;
+ ctx->state[7] = safe_wrap_add_u32(ctx->state[7], h);
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
- ctx->state[4] += e;
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
- ctx->state[5] += f;
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
- ctx->state[6] += g;
- /* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
- ctx->state[7] += h;
  }
  void sha256_init(struct sha256_ctx *ctx)
@@ -127,7 +124,8 @@ void sha256_update(struct sha256_ctx *ctx, const void *data, size_t len)
  for (i = 0; i < len; i += U32(64)) {
  ctx->bitlen &= U32(0xffffffff);
  sha256_transform(ctx, ((const uint8_t *)data) + i);
- ctx->bitlen += U32(512);
+ ctx->bitlen =
+ safe_wrap_add_u32((uint32_t)ctx->bitlen, U32(512));
  }
  }
@@ -148,7 +146,9 @@ void sha256_finalize(struct sha256_ctx *ctx, const void *input,
  /* the false of this condition is illegal for this API agreement */
  /* this check is here only for Coverity INT30-C */
- ctx->bitlen += input_size * U32(8);
+ ctx->bitlen = safe_wrap_add_u32((uint32_t)ctx->bitlen,
+ safe_wrap_mul_u32((uint32_t)input_size,
+ U32(8)));
  (void)memcpy(p, input, input_size);
  data[input_size] = 0x80;
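
The safe_wrap_* helpers themselves are not part of this diff. Assuming they simply make the intended modulo-2^32 wraparound explicit, so the Coverity INT30-C deviation comments can be dropped, they could look roughly like this; this is a hypothetical sketch, not the contents of pva_math_utils.h:

#include <stdint.h>

/* Widen to 64 bits, operate, then truncate, so the 2^32 wrap is explicit
 * and well-defined rather than an implicit unsigned overflow. */
static inline uint32_t safe_wrap_add_u32(uint32_t a, uint32_t b)
{
        return (uint32_t)(((uint64_t)a + (uint64_t)b) & 0xffffffffULL);
}

static inline uint32_t safe_wrap_mul_u32(uint32_t a, uint32_t b)
{
        return (uint32_t)(((uint64_t)a * (uint64_t)b) & 0xffffffffULL);
}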


@@ -7,82 +7,6 @@
  #include "pva_kmd_shim_trace_event.h"
  #include "pva_kmd_shared_buffer.h"
- static void
- setup_cmd_init_shared_dram_buffer(void *cmd, uint8_t interface,
- struct pva_kmd_shared_buffer *fw_buffer)
- {
- struct pva_cmd_init_shared_dram_buffer *init_cmd =
- (struct pva_cmd_init_shared_dram_buffer *)cmd;
- pva_kmd_set_cmd_init_shared_dram_buffer(
- init_cmd, interface, fw_buffer->resource_memory->iova,
- fw_buffer->resource_memory->size);
- }
- static void
- setup_cmd_deinit_shared_dram_buffer(void *cmd, uint8_t interface,
- struct pva_kmd_shared_buffer *fw_buffer)
- {
- struct pva_cmd_deinit_shared_dram_buffer *deinit_cmd =
- (struct pva_cmd_deinit_shared_dram_buffer *)cmd;
- pva_kmd_set_cmd_deinit_shared_dram_buffer(deinit_cmd, interface);
- }
- static enum pva_error
- notify_fw(struct pva_kmd_device *pva, uint8_t interface,
- void (*setup_cmd_cb)(void *cmd, uint8_t interface,
- struct pva_kmd_shared_buffer *fw_buffer),
- size_t cmd_size)
- {
- enum pva_error err;
- struct pva_kmd_cmdbuf_builder builder;
- struct pva_kmd_submitter *dev_submitter = &pva->submitter;
- struct pva_kmd_shared_buffer *fw_buffer;
- void *cmd_space;
- uint32_t fence_val;
- ASSERT(interface < PVA_MAX_NUM_CCQ);
- fw_buffer = &pva->kmd_fw_buffers[interface];
- err = pva_kmd_submitter_prepare(dev_submitter, &builder);
- if (err != PVA_SUCCESS) {
- goto err_out;
- }
- // Make sure FW buffer was allocated
- ASSERT(fw_buffer->header != NULL);
- cmd_space = pva_kmd_reserve_cmd_space(&builder, cmd_size);
- ASSERT(cmd_space != NULL);
- // Let the setup callback configure the specific command
- setup_cmd_cb(cmd_space, interface, fw_buffer);
- err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
- if (err != PVA_SUCCESS) {
- // Error is either QUEUE_FULL or TIMEDOUT
- goto cancel_builder;
- }
- err = pva_kmd_submitter_wait(dev_submitter, fence_val,
- PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
- PVA_KMD_WAIT_FW_TIMEOUT_US);
- if (err != PVA_SUCCESS) {
- pva_kmd_log_err(
- "Waiting for FW timed out while processing buffer command");
- goto err_out;
- }
- return PVA_SUCCESS;
- cancel_builder:
- pva_kmd_cmdbuf_builder_cancel(&builder);
- err_out:
- return err;
- }
  enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
  uint8_t interface,
  uint32_t element_size,
@@ -95,17 +19,24 @@ enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
  struct pva_kmd_device_memory *device_memory;
  struct pva_kmd_shared_buffer *buffer;
  uint64_t buffer_size;
+ struct pva_cmd_init_shared_dram_buffer init_cmd = { 0 };
  ASSERT(interface < PVA_MAX_NUM_CCQ);
  buffer = &pva->kmd_fw_buffers[interface];
+ // If the buffer is already initialized, skip buffer allocation and just notify FW.
+ // This is needed to support suspend/resume.
+ if (buffer->header == NULL) {
  // Ensure that the buffer body is a multiple of 'element size'
  buffer_size = safe_mulu64(num_entries, element_size);
- buffer_size = safe_addu64(buffer_size,
+ buffer_size =
+ safe_addu64(buffer_size,
  sizeof(struct pva_fw_shared_buffer_header));
- device_memory = pva_kmd_device_memory_alloc_map(
+ device_memory =
- buffer_size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
+ pva_kmd_device_memory_alloc_map(buffer_size, pva,
+ PVA_ACCESS_RW,
+ PVA_R5_SMMU_CONTEXT_ID);
  if (device_memory == NULL) {
  return PVA_NOMEM;
  }
@@ -116,8 +47,8 @@ enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
  buffer->header->element_size = element_size;
  buffer->header->head = 0U;
  buffer->header->tail = 0U;
- buffer->body =
+ buffer->body = (pva_offset_pointer(buffer->header,
- (pva_offset_pointer(buffer->header, sizeof(*buffer->header)));
+ sizeof(*buffer->header)));
  buffer->lock_cb = lock_cb;
  buffer->unlock_cb = unlock_cb;
  buffer->resource_offset = 0U;
@@ -125,12 +56,24 @@ enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
  err = pva_kmd_bind_shared_buffer_handler(pva, interface, pva);
  if (err != PVA_SUCCESS) {
+ pva_kmd_log_err_u64(
+ "Failed to bind shared buffer handler for interface",
+ interface);
  goto free_buffer_memory;
  }
+ } else {
+ device_memory = buffer->resource_memory;
+ }
- err = notify_fw(pva, interface, setup_cmd_init_shared_dram_buffer,
+ pva_kmd_set_cmd_init_shared_dram_buffer(
- sizeof(struct pva_cmd_init_shared_dram_buffer));
+ &init_cmd, interface, device_memory->iova, device_memory->size);
+ err = pva_kmd_submit_cmd_sync(&pva->submitter, &init_cmd,
+ sizeof(init_cmd),
+ PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
+ PVA_KMD_WAIT_FW_TIMEOUT_US);
  if (err != PVA_SUCCESS) {
+ pva_kmd_log_err("Failed to submit command");
  goto release_handler;
  }
@@ -140,6 +83,8 @@ release_handler:
  pva_kmd_release_shared_buffer_handler(pva, interface);
  free_buffer_memory:
  pva_kmd_device_memory_free(device_memory);
+ buffer->header = NULL;
+ buffer->resource_memory = NULL;
  return err;
  }
@@ -148,22 +93,26 @@ enum pva_error pva_kmd_shared_buffer_deinit(struct pva_kmd_device *pva,
  {
  enum pva_error err = PVA_SUCCESS;
  struct pva_kmd_shared_buffer *buffer;
+ struct pva_cmd_deinit_shared_dram_buffer deinit_cmd = { 0 };
  ASSERT(interface < PVA_MAX_NUM_CCQ);
  buffer = &pva->kmd_fw_buffers[interface];
- if (!pva->recovery) {
+ pva_kmd_set_cmd_deinit_shared_dram_buffer(&deinit_cmd, interface);
- err = notify_fw(
- pva, interface, setup_cmd_deinit_shared_dram_buffer,
+ err = pva_kmd_submit_cmd_sync(&pva->submitter, &deinit_cmd,
- sizeof(struct pva_cmd_deinit_shared_dram_buffer));
+ sizeof(deinit_cmd),
+ PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
+ PVA_KMD_WAIT_FW_TIMEOUT_US);
  if (err != PVA_SUCCESS) {
- pva_kmd_log_err("Failed to deinit FW buffer");
+ // This might happen if FW is aborted. It's safe to keep going.
- }
+ pva_kmd_log_err("Failed to notify FW of buffer deinit");
  }
  pva_kmd_release_shared_buffer_handler(pva, interface);
  pva_kmd_shared_buffer_process(pva, interface);
+ buffer->header = NULL;
  pva_kmd_device_memory_free(buffer->resource_memory);
  buffer->resource_memory = NULL;
@@ -176,6 +125,7 @@ static void shared_buffer_process_msg(struct pva_kmd_device *pva,
  enum pva_error err = PVA_SUCCESS;
  struct pva_kmd_fw_buffer_msg_header header;
  struct pva_kmd_fw_msg_vpu_trace vpu_trace;
+ struct pva_kmd_fw_msg_fence_trace fence_trace;
  struct pva_kmd_fw_msg_res_unreg unreg_data;
  struct pva_kmd_context *ctx = NULL;
  void *msg_body;
@@ -214,6 +164,12 @@ static void shared_buffer_process_msg(struct pva_kmd_device *pva,
  pva_kmd_shim_add_trace_vpu_exec(pva, &vpu_trace);
  break;
  }
+ case PVA_KMD_FW_BUF_MSG_TYPE_FENCE_TRACE: {
+ ASSERT(msg_size == sizeof(struct pva_kmd_fw_msg_fence_trace));
+ memcpy(&fence_trace, msg_body, sizeof(fence_trace));
+ pva_kmd_shim_add_trace_fence(pva, &fence_trace);
+ break;
+ }
  case PVA_KMD_FW_BUF_MSG_TYPE_RES_UNREG: {
  ASSERT(msg_size == sizeof(struct pva_kmd_fw_msg_res_unreg));
  memcpy(&unreg_data, msg_body, sizeof(unreg_data));
@@ -281,7 +237,7 @@ void pva_kmd_shared_buffer_process(void *pva_dev, uint8_t interface)
  // Note that ideally this should never happen as the buffer is expected to be
  // the same size as the resource table.
  // TODO: abort only the user context, not the device.
- pva_kmd_abort(pva);
+ pva_kmd_abort_fw(pva);
  }
  // Buffer corresponding to CCQ 0 is used for sending messages common to a VM.
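
The new FENCE_TRACE case follows the same copy-then-dispatch pattern as the existing message types. For illustration, a future message type would slot in the same way; the type, struct, and handler below are hypothetical, only the pattern is taken from the switch shown above:

/* Illustrative case body only, written in the style of the dispatch above. */
case PVA_KMD_FW_BUF_MSG_TYPE_EXAMPLE: {
        struct pva_kmd_fw_msg_example example;

        /* The producer writes fixed-size records, so a size mismatch is a
         * protocol bug and is treated as a hard assertion. */
        ASSERT(msg_size == sizeof(example));
        memcpy(&example, msg_body, sizeof(example));
        pva_kmd_handle_example_msg(pva, &example);
        break;
}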


@@ -4,6 +4,7 @@
  #include "pva_kmd_device.h"
  #include "pva_fw_address_map.h"
  #include "pva_fw_hyp.h"
+ #include "pva_kmd_shim_init.h"
  #include "pva_kmd_thread_sema.h"
  #include "pva_kmd_constants.h"
  #include "pva_kmd_silicon_isr.h"
@@ -153,27 +154,12 @@ void pva_kmd_config_sid(struct pva_kmd_device *pva)
  }
  }
- static uint32_t pva_kmd_get_syncpt_ro_offset(struct pva_kmd_device *pva)
+ static uint32_t get_syncpt_offset(struct pva_kmd_device *pva,
+ uint64_t syncpt_iova)
  {
- if (pva->num_syncpts > 0U) {
+ if (pva->num_ro_syncpts > 0U) {
  uint64_t offset;
- offset = safe_subu64(pva->syncpt_ro_iova,
+ offset = safe_subu64(syncpt_iova, pva_kmd_get_r5_iova_start());
- pva_kmd_get_r5_iova_start());
- ASSERT(offset <= UINT32_MAX);
- return (uint32_t)offset;
- } else {
- // This is only for SIM mode where syncpoints are not supported.
- return PVA_R5_SYNCPT_REGION_IOVA_OFFSET_NOT_SET;
- }
- }
- static uint32_t pva_kmd_get_syncpt_rw_offset(struct pva_kmd_device *pva)
- {
- if (pva->num_syncpts > 0U) {
- uint64_t offset;
- offset = safe_subu64(pva->syncpt_rw_iova,
- pva_kmd_get_r5_iova_start());
  ASSERT(offset <= UINT32_MAX);
  return (uint32_t)offset;
@@ -249,12 +235,17 @@ enum pva_error pva_kmd_init_fw(struct pva_kmd_device *pva)
  if (pva->bl_sector_pack_format == PVA_BL_XBAR_RAW) {
  boot_sema = PVA_BOOT_SEMA_USE_XBAR_RAW;
  }
+ if (pva->test_mode) {
+ boot_sema |= PVA_BOOT_SEMA_TEST_MODE;
+ }
  pva_kmd_set_sema(pva, PVA_BOOT_SEMA, boot_sema);
- pva_kmd_write(pva, PVA_REG_HSP_SS2_SET_ADDR,
+ pva_kmd_set_sema(pva, PVA_RO_SYNC_BASE_SEMA,
- pva_kmd_get_syncpt_ro_offset(pva));
+ get_syncpt_offset(pva, pva->ro_syncpt_base_iova));
- pva_kmd_write(pva, PVA_REG_HSP_SS3_SET_ADDR,
+ pva_kmd_set_sema(pva, PVA_RW_SYNC_BASE_SEMA,
- pva_kmd_get_syncpt_rw_offset(pva));
+ get_syncpt_offset(pva, pva->rw_syncpt_base_iova));
+ pva_kmd_set_sema(pva, PVA_RW_SYNC_SIZE_SEMA,
+ pva->rw_syncpt_region_size);
  pva_kmd_config_sid_regs(pva);
@@ -290,6 +281,7 @@ free_sec_lic:
  pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
  free_fw_debug_mem:
  pva_kmd_drain_fw_print(&pva->fw_print_buffer);
+ pva_kmd_freeze_fw(pva);
  pva_kmd_device_memory_free(pva->fw_debug_mem);
  free_fw_mem:
  if (!pva->load_from_gsc) {
@@ -299,17 +291,14 @@ out:
  return err;
  }
- void pva_kmd_deinit_fw(struct pva_kmd_device *pva)
+ void pva_kmd_freeze_fw(struct pva_kmd_device *pva)
  {
- pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
- pva_kmd_drain_fw_print(&pva->fw_print_buffer);
  /*
- * Before powering off PVA, disable SEC error reporting.
+ * Before freezing PVA, disable SEC error reporting.
- * While powering off, PVA might generate (unexplained) error interrupts
+ * While setting the reset line, PVA might generate (unexplained) error
- * This causes HSM to read some PVA SEC registers. However, since PVA might
+ * interrupts This causes HSM to read some PVA SEC registers. However,
- * already be powergated by this time, access to PVA SEC registers from HSM
+ * since PVA might already be powergated by this time, access to PVA SEC
- * fails. This was discussed in Bug 3785498.
+ * registers from HSM fails. This was discussed in Bug 3785498.
  *
  * Note: we do not explicity enable these errors during power on since
  * 'enable' is their reset value
@@ -317,6 +306,17 @@
  disable_sec_mission_error_reporting(pva);
  disable_sec_latent_error_reporting(pva);
+ pva_kmd_set_reset_line(pva);
+ }
+ void pva_kmd_deinit_fw(struct pva_kmd_device *pva)
+ {
+ pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
+ pva_kmd_drain_fw_print(&pva->fw_print_buffer);
+ // FW so that we can free memory
+ pva_kmd_freeze_fw(pva);
  pva_kmd_device_memory_free(pva->fw_debug_mem);
  if (!pva->load_from_gsc) {
  pva_kmd_device_memory_free(pva->fw_bin_mem);
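
The base addresses handed to FW through the boot semaphores are R5-relative offsets rather than absolute IOVAs, and they must fit in a 32-bit register. A minimal sketch of that conversion, mirroring get_syncpt_offset above; the start address below is a made-up example, the real value comes from pva_kmd_get_r5_iova_start():

#include <assert.h>
#include <stdint.h>

#define R5_IOVA_START_EXAMPLE 0x40000000ULL  /* assumed illustrative value */

static uint32_t syncpt_offset(uint64_t syncpt_iova)
{
        uint64_t offset = syncpt_iova - R5_IOVA_START_EXAMPLE;

        /* Programmed into a 32-bit semaphore register, so it must fit. */
        assert(offset <= UINT32_MAX);
        return (uint32_t)offset;
}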


@@ -748,9 +748,11 @@ load_metainfo(struct pva_kmd_device *pva, uint64_t section_iova,
  metainfo->num_vmem_buffers = n_symbols;
  data_sections_mem = pva_offset_pointer(metainfo, sizeof(*metainfo));
+ if (n_data_sections > 0U && section_infos != NULL) {
  memcpy(data_sections_mem, section_infos,
  mulu32(n_data_sections, (uint32_t)sizeof(*section_infos),
  &math_err));
+ }
  vmem_buffers_mem = pva_offset_pointer(
  data_sections_mem,


@@ -42,6 +42,7 @@ int pva_kmd_hwpm_ip_pm(void *ip_dev, bool disable)
  {
  struct pva_kmd_device *dev = ip_dev;
  enum pva_error err = PVA_SUCCESS;
+ int ret = 0;
  if (disable) {
  err = pva_kmd_device_busy(dev);
@@ -51,5 +52,10 @@ int pva_kmd_hwpm_ip_pm(void *ip_dev, bool disable)
  } else {
  pva_kmd_device_idle(dev);
  }
- return err;
+ if (err != PVA_SUCCESS) {
+ ret = -1;
+ }
+ return ret;
  }


@@ -45,22 +45,16 @@ void pva_kmd_hyp_isr(void *data, enum pva_kmd_intr_line intr_line)
  if (wdt_val != 0) {
  /* Clear interrupt status */
- pva_kmd_write(pva, pva->regspec.sec_lic_intr_status,
+ pva_kmd_write(pva, pva->regspec.sec_lic_intr_status, wdt_val);
- intr_status &
- PVA_MASK(PVA_REG_SEC_LIC_INTR_WDT_MSB,
- PVA_REG_SEC_LIC_INTR_WDT_LSB));
  pva_kmd_log_err("PVA watchdog timeout!");
- pva_kmd_abort(pva);
+ pva_kmd_abort_fw(pva);
  }
  if (h1x_val != 0) {
  pva_kmd_log_err_u64("Host1x errors", h1x_val);
  /* Clear interrupt status */
- pva_kmd_write(pva, pva->regspec.sec_lic_intr_status,
+ pva_kmd_write(pva, pva->regspec.sec_lic_intr_status, h1x_val);
- intr_status &
+ pva_kmd_abort_fw(pva);
- PVA_MASK(PVA_REG_SEC_LIC_INTR_H1X_MSB,
- PVA_REG_SEC_LIC_INTR_H1X_LSB));
- pva_kmd_abort(pva);
  }
  if (hsp_val != 0) {


@@ -23,3 +23,10 @@ uint32_t pva_kmd_get_ccq_space(struct pva_kmd_device *pva, uint8_t ccq_id)
  PVA_REG_CCQ_STATUS2_NUM_ENTRIES_LSB, uint32_t);
  return safe_subu32((uint32_t)PVA_CCQ_DEPTH, len) / 2U;
  }
+ void pva_kmd_disable_all_interrupts_nosync(struct pva_kmd_device *pva)
+ {
+ for (int i = 0; i < PVA_KMD_INTR_LINE_COUNT; i++) {
+ pva_kmd_disable_intr_nosync(pva, (enum pva_kmd_intr_line)i);
+ }
+ }


@@ -2,6 +2,7 @@
  // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  #include "pva_kmd_submitter.h"
+ #include "pva_api_types.h"
  #include "pva_kmd_utils.h"
  #include "pva_kmd_abort.h"
@@ -70,6 +71,7 @@ pva_kmd_submitter_submit_with_fence(struct pva_kmd_submitter *submitter,
  submit_info.first_chunk_offset_lo = iova_lo(first_chunk_offset);
  submit_info.first_chunk_offset_hi = iova_hi(first_chunk_offset);
  submit_info.first_chunk_size = first_chunk_size;
+ submit_info.execution_timeout_ms = PVA_EXEC_TIMEOUT_INF;
  pva_kmd_mutex_lock(submitter->submit_lock);
  err = pva_kmd_queue_submit(submitter->queue, &submit_info);
@@ -108,6 +110,7 @@ enum pva_error pva_kmd_submitter_submit(struct pva_kmd_submitter *submitter,
  submit_info.first_chunk_offset_lo = iova_lo(first_chunk_offset);
  submit_info.first_chunk_offset_hi = iova_hi(first_chunk_offset);
  submit_info.first_chunk_size = first_chunk_size;
+ submit_info.execution_timeout_ms = PVA_EXEC_TIMEOUT_INF;
  /* TODO: remove these flags after FW execute command buffer with no engines. */
  submit_info.flags =
  PVA_INSERT8(0x3, PVA_CMDBUF_FLAGS_ENGINE_AFFINITY_MSB,
@@ -137,16 +140,63 @@ enum pva_error pva_kmd_submitter_wait(struct pva_kmd_submitter *submitter,
  {
  uint32_t volatile *fence_addr = submitter->post_fence_va;
  uint32_t time_spent = 0;
+ struct pva_kmd_device *pva = submitter->queue->pva;
  while (*fence_addr < fence_val) {
+ if (pva->recovery) {
+ return PVA_ERR_FW_ABORTED;
+ }
  pva_kmd_sleep_us(poll_interval_us);
  time_spent = safe_addu32(time_spent, poll_interval_us);
  if (time_spent >= timeout_us) {
  pva_kmd_log_err("pva_kmd_submitter_wait Timed out");
- pva_kmd_abort(submitter->queue->pva);
+ pva_kmd_abort_fw(submitter->queue->pva);
  return PVA_TIMEDOUT;
  }
  }
  return PVA_SUCCESS;
  }
+ enum pva_error pva_kmd_submit_cmd_sync(struct pva_kmd_submitter *submitter,
+ void *cmds, uint32_t size,
+ uint32_t poll_interval_us,
+ uint32_t timeout_us)
+ {
+ struct pva_kmd_cmdbuf_builder builder = { 0 };
+ enum pva_error err;
+ void *cmd_dst = NULL;
+ uint32_t fence_val = 0;
+ err = pva_kmd_submitter_prepare(submitter, &builder);
+ if (err != PVA_SUCCESS) {
+ goto err_out;
+ }
+ cmd_dst = pva_kmd_reserve_cmd_space(&builder, size);
+ if (cmd_dst == NULL) {
+ err = PVA_INVAL;
+ pva_kmd_log_err(
+ "Trying to submit too many commands using pva_kmd_submit_cmd_sync.");
+ goto cancel_builder;
+ }
+ memcpy(cmd_dst, cmds, size);
+ err = pva_kmd_submitter_submit(submitter, &builder, &fence_val);
+ if (err != PVA_SUCCESS) {
+ goto cancel_builder;
+ }
+ err = pva_kmd_submitter_wait(submitter, fence_val, poll_interval_us,
+ timeout_us);
+ if (err != PVA_SUCCESS) {
+ goto cancel_builder;
+ }
+ return err;
+ cancel_builder:
+ pva_kmd_cmdbuf_builder_cancel(&builder);
+ err_out:
+ return err;
+ }
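
Callers that previously open-coded the prepare/reserve/submit/wait sequence now collapse to a single call. A usage sketch, reusing the command type and setter from the queue code earlier in this change set; nothing here is new API beyond what the diffs show:

/* Sketch of the new one-shot calling convention. */
struct pva_cmd_deinit_queue cmd = { 0 };
enum pva_error err;

pva_kmd_set_cmd_deinit_queue(&cmd, queue->ccq_id, queue->queue_id);
err = pva_kmd_submit_cmd_sync(&ctx->submitter, &cmd, sizeof(cmd),
                              PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
                              PVA_KMD_WAIT_FW_TIMEOUT_US);
/* err is PVA_TIMEDOUT if FW never signals the fence, or PVA_ERR_FW_ABORTED
 * if the device entered recovery while waiting. The total command size must
 * stay below one chunk, per the helper's header comment. */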


@@ -57,4 +57,11 @@ pva_kmd_submitter_submit_with_fence(struct pva_kmd_submitter *submitter,
  /* add cmd */
  /* do submit with fence (provide a fence) */
+ /* Helper function to submit several commands and wait for them to complete.
+ Total size must be smaller than a chunk. */
+ enum pva_error pva_kmd_submit_cmd_sync(struct pva_kmd_submitter *submitter,
+ void *cmds, uint32_t size,
+ uint32_t poll_interval_us,
+ uint32_t timeout_us);
  #endif // PVA_KMD_SUBMITTER_H


@@ -59,11 +59,8 @@ enum pva_error
  pva_kmd_notify_fw_get_tegra_stats(struct pva_kmd_device *pva,
  struct pva_kmd_tegrastats *kmd_tegra_stats)
  {
- struct pva_kmd_cmdbuf_builder builder;
+ struct pva_cmd_get_tegra_stats cmd = { 0 };
- struct pva_kmd_submitter *dev_submitter = &pva->submitter;
- struct pva_cmd_get_tegra_stats *cmd;
  uint64_t buffer_offset = 0U;
- uint32_t fence_val;
  enum pva_error err = PVA_SUCCESS;
  struct pva_kmd_fw_tegrastats fw_tegra_stats = { 0 };
  bool stats_enabled = pva->debugfs_context.stats_enable;
@@ -86,29 +83,15 @@ pva_kmd_notify_fw_get_tegra_stats(struct pva_kmd_device *pva,
  goto err_out;
  }
- err = pva_kmd_submitter_prepare(dev_submitter, &builder);
+ pva_kmd_set_cmd_get_tegra_stats(&cmd, pva->tegra_stats_resource_id,
- if (err != PVA_SUCCESS) {
- goto dev_idle;
- }
- cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
- ASSERT(cmd != NULL);
- pva_kmd_set_cmd_get_tegra_stats(cmd, pva->tegra_stats_resource_id,
  pva->tegra_stats_buf_size,
  buffer_offset, stats_enabled);
- err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
+ err = pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
- if (err != PVA_SUCCESS) {
- pva_kmd_log_err("tegra stats cmd submission failed");
- goto cancel_builder;
- }
- err = pva_kmd_submitter_wait(dev_submitter, fence_val,
  PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
  PVA_KMD_WAIT_FW_TIMEOUT_US);
  if (err != PVA_SUCCESS) {
- pva_kmd_log_err(
+ pva_kmd_log_err("tegra stats cmd submission failed");
- "Waiting for FW timed out when getting tegra stats");
  goto dev_idle;
  }
@@ -129,8 +112,7 @@ out:
  kmd_tegra_stats->window_end_time = fw_tegra_stats.window_end_time;
  return PVA_SUCCESS;
- cancel_builder:
- pva_kmd_cmdbuf_builder_cancel(&builder);
  dev_idle:
  pva_kmd_device_idle(pva);
  err_out:


@@ -21,6 +21,7 @@ struct pva_kmd_device_memory {
  uint64_t size; /**< Size of the mapping. */
  struct pva_kmd_device *pva; /**< The PVA this memory is mapped to. */
  uint32_t smmu_ctx_idx; /**< The SMMU context this memory is mapped to. */
+ uint32_t iova_access_flags; /**< Access flags for the memory. RO - 1/WO - 2/RW - 3 */
  };
  /**
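
The new iova_access_flags field is recorded at map time and later copied verbatim into the FW resource-table entries shown earlier (entry->access_flags). A tiny sketch of the encoding implied by the field's comment; the enum spelling is an assumption based on the PVA_ACCESS_* names used elsewhere in this change set, not a copy of the real header:

/* Hypothetical illustration of the RO/WO/RW encoding. */
enum pva_access_flags_example {
        EXAMPLE_ACCESS_RO = 1,  /* read-only mapping  */
        EXAMPLE_ACCESS_WO = 2,  /* write-only mapping */
        EXAMPLE_ACCESS_RW = 3,  /* read-write mapping */
};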


@@ -9,7 +9,8 @@
  bool *val);
  void pva_kmd_debugfs_create_u32(struct pva_kmd_device *pva, const char *name,
  uint32_t *val);
- void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
+ enum pva_error pva_kmd_debugfs_create_file(struct pva_kmd_device *pva,
+ const char *name,
  struct pva_kmd_file_ops *fops);
  void pva_kmd_debugfs_remove_nodes(struct pva_kmd_device *pva);
  unsigned long pva_kmd_copy_data_from_user(void *dst, const void *src,


@@ -14,9 +14,6 @@ void pva_kmd_device_plat_deinit(struct pva_kmd_device *pva);
  void pva_kmd_read_syncpt_val(struct pva_kmd_device *pva, uint32_t syncpt_id,
  uint32_t *syncpt_value);
- void pva_kmd_get_syncpt_iova(struct pva_kmd_device *pva, uint32_t syncpt_id,
- uint64_t *syncpt_iova);
  void pva_kmd_allocate_syncpts(struct pva_kmd_device *pva);
  /**
@@ -34,7 +31,7 @@ void pva_kmd_power_off(struct pva_kmd_device *pva);
  * user submission halted. This is requied for host1x
  * watchdog, or kmd submission timeout failures.
  */
- void pva_kmd_fw_reset_assert(struct pva_kmd_device *pva);
+ void pva_kmd_freeze_fw(struct pva_kmd_device *pva);
  /**
  * @brief Initialize firmware.
@@ -60,4 +57,18 @@ enum pva_error pva_kmd_init_fw(struct pva_kmd_device *pva);
  * @param pva pointer to the PVA device to de-initialize
  */
  void pva_kmd_deinit_fw(struct pva_kmd_device *pva);
+ /**
+ * @brief Disable all interrupts without waiting for running interrupt handlers
+ * to complete.
+ *
+ * We don't wait for running interrupt handlers to complete because we want to
+ * be able to call this function from interrupt handles themselves.
+ *
+ * This function is to be called when PVA enters bad state and we want to
+ * protect KMD from potential interrupt floods from PVA (particularly watchdog
+ * interrupt that will trigger repeatedly by HW).
+ */
+ void pva_kmd_disable_all_interrupts_nosync(struct pva_kmd_device *pva);
  #endif // PVA_KMD_SHIM_INIT_H


@@ -81,9 +81,9 @@ void pva_kmd_enable_intr(struct pva_kmd_device *pva,
  enum pva_kmd_intr_line intr_line);
  /**
- * @brief Disable an interrupt line.
+ * @brief Disable an interrupt line without waiting for running interrupt handlers to complete.
  */
- void pva_kmd_disable_intr(struct pva_kmd_device *pva,
+ void pva_kmd_disable_intr_nosync(struct pva_kmd_device *pva,
  enum pva_kmd_intr_line intr_line);
  /**
@@ -104,13 +104,6 @@ void pva_kmd_free_intr(struct pva_kmd_device *pva,
  */
  enum pva_error pva_kmd_read_fw_bin(struct pva_kmd_device *pva);
- /**
- * @brief Reset assert FW so it can be in recovery and
- * user submission halted. This is requied for host1x
- * watchdog, or kmd submission timeout failures.
- */
- void pva_kmd_fw_reset_assert(struct pva_kmd_device *pva);
  /**
  * @brief Get starting IOVA of the memory shared by R5 and KMD.
  *
@@ -141,4 +134,9 @@ void pva_kmd_config_evp_seg_scr_regs(struct pva_kmd_device *pva);
  */
  void pva_kmd_config_sid_regs(struct pva_kmd_device *pva);
+ /**
+ * @brief Set the PVA HW reset line.
+ */
+ void pva_kmd_set_reset_line(struct pva_kmd_device *pva);
  #endif // PVA_KMD_SHIM_SILICON_H


@@ -9,4 +9,8 @@ void pva_kmd_shim_add_trace_vpu_exec(
  struct pva_kmd_device *pva,
  struct pva_kmd_fw_msg_vpu_trace const *trace_info);
+ void pva_kmd_shim_add_trace_fence(
+ struct pva_kmd_device *pva,
+ struct pva_kmd_fw_msg_fence_trace const *trace_info);
  #endif // PVA_KMD_SHIM_TRACE_EVENT_H


@@ -20,24 +20,10 @@ struct pva_ops_context_init {
 struct pva_ops_response_context_init {
 enum pva_error error;
-uint16_t max_cmdbuf_chunk_size;
 uint64_t ccq_shm_hdl;
 };
-struct pva_ops_syncpt_register {
-#define PVA_OPS_OPCODE_SYNCPT_REGISTER (2U | PVA_OPS_PRIVATE_OPCODE_FLAG)
-struct pva_ops_header header;
-};
-struct pva_ops_response_syncpt_register {
-enum pva_error error;
-uint32_t syncpt_ro_res_id;
-uint32_t syncpt_rw_res_id;
-uint32_t synpt_size;
-uint32_t synpt_ids[PVA_NUM_RW_SYNCPTS_PER_CONTEXT];
-uint32_t num_ro_syncpoints;
-uint32_t pad;
-};
 /**
 * Calculates the total memory size required for a PVA submission queue.
 * This includes the size of the queue header and the combined size of all command buffer submission info structures.
@@ -67,7 +53,8 @@ struct pva_ops_queue_create {
 struct pva_ops_response_queue_create {
 enum pva_error error;
 uint32_t queue_id;
-uint32_t syncpt_fence_counter;
+uint32_t syncpt_id;
+uint32_t syncpt_current_value;
 };
 /* KMD API: queue destroy */
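As a rough illustration of the queue-size calculation described in the comment above, the total is the queue header plus one submission-info slot per command buffer. The struct layouts below are hypothetical placeholders, not the real pva_ops definitions:

#include <stdint.h>
#include <stdio.h>

/* Placeholder layouts; the real queue header and per-chunk info differ. */
struct fake_queue_header { uint64_t read_idx, write_idx; };
struct fake_cmdbuf_submit_info { uint64_t iova; uint32_t size, flags; };

static uint64_t queue_mem_size(uint32_t max_cmdbuf_count)
{
    /* header plus one submission-info slot per command buffer */
    return sizeof(struct fake_queue_header) +
           (uint64_t)max_cmdbuf_count * sizeof(struct fake_cmdbuf_submit_info);
}

int main(void)
{
    printf("%llu bytes for 16 slots\n",
           (unsigned long long)queue_mem_size(16));
    return 0;
}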

View File

@@ -130,7 +130,8 @@ void pva_kmd_debugfs_create_u32(struct pva_kmd_device *pva, const char *name,
 debugfs_create_u32(name, 0644, de, pdata);
 }
-void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
+enum pva_error pva_kmd_debugfs_create_file(struct pva_kmd_device *pva,
+const char *name,
 struct pva_kmd_file_ops *pvafops)
 {
 struct pva_kmd_linux_device_data *device_data =
@@ -142,7 +143,12 @@ void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
 struct dentry *file;
 file = debugfs_create_file(name, 0644, de, pvafops, fops);
-ASSERT(file != NULL);
+if (file == NULL) {
+pva_kmd_log_err("Failed to create debugfs file");
+return PVA_INVAL;
+}
+return PVA_SUCCESS;
 }
 void pva_kmd_debugfs_remove_nodes(struct pva_kmd_device *pva)

View File

@@ -53,15 +53,6 @@ void pva_kmd_read_syncpt_val(struct pva_kmd_device *pva, uint32_t syncpt_id,
 }
 }
-void pva_kmd_get_syncpt_iova(struct pva_kmd_device *pva, uint32_t syncpt_id,
-uint64_t *syncpt_iova)
-{
-uint32_t offset = 0;
-offset = nvpva_syncpt_unit_interface_get_byte_offset_ext(syncpt_id);
-*syncpt_iova = safe_addu64(pva->syncpt_ro_iova, (uint64_t)offset);
-}
 void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
 {
 phys_addr_t base;
@@ -69,7 +60,6 @@ void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
 int err = 0;
 uint32_t stride, num_syncpts;
 uint32_t syncpt_page_size;
-uint32_t syncpt_offset[PVA_NUM_RW_SYNCPTS];
 dma_addr_t sp_start;
 struct device *dev;
 struct pva_kmd_linux_device_data *device_data =
@@ -92,53 +82,38 @@ void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
 syncpt_page_size = nvpva_syncpt_unit_interface_get_byte_offset_ext(1);
 dev = &device_data->smmu_contexts[PVA_R5_SMMU_CONTEXT_ID]->dev;
 if (iommu_get_domain_for_dev(dev)) {
-sp_start = dma_map_resource(dev, base, size, DMA_TO_DEVICE,
+sp_start = dma_map_resource(dev, base, size, DMA_BIDIRECTIONAL,
 DMA_ATTR_SKIP_CPU_SYNC);
 if (dma_mapping_error(dev, sp_start)) {
-FAULT("Failed to pin RO syncpoints\n");
+FAULT("Failed to pin syncpoints\n");
 }
 } else {
-FAULT("Failed to pin RO syncpoints\n");
+FAULT("Failed to pin syncpoints\n");
 }
-pva->syncpt_ro_iova = sp_start;
+pva->ro_syncpt_base_iova = sp_start;
-pva->syncpt_offset = syncpt_page_size;
+pva->syncpt_page_size = syncpt_page_size;
-pva->num_syncpts = (size / syncpt_page_size);
+pva->num_ro_syncpts = num_syncpts;
+// The same region is also used for RW syncpts...
+pva->rw_syncpt_base_iova = sp_start;
+pva->rw_syncpt_region_size = size;
 for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS; i++) {
-pva->syncpt_rw[i].syncpt_id = nvpva_get_syncpt_client_managed(
-props->pdev, "pva_syncpt");
-if (pva->syncpt_rw[i].syncpt_id == 0) {
+uint32_t syncpt_id;
+uint64_t syncpt_iova;
+syncpt_id = nvpva_get_syncpt_client_managed(props->pdev,
+"pva_syncpt");
+if (syncpt_id == 0) {
 FAULT("Failed to get syncpt\n");
 }
-syncpt_offset[i] =
+syncpt_iova = safe_addu64(
+sp_start,
 nvpva_syncpt_unit_interface_get_byte_offset_ext(
-pva->syncpt_rw[i].syncpt_id);
+syncpt_id));
-err = nvpva_syncpt_read_ext_check(
-props->pdev, pva->syncpt_rw[i].syncpt_id,
-&pva->syncpt_rw[i].syncpt_value);
-if (err < 0) {
-FAULT("Failed to read syncpoint value\n");
-}
+pva->rw_syncpts[i].syncpt_iova = syncpt_iova;
+pva->rw_syncpts[i].syncpt_id = syncpt_id;
 }
-pva->syncpt_rw_iova =
-dma_map_resource(dev,
-safe_addu64(base, (uint64_t)syncpt_offset[0]),
-safe_mulu64((uint64_t)pva->syncpt_offset,
-(uint64_t)PVA_NUM_RW_SYNCPTS),
-DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
-if (dma_mapping_error(dev, pva->syncpt_rw_iova)) {
-FAULT("Failed to pin RW syncpoints\n");
-}
-pva->syncpt_rw[0].syncpt_iova = pva->syncpt_rw_iova;
-for (uint32_t i = 1; i < PVA_NUM_RW_SYNCPTS; i++) {
-if (safe_addu32(syncpt_offset[i - 1], pva->syncpt_offset) !=
-syncpt_offset[i]) {
-FAULT("RW syncpts are not contiguous\n");
-}
-pva->syncpt_rw[i].syncpt_iova = safe_addu64(
-pva->syncpt_rw_iova,
-safe_mulu64((uint64_t)pva->syncpt_offset, (uint64_t)i));
-}
 }
@@ -166,25 +141,19 @@ void pva_kmd_linux_host1x_deinit(struct pva_kmd_device *pva)
 dev = &device_data->smmu_contexts[PVA_R5_SMMU_CONTEXT_ID]->dev;
 if (iommu_get_domain_for_dev(dev)) {
-dma_unmap_resource(dev, pva->syncpt_ro_iova, size,
+dma_unmap_resource(dev, pva->ro_syncpt_base_iova, size,
-DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
-dma_unmap_resource(dev, pva->syncpt_rw_iova,
-safe_mulu64((uint64_t)pva->syncpt_offset,
-(uint64_t)PVA_NUM_RW_SYNCPTS),
 DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
 } else {
 FAULT("Failed to unmap syncpts\n");
 }
 for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS; i++) {
 nvpva_syncpt_put_ref_ext(props->pdev,
-pva->syncpt_rw[i].syncpt_id);
+pva->rw_syncpts[i].syncpt_id);
-pva->syncpt_rw[i].syncpt_id = 0;
+pva->rw_syncpts[i].syncpt_id = 0;
-pva->syncpt_rw[i].syncpt_iova = 0;
+pva->rw_syncpts[i].syncpt_iova = 0;
-pva->syncpt_rw[i].syncpt_value = 0;
 }
-pva->syncpt_ro_iova = 0;
+pva->ro_syncpt_base_iova = 0;
-pva->syncpt_rw_iova = 0;
+pva->syncpt_page_size = 0;
-pva->syncpt_offset = 0;
 nvpva_syncpt_unit_interface_deinit(props->pdev);
 }
@@ -235,21 +204,11 @@ void pva_kmd_power_off(struct pva_kmd_device *pva)
 pva_kmd_linux_device_get_data(pva);
 struct nvpva_device_data *props = device_data->pva_device_properties;
-// Set reset line before cutting off power
-/* Power management operation is asynchronous. We don't control when PVA
- * will really be powered down. However, we need to free memories after
- * this call. Therefore, we assert the reset line to stop PVA from any
- * further activity. */
-reset_control_acquire(props->reset_control);
-reset_control_assert(props->reset_control);
-reset_control_release(props->reset_control);
 pm_runtime_mark_last_busy(&props->pdev->dev);
 pm_runtime_put(&props->pdev->dev);
 }
-void pva_kmd_fw_reset_assert(struct pva_kmd_device *pva)
+void pva_kmd_set_reset_line(struct pva_kmd_device *pva)
 {
 struct pva_kmd_linux_device_data *device_data =
 pva_kmd_linux_device_get_data(pva);
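The init code above derives syncpt_page_size from nvpva_syncpt_unit_interface_get_byte_offset_ext(1) and stores per-syncpoint IOVAs relative to the mapped base. Assuming the byte offset is simply syncpt_id times the page size (an assumption for illustration; the real helper may differ), the address arithmetic amounts to this standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Assumed layout: the byte offset of a syncpoint inside the mapped aperture
 * is syncpt_id * page_size, mirroring how the init code above derives
 * syncpt_page_size from the byte offset of syncpoint 1. */
static uint64_t syncpt_iova(uint64_t base_iova, uint32_t syncpt_id,
                            uint32_t page_size)
{
    return base_iova + (uint64_t)syncpt_id * page_size;
}

int main(void)
{
    uint64_t base = 0x60000000ULL; /* made-up base IOVA */
    printf("syncpt 42 -> 0x%llx\n",
           (unsigned long long)syncpt_iova(base, 42, 0x1000));
    return 0;
}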

View File

@@ -23,7 +23,7 @@ struct pva_kmd_device_memory_impl {
 struct pva_kmd_device_memory dev_mem;
 struct dma_buf *dmabuf;
 struct iosys_map iosysmap;
-struct dma_buf_attachment *dmabuf_attch;
+struct dma_buf_attachment *dmabuf_attach;
 struct sg_table *sgt;
 uint64_t offset;
 };
@@ -36,11 +36,20 @@ pva_kmd_device_memory_alloc_map(uint64_t size, struct pva_kmd_device *pva,
 struct device *dev = get_context_device(pva, smmu_ctx_idx);
 dma_addr_t pa = 0U;
 void *va = NULL;
+struct pva_kmd_device_memory_impl *mem_impl;
+mem_impl = pva_kmd_zalloc(sizeof(struct pva_kmd_device_memory_impl));
+if (mem_impl == NULL) {
+goto err_out;
+}
+if (size == 0u) {
+pva_kmd_log_err("Invalid allocation size");
+goto free_mem;
+}
-struct pva_kmd_device_memory_impl *mem_impl =
-pva_kmd_zalloc(sizeof(struct pva_kmd_device_memory_impl));
 va = dma_alloc_coherent(dev, size, &pa, GFP_KERNEL);
-if (va == NULL) {
+if (IS_ERR_OR_NULL(va)) {
 pva_kmd_log_err("dma_alloc_coherent failed");
 goto free_mem;
 }
@@ -49,12 +58,13 @@ pva_kmd_device_memory_alloc_map(uint64_t size, struct pva_kmd_device *pva,
 mem_impl->dev_mem.size = size;
 mem_impl->dev_mem.pva = pva;
 mem_impl->dev_mem.smmu_ctx_idx = smmu_ctx_idx;
+mem_impl->dev_mem.iova_access_flags = iova_access_flags;
 mem_impl->dmabuf = NULL;
 return &mem_impl->dev_mem;
 free_mem:
 pva_kmd_free(mem_impl);
+err_out:
 return NULL;
 }
@@ -66,13 +76,16 @@ struct pva_kmd_device_memory *
 pva_kmd_device_memory_acquire(uint64_t memory_handle, uint64_t offset,
 uint64_t size, struct pva_kmd_context *ctx)
 {
-struct pva_kmd_device_memory_impl *mem_impl =
-(struct pva_kmd_device_memory_impl *)pva_kmd_zalloc(
-sizeof(struct pva_kmd_device_memory_impl));
 struct dma_buf *dma_buf;
+struct pva_kmd_device_memory_impl *mem_impl;
+mem_impl = pva_kmd_zalloc(sizeof(struct pva_kmd_device_memory_impl));
+if (mem_impl == NULL) {
+goto err_out;
+}
 dma_buf = dma_buf_get(memory_handle);
-if (dma_buf == NULL) {
+if (IS_ERR_OR_NULL(dma_buf)) {
 pva_kmd_log_err("Failed to acquire memory");
 goto free_mem;
 }
@@ -92,6 +105,7 @@ put_dmabuf:
 dma_buf_put(dma_buf);
 free_mem:
 pva_kmd_free(mem_impl);
+err_out:
 return NULL;
 }
@@ -103,7 +117,7 @@ void pva_kmd_device_memory_free(struct pva_kmd_device_memory *mem)
 if (mem_impl->dmabuf != NULL) {
 /* This memory comes from dma_buf_get */
-if (mem->iova != 0U) {
+if (mem_impl->dmabuf_attach != NULL) {
 pva_kmd_device_memory_iova_unmap(mem);
 }
@@ -160,14 +174,28 @@ pva_kmd_device_memory_iova_map(struct pva_kmd_device_memory *memory,
 pva_math_error math_err = MATH_OP_SUCCESS;
 struct pva_kmd_device_memory_impl *mem_impl = container_of(
 memory, struct pva_kmd_device_memory_impl, dev_mem);
-// struct pva_kmd_linux_device_plat_data *plat_data =
-// pva_kmd_linux_device_get_plat_data(pva);
-// struct device *dev = plat_data->dev[smmu_ctx_idx];
 struct device *dev = get_context_device(pva, smmu_ctx_idx);
 struct dma_buf_attachment *attach;
 struct sg_table *sgt;
 enum pva_error err = PVA_SUCCESS;
+enum dma_data_direction dma_direction;
+uint64_t iova;
+switch (access_flags) {
+case PVA_ACCESS_RO: // Read-Only
+dma_direction = DMA_TO_DEVICE;
+break;
+case PVA_ACCESS_WO: // Write-Only
+dma_direction = DMA_FROM_DEVICE;
+break;
+case PVA_ACCESS_RW: // Read-Write
+dma_direction = DMA_BIDIRECTIONAL;
+break;
+default:
+pva_kmd_log_err("Invalid access flags\n");
+err = PVA_INVAL;
+goto err_out;
+}
 attach = dma_buf_attach(mem_impl->dmabuf, dev);
 if (IS_ERR_OR_NULL(attach)) {
@@ -176,28 +204,32 @@ pva_kmd_device_memory_iova_map(struct pva_kmd_device_memory *memory,
 goto err_out;
 }
-mem_impl->dmabuf_attch = attach;
-sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
+sgt = dma_buf_map_attachment(attach, dma_direction);
 if (IS_ERR_OR_NULL(sgt)) {
 err = PVA_INVAL;
 pva_kmd_log_err("Failed to map attachment\n");
 goto detach;
 }
-mem_impl->sgt = sgt;
-mem_impl->dev_mem.iova =
-addu64(sg_dma_address(sgt->sgl), mem_impl->offset, &math_err);
+iova = addu64(sg_dma_address(sgt->sgl), mem_impl->offset, &math_err);
 if (math_err != MATH_OP_SUCCESS) {
 err = PVA_INVAL;
 pva_kmd_log_err(
 "pva_kmd_device_memory_iova_map Invalid DMA address\n");
-goto detach;
+goto unmap;
 }
+mem_impl->sgt = sgt;
+mem_impl->dmabuf_attach = attach;
+mem_impl->dev_mem.iova = iova;
 mem_impl->dev_mem.pva = pva;
 mem_impl->dev_mem.smmu_ctx_idx = smmu_ctx_idx;
+mem_impl->dev_mem.iova_access_flags = access_flags;
 return PVA_SUCCESS;
+unmap:
+dma_buf_unmap_attachment(attach, sgt, dma_direction);
 detach:
-dma_buf_detach(mem_impl->dmabuf, mem_impl->dmabuf_attch);
+dma_buf_detach(mem_impl->dmabuf, attach);
 err_out:
 return err;
 }
@@ -209,10 +241,11 @@ void pva_kmd_device_memory_iova_unmap(struct pva_kmd_device_memory *memory)
 ASSERT(mem_impl->dmabuf != NULL);
-dma_buf_unmap_attachment(mem_impl->dmabuf_attch, mem_impl->sgt,
+dma_buf_unmap_attachment(mem_impl->dmabuf_attach, mem_impl->sgt,
 DMA_BIDIRECTIONAL);
-dma_buf_detach(mem_impl->dmabuf, mem_impl->dmabuf_attch);
+dma_buf_detach(mem_impl->dmabuf, mem_impl->dmabuf_attach);
-memory->iova = 0;
+mem_impl->sgt = NULL;
+mem_impl->dmabuf_attach = NULL;
 }
 uint64_t pva_kmd_get_r5_iova_start(void)
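The reworked error path in pva_kmd_device_memory_iova_map above follows the usual acquire-in-order, release-in-reverse goto idiom: the new unmap label runs before detach. A standalone sketch of that pattern with stand-in resource names (illustrative only, not the KMD's code):

#include <stdbool.h>
#include <stdio.h>

static bool attach_buf(void)   { puts("attach"); return true; }
static bool map_buf(void)      { puts("map");    return true; }
static bool compute_iova(void) { puts("iova");   return false; } /* force failure */
static void unmap_buf(void)    { puts("unmap");  }
static void detach_buf(void)   { puts("detach"); }

static int do_map(void)
{
    if (!attach_buf())
        goto err_out;
    if (!map_buf())
        goto detach;
    if (!compute_iova())
        goto unmap;   /* newest resource is released first */
    return 0;
unmap:
    unmap_buf();
detach:
    detach_buf();
err_out:
    return -1;
}

int main(void)
{
    return do_map() ? 1 : 0;
}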

View File

@@ -50,13 +50,13 @@
 extern struct platform_driver pva_kmd_linux_smmu_context_driver;
 extern atomic_t g_num_smmu_ctxs;
 static bool load_from_gsc = PVA_KMD_LOAD_FROM_GSC_DEFAULT;
-static bool app_authenticate = PVA_KMD_APP_AUTH_DEFAULT;
+static bool pva_test_mode; //false by default
 module_param(load_from_gsc, bool, 0);
 MODULE_PARM_DESC(load_from_gsc, "Load V3 FW from GSC");
-module_param(app_authenticate, bool, 0);
+module_param(pva_test_mode, bool, 0);
-MODULE_PARM_DESC(app_authenticate, "Enable app authentication");
+MODULE_PARM_DESC(pva_test_mode, "Enable test mode");
 struct nvpva_device_data t23x_pva0_props = {
 .version = PVA_CHIP_T23X,
@@ -112,11 +112,15 @@ static int pva_get_gsc_priv_hwid(struct platform_device *pdev)
 return fwspec->ids[0] & 0xffff;
 }
-static void pva_kmd_linux_register_hwpm(struct pva_kmd_device *pva)
+static int pva_kmd_linux_register_hwpm(struct pva_kmd_device *pva)
 {
 struct tegra_soc_hwpm_ip_ops *hwpm_ip_ops =
 pva_kmd_zalloc(sizeof(*hwpm_ip_ops));
+if (hwpm_ip_ops == NULL) {
+return -ENOMEM;
+}
 hwpm_ip_ops->ip_dev = pva;
 hwpm_ip_ops->ip_base_address = safe_addu64(
 pva->reg_phy_base[0], (uint64_t)pva->regspec.cfg_perf_mon);
@@ -125,6 +129,7 @@ static void pva_kmd_linux_register_hwpm(struct pva_kmd_device *pva)
 hwpm_ip_ops->hwpm_ip_reg_op = &pva_kmd_hwpm_ip_reg_op;
 tegra_soc_hwpm_ip_register(hwpm_ip_ops);
 pva->debugfs_context.data_hwpm = hwpm_ip_ops;
+return 0;
 }
 static void pva_kmd_linux_unregister_hwpm(struct pva_kmd_device *pva)
@@ -256,10 +261,57 @@ static void pva_kmd_free_co_mem(struct platform_device *pdev)
 }
 }
+static bool pva_kmd_in_test_mode(struct device *dev, bool param_test_mode)
+{
+const char *dt_test_mode = NULL;
+if (of_property_read_string(dev->of_node, "nvidia,test_mode_enable",
+&dt_test_mode)) {
+return param_test_mode;
+}
+if (strcmp(dt_test_mode, "true")) {
+return param_test_mode;
+}
+return true;
+}
 static struct kobj_type nvpva_kobj_ktype = {
 .sysfs_ops = &kobj_sysfs_ops,
 };
+/**
+ * Read VPU authentication property from device tree
+ *
+ * @param dev Pointer to the device structure
+ * @return true if authentication should be enabled, false otherwise
+ */
+static bool pva_kmd_linux_read_vpu_auth(const struct device *dev)
+{
+bool auth_enabled = false;
+int len;
+const __be32 *val;
+val = of_get_property(dev->of_node, "nvidia,vpu-auth", &len);
+if ((val != NULL) && (len >= (int)sizeof(__be32))) {
+u32 value = (u32)be32_to_cpu(*val);
+if (value != 0U) {
+auth_enabled = true;
+dev_dbg(dev, "VPU authentication enabled\n");
+} else {
+auth_enabled = false;
+dev_dbg(dev, "VPU authentication disabled\n");
+}
+} else {
+dev_dbg(dev,
+"No VPU authentication property found, using default: %d\n",
+auth_enabled);
+}
+return auth_enabled;
+}
 static int pva_probe(struct platform_device *pdev)
 {
 int err = 0U;
@@ -273,6 +325,9 @@ static int pva_probe(struct platform_device *pdev)
 struct clk_bulk_data *clks;
 struct clk *c;
+bool pva_enter_test_mode = false;
+bool app_authenticate;
 device_id = of_match_device(tegra_pva_of_match, dev);
 if (!device_id) {
 dev_err(dev, "no match for pva dev\n");
@@ -286,6 +341,8 @@ static int pva_probe(struct platform_device *pdev)
 return -ENODATA;
 }
+app_authenticate = pva_kmd_linux_read_vpu_auth(dev);
 /* Create devices for child nodes of this device */
 of_platform_default_populate(dev->of_node, NULL, dev);
@@ -300,17 +357,12 @@ static int pva_probe(struct platform_device *pdev)
 pva_props->pdev = pdev;
 mutex_init(&pva_props->lock);
-pva_device =
-pva_kmd_device_create(pva_props->version, 0, app_authenticate);
+pva_enter_test_mode = pva_kmd_in_test_mode(dev, pva_test_mode);
+pva_device = pva_kmd_device_create(
+pva_props->version, 0, app_authenticate, pva_enter_test_mode);
 pva_device->is_hv_mode = is_tegra_hypervisor_mode();
-/* On L4T, forcing boot from file */
-/* If needed to load from GSC, remove the below block */
-if (!pva_device->is_hv_mode) {
-load_from_gsc = false;
-}
 pva_device->load_from_gsc = load_from_gsc;
 pva_device->stream_ids[pva_device->r5_image_smmu_context_id] =
 pva_get_gsc_priv_hwid(pdev);
@@ -352,8 +404,17 @@ static int pva_probe(struct platform_device *pdev)
 pva_kmd_linux_host1x_init(pva_device);
-pva_kmd_debugfs_create_nodes(pva_device);
-pva_kmd_linux_register_hwpm(pva_device);
+err = pva_kmd_debugfs_create_nodes(pva_device);
+if (err != PVA_SUCCESS) {
+dev_err(dev, "debugfs creation failed\n");
+goto err_cdev_init;
+}
+err = pva_kmd_linux_register_hwpm(pva_device);
+if (err != PVA_SUCCESS) {
+dev_err(dev, "pva_kmd_linux_register_hwpm failed\n");
+goto err_cdev_init;
+}
 if (!pva_device->is_hv_mode && pva_device->load_from_gsc) {
 err = pva_kmd_get_co_info(pdev);
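pva_kmd_linux_read_vpu_auth above treats nvidia,vpu-auth as a single big-endian 32-bit cell and enables authentication for any non-zero value. The standalone sketch below mirrors only that decode step with a made-up property buffer; it produces the same numeric value be32_to_cpu() would return:

#include <stdint.h>
#include <stdio.h>

/* Decode a big-endian 32-bit device-tree cell from raw property bytes. */
static uint32_t be32_decode(const uint8_t *p)
{
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
    const uint8_t prop[] = { 0x00, 0x00, 0x00, 0x01 }; /* a <1> cell */
    int auth_enabled = (be32_decode(prop) != 0U);
    printf("VPU auth %s\n", auth_enabled ? "enabled" : "disabled");
    return 0;
}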

View File

@@ -6,6 +6,11 @@
 #include "trace/events/nvpva_ftrace.h"
 #include <linux/nvhost.h>
+static uint32_t get_job_id(uint32_t queue_id, uint64_t submit_id)
+{
+return (queue_id & 0x000000FF) << 24 | (submit_id & 0xFFFFFFU);
+}
 void pva_kmd_shim_add_trace_vpu_exec(
 struct pva_kmd_device *pva,
 struct pva_kmd_fw_msg_vpu_trace const *trace_info)
@@ -38,7 +43,8 @@ void pva_kmd_shim_add_trace_vpu_exec(
 // In V2, Job ID is a 32-bit value with the top 8 bits being the queue ID
 // and the bottom 24 bits being a per-task counter. In V3, we only use the
 // queue ID.
-uint32_t job_id = (trace_info->queue_id & 0x000000FF) << 24;
+uint32_t job_id =
+get_job_id(trace_info->queue_id, trace_info->submit_id);
 trace_pva_job_ext_event(job_id, trace_info->ccq_id,
 0, // syncpt_thresh,
@@ -50,3 +56,42 @@ void pva_kmd_shim_add_trace_vpu_exec(
 trace_info->num_prefences, trace_info->prog_id,
 trace_info->submit_id, vpu_start);
 }
+void pva_kmd_shim_add_trace_fence(
+struct pva_kmd_device *pva,
+struct pva_kmd_fw_msg_fence_trace const *trace_info)
+{
+uint32_t job_id;
+// We want to log events only for user workloads
+if (trace_info->ccq_id == PVA_PRIV_CCQ_ID) {
+return;
+}
+job_id = get_job_id(trace_info->queue_id, trace_info->submit_id);
+if (trace_info->action == PVA_KMD_FW_BUF_MSG_FENCE_ACTION_WAIT) {
+if (trace_info->type == PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SYNCPT) {
+trace_job_prefence(job_id, trace_info->fence_id,
+trace_info->value);
+} else if (trace_info->type ==
+PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SEMAPHORE) {
+trace_job_prefence_semaphore(job_id,
+trace_info->fence_id,
+trace_info->offset,
+trace_info->value);
+}
+} else if (trace_info->action ==
+PVA_KMD_FW_BUF_MSG_FENCE_ACTION_SIGNAL) {
+if (trace_info->type == PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SYNCPT) {
+trace_job_postfence(job_id, trace_info->fence_id,
+trace_info->value);
+} else if (trace_info->type ==
+PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SEMAPHORE) {
+trace_job_postfence_semaphore(job_id,
+trace_info->fence_id,
+trace_info->offset,
+trace_info->value);
+}
+}
+}
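get_job_id() above uses the V2 packing described in the comment: queue ID in the top 8 bits, the submission counter in the low 24 bits. A standalone round-trip of that encoding with arbitrary example values:

#include <stdint.h>
#include <stdio.h>

static uint32_t pack_job_id(uint32_t queue_id, uint64_t submit_id)
{
    return ((queue_id & 0x000000FFU) << 24) | (uint32_t)(submit_id & 0xFFFFFFU);
}

int main(void)
{
    uint32_t job_id = pack_job_id(3U, 70000ULL);

    /* Decode by undoing the shift and mask. */
    printf("job_id=0x%08x queue=%u submit=%u\n",
           job_id, job_id >> 24, job_id & 0xFFFFFFU);
    return 0;
}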

View File

@@ -14,9 +14,13 @@ static struct pva_kmd_isr_data *get_isr(struct pva_kmd_device *pva,
 struct pva_kmd_isr_data *isr_data;
 ASSERT(intr_line < PVA_KMD_INTR_LINE_COUNT);
 isr_data = &plat_data->isr[intr_line];
-ASSERT(isr_data->binded);
+if (!isr_data->binded) {
+return NULL;
+}
 return isr_data;
 }
 static irqreturn_t pva_isr(int irq, void *dev_id)
 {
 struct pva_kmd_isr_data *isr_data = (struct pva_kmd_isr_data *)dev_id;
@@ -35,40 +39,60 @@ enum pva_error pva_kmd_bind_intr_handler(struct pva_kmd_device *pva,
 pva_kmd_linux_device_get_data(pva);
 struct pva_kmd_isr_data *isr_data = &plat_data->isr[intr_line];
 struct nvpva_device_data *props = plat_data->pva_device_properties;
+enum pva_error pva_err = PVA_SUCCESS;
+int irq;
-isr_data->irq = platform_get_irq(props->pdev, intr_line);
+ASSERT(isr_data->binded == false);
+irq = platform_get_irq(props->pdev, intr_line);
+if (irq < 0) {
+pva_kmd_log_err("Failed to get irq number");
+pva_err = kernel_err2pva_err(irq);
+goto err_out;
+}
+isr_data->irq = irq;
 isr_data->handler = handler;
 isr_data->handler_data = data;
-isr_data->binded = true;
 isr_data->intr_line = intr_line;
 err = request_threaded_irq(isr_data->irq, NULL, pva_isr, IRQF_ONESHOT,
 "pva-isr", isr_data);
 if (err != 0) {
 pva_kmd_log_err("Failed to bind interrupt handler");
+pva_err = kernel_err2pva_err(err);
+goto err_out;
 }
-return kernel_err2pva_err(err);
+isr_data->binded = true;
+return PVA_SUCCESS;
+err_out:
+return pva_err;
 }
 void pva_kmd_enable_intr(struct pva_kmd_device *pva,
 enum pva_kmd_intr_line intr_line)
 {
 struct pva_kmd_isr_data *isr_data = get_isr(pva, intr_line);
+if (isr_data != NULL) {
 enable_irq(isr_data->irq);
+}
 }
-void pva_kmd_disable_intr(struct pva_kmd_device *pva,
+void pva_kmd_disable_intr_nosync(struct pva_kmd_device *pva,
 enum pva_kmd_intr_line intr_line)
 {
 struct pva_kmd_isr_data *isr_data = get_isr(pva, intr_line);
-disable_irq(isr_data->irq);
+if (isr_data != NULL) {
+disable_irq_nosync(isr_data->irq);
+}
 }
 void pva_kmd_free_intr(struct pva_kmd_device *pva,
 enum pva_kmd_intr_line intr_line)
 {
 struct pva_kmd_isr_data *isr_data = get_isr(pva, intr_line);
-free_irq(isr_data->irq, isr_data);
+ASSERT(isr_data != NULL);
+(void)free_irq(isr_data->irq, isr_data);
 isr_data->binded = false;
 }

View File

@@ -11,7 +11,12 @@
 void *pva_kmd_zalloc(uint64_t size)
 {
-return kvzalloc(size, GFP_KERNEL);
+void *ptr = kvzalloc(size, GFP_KERNEL);
+if (IS_ERR_OR_NULL(ptr)) {
+return NULL;
+}
+return ptr;
 }
 void pva_kmd_free(void *ptr)

View File

@@ -132,12 +132,16 @@ void pva_kmd_linux_device_smmu_contexts_init(struct pva_kmd_device *pva_device)
 sid_idx < safe_subu32(pva_device->hw_consts.n_smmu_contexts, 2U);
 sid_idx++) {
 uint32_t smmu_ctx_idx = safe_addu32(sid_idx, 1U);
-pva_device->stream_ids[smmu_ctx_idx] = g_smmu_ctxs[sid_idx].sid;
-device_data->smmu_contexts[smmu_ctx_idx] =
-g_smmu_ctxs[sid_idx].pdev;
-dma_set_mask_and_coherent(
-&device_data->smmu_contexts[smmu_ctx_idx]->dev,
+struct pva_kmd_linux_smmu_ctx *smmu_ctx = &g_smmu_ctxs[sid_idx];
+pva_device->stream_ids[smmu_ctx_idx] = smmu_ctx->sid;
+device_data->smmu_contexts[smmu_ctx_idx] = smmu_ctx->pdev;
+dma_set_mask_and_coherent(&smmu_ctx->pdev->dev,
 DMA_BIT_MASK(39));
+//set max segment size to UINT_MAX to avoid creating scatterlist >= 4GB
+//during IOVA mapping, which will overflow the scatterlist length field,
+//causing IOVA leak
+dma_set_max_seg_size(&smmu_ctx->pdev->dev, UINT_MAX);
 }
 /* Configure SMMU contexts for privileged operations */
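The new dma_set_max_seg_size() call above guards a 32-bit scatterlist length field: a segment of 4 GiB or more truncates when stored there, which is the IOVA-leak scenario the code comment describes. A small standalone illustration of the truncation:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t seg_len = 4ULL * 1024 * 1024 * 1024; /* a 4 GiB segment */
    uint32_t sg_length = (uint32_t)seg_len;       /* 32-bit length field */

    /* 4 GiB truncates to 0, so the recorded segment length no longer
     * covers the range that was actually mapped (per the comment above,
     * this is how IOVA space can leak). */
    printf("stored length: %u (from %llu bytes)\n",
           sg_length, (unsigned long long)seg_len);
    return 0;
}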

View File

@@ -6,7 +6,7 @@
#define PVA_NUM_ENGINES 2U #define PVA_NUM_ENGINES 2U
#define PVA_MAX_NUM_CCQ 8 #define PVA_MAX_NUM_CCQ 8
#define PVA_CCQ_DEPTH 8U #define PVA_CCQ_DEPTH 14U
#define PVA_USER_CCQ_BASE 1 #define PVA_USER_CCQ_BASE 1
#define PVA_INVALID_CCQ_ID 0xFF #define PVA_INVALID_CCQ_ID 0xFF
#define PVA_INVALID_ENGINE_ID 0xFFU #define PVA_INVALID_ENGINE_ID 0xFFU
@@ -138,4 +138,6 @@
#define PVA_KMD_CHIP_ID_T26X "GEN3" #define PVA_KMD_CHIP_ID_T26X "GEN3"
#define PVA_KMD_CHIP_ID_DEFAULT PVA_KMD_CHIP_ID_T23X #define PVA_KMD_CHIP_ID_DEFAULT PVA_KMD_CHIP_ID_T23X
#define PVA_KMD_TEST_MODE_ENV_VAR "PVA_TEST_MODE"
#endif // PVA_CONSTANTS_H #endif // PVA_CONSTANTS_H

View File

@@ -689,6 +689,21 @@ static inline uint32_t safe_wraparound_dec_u32(uint32_t counter)
 return result;
 }
+static inline uint32_t safe_wrap_add_u32(uint32_t a, uint32_t b)
+{
+return (uint32_t)(((uint64_t)a + (uint64_t)b) & 0xFFFFFFFFU);
+}
+static inline uint32_t safe_wrap_sub_u32(uint32_t a, uint32_t b)
+{
+return (uint32_t)(((uint64_t)a - (uint64_t)b) & 0xFFFFFFFFU);
+}
+static inline uint32_t safe_wrap_mul_u32(uint32_t a, uint32_t b)
+{
+return (uint32_t)(((uint64_t)a * (uint64_t)b) & 0xFFFFFFFFU);
+}
 #define SAT_ADD_DEFINE(a, b, name, type) \
 static inline type sat_add##name(type a, type b) \
 { \
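The new safe_wrap_* helpers do the arithmetic in 64 bits and mask back down, so wrap-around at 2^32 is explicit and well defined. A standalone example using the same pattern as safe_wrap_sub_u32 to difference two wrapping counters:

#include <stdint.h>
#include <stdio.h>

/* Same definition as safe_wrap_sub_u32 above: well-defined wrap at 2^32. */
static uint32_t wrap_sub_u32(uint32_t a, uint32_t b)
{
    return (uint32_t)(((uint64_t)a - (uint64_t)b) & 0xFFFFFFFFU);
}

int main(void)
{
    uint32_t old_count = 0xFFFFFFF0U; /* counter just before wrapping */
    uint32_t new_count = 0x00000010U; /* counter after wrapping       */

    /* The distance is still correct across the wrap: 32 increments. */
    printf("elapsed = %u\n", wrap_sub_u32(new_count, old_count));
    return 0;
}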

View File

@@ -0,0 +1,51 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#ifndef PVA_API_PRIVATE_H
#define PVA_API_PRIVATE_H
#include "pva_api.h"
//For legacy support not exposed by public API
#define PVA_CMD_FLAGS_USE_LEGACY_POINTER 0x1
struct pva_fw_vpu_legacy_ptr_symbol {
uint64_t base;
uint32_t offset;
uint32_t size;
};
enum pva_error_inject_codes {
PVA_ERR_INJECT_WDT_HW_ERR, // watchdog Hardware error
PVA_ERR_INJECT_WDT_TIMEOUT, // watchdog Timeout error
PVA_ERR_INJECT_VMEM_CLEAR, // vmem clear
PVA_ERR_INJECT_ASSERT_CHECK, // assert check
PVA_ERR_INJECT_ARMV7_EXCEPTION, // ARMv7 exception
};
struct pva_cmd_run_unit_tests {
#define PVA_CMD_OPCODE_RUN_UNIT_TESTS (PVA_CMD_OPCODE_MAX + 0U)
struct pva_cmd_header header;
#define PVA_FW_UTESTS_MAX_ARGC 16U
uint8_t argc;
uint8_t pad[3];
uint32_t in_resource_id;
uint32_t in_offset;
uint32_t in_size;
uint32_t out_resource_id;
uint32_t out_offset;
uint32_t out_size;
};
struct pva_cmd_err_inject {
#define PVA_CMD_OPCODE_ERR_INJECT (PVA_CMD_OPCODE_MAX + 1U)
struct pva_cmd_header header;
uint32_t err_inject_code; // enum pva_error_inject_codes
};
struct pva_cmd_gr_check {
#define PVA_CMD_OPCODE_GR_CHECK (PVA_CMD_OPCODE_MAX + 2U)
struct pva_cmd_header header;
};
#define PVA_CMD_OPCODE_COUNT (PVA_CMD_OPCODE_MAX + 3U)
#endif // PVA_API_PRIVATE_H