pva: mirror from gitlab cv/pva-sys-sw

Gitlab commit a307885246be7 ("umd: port intf tests to umd - ...")

Changes since last deployment:

- umd: port intf tests to umd - add NegativeTest_MaxContextCreate_Single...
- Remove VPU auth default macro
- kmd: Add a null check to fix static defect
- tests: Fix sync unregistration test
- fw: Handle DMA error when fetching chunk
- umd_tests: add requirements tests
- Fix error path of Linux KMD memory API
- Add kernel code style check script
- address review comments for access flag feat
- fix memory leak in access buffer tests
- kmd: use correct formatting
- fw: update license and use macro
- tests: add tests for buffer access type
- feat: respect buffer access flags
- Update deployment document
- Add a default fence wait timeout
- Fix error path of KMD sync ops submit
- Move recovery state check to pva_kmd_device_busy
- Fix error path of profiling level update
- Increase max CCQ FIFO depth to 14
- kmd: zero initialize all commands
- Make KMD robust against firmware abort
- Add multi user stress tests
- Conditional VMEM Clear Check
- Conditional VMEM Clear Check
- Fix static defects in KMD
- Reading auth for all the PVA devices
- Add support for VPU Device Tree authentication
- UMD: Add Null and range checks
- Remove assert and propogate error
- Add error injection tests
- Bug fix - 5207608
- Update CUPVA tests in CI to 2.7.0 rc3
- tests: add register_exec_noop_with_bit_flip
- fw: Fix static defects
- kmd: freeze PVA before freeing code/data memory
- Add missing test_mode parameter for run_test_plan
- deploy: allow deploying to different branches
- pva kmd: linux: handle test mode dt entry
- fw: baremetal: bound breaklink params
- coverity: Set expiry for code quality report
- kmd: Remove PVA_IS_DEBUG from native timeout calc
- Reduce iterations of long duration UMD tests
- Fix IOVA leak in Linux KMD
- fw:common: fix order of enable/disable virt
- umd_test: add missing requirement test specs
- umd_test: add test for perf spikes
- Fix nsight fence logging
- deploy: fix GVS build failure
- Add FSP Abort Hook
- Execution timeout
- Trace fences for NSIGHT
- Fix shared buffer handling during suspend/resume
- tests: add more tests for resource unregistration
- Add MODS test support
- KMD:Fix static defect
- umd: fix double free in cuextend
- umd: Free pva_memory object on free()
- Unify VPU and PPE syscall ID
- Clean up public API
- cuextend: Cleanup implementation
- cuextend: Add API to get stream payload
- compat: Fix missing flushes of event fences
- cuExtend: Unified code path for stream submit
- cuExtend: Implementation of cuExtend Stream Submit
- cuExtend: Stream submit API definitions
- cuExtend: Sync to new cuExtend header
- Set test mode default through macro
- fw: Add PPE error codes
- Use zalloc when allocating resource record
- Allocate Temporary Buffers for DMA Config Loading
- Fix fast reset failure test
- Add DMA config allocator
- kmd: Add unsafe API for block allocator
- Add missing warning for Linux kernel build
- Set err cmd idx to zero if there's no error
- ci: Run tests for MODS test mode
- Use 1K command buffer chunk size in MODS test mode
- Allow developer to provide its own target lease
- tests: add nvsci prefence_postfence_test
- kmd: Sha calculation static defects fix
- kmd: fix INT30-c static defect
- Fix command index logging for PVA_FW_EVENT_RUN_VPU
- Enable vpucfg_destroy_after_submit
- tests: add tests spec for deterministic test
- test: add cpu_signaller_pva_waiter_deterministic
- tests: add cpu_waiter_pva_signaller_deterministic
- Disable verbosity control of FW log
- Ops free API should accept NULL ptr
- Report TCM usage for t26x as well
- Support non-contiguous syncpoints
- umd: fix new top 25 CWE
- License header update
- L2SRAM flush command changes
- debugfs: disable tests for broken nodes
- debugfs: handle 0 input size for allowlist path
- Move pva_kmd_device_idle to context destroy
- Refactor interrupt handler binding in PVA KMD
- Fix DMA registration error path
- debugfs: Add read support for fw log level
- Add stress test suites to CI
- Fix error path for context init
- Add stress test suites
- umd: add NULL checks
- ci: Perf Test Updates
- ci: perf test updates
- Enable boot from GSC in L4T GVS
- Updating comment

Change-Id: I98be7ec270ba5f6fd5bc0978d084d731a88e70b6
Signed-off-by: abhinayaa <abhinayaa@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3348508
GVS: buildbot_gerritrpt <buildbot_gerritrpt@nvidia.com>
Reviewed-by: Mohnish Jain <mohnishj@nvidia.com>
Author:       abhinayaa <abhinayaa@nvidia.com>
Date:         2025-04-24 04:43:25 +00:00
Committed-by: Jon Hunter
Parent:       0ac4068c89
Commit:       63f6f2f159
69 changed files with 2205 additions and 1891 deletions

View File

@@ -22,6 +22,7 @@ pva_objs += \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_context.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_debugfs.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_device.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_devmem_pool.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_binding.o \
$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_validate.o \
@@ -70,6 +71,7 @@ pva_inc_flags += \
-I$(PVA_SYS_ABSDIR)/src/kmd/include \
-I$(PVA_SYS_ABSDIR)/src/kmd/linux/include \
-I$(PVA_SYS_ABSDIR)/src/libs/pva/include \
-I$(PVA_SYS_ABSDIR)/src/private_api \
pva_def_flags += \
-DPVA_BUILD_MODE=PVA_BUILD_MODE_L4T \

View File

@@ -224,6 +224,11 @@
#define PVA_ABORT_NOC_BIST (0xfcU)
//! @endcond
/**
* @brief Minor code for abort in case of FSP abort.
*/
#define PVA_ABORT_FSP 0x42U
/** @} */
/**
@@ -299,4 +304,36 @@
#define PVA_ABORT_IRQ_TEST_HOST (0xE002U)
#endif
/** @} */
/**
* @defgroup PVA_ABORT_ARGUMENTS_FSP Argument to pva_abort() from FSP abort
* @ingroup PVA_ABORT_ARGUMENTS
* @{
*/
/**
* @brief Minor Code for FSP aborts because of safertos errors
*/
#define PVA_ABORT_FSP_SAFERTOS (0xE001U)
/**
* @brief Minor Code for FSP aborts because of asserts in fsp
*/
#define PVA_ABORT_FSP_ASSERT (0xE002U)
/**
* @brief Minor Code for FSP aborts because of exception in fsp
*/
#define PVA_ABORT_FSP_EXCEPTION (0xE003U)
/**
* @brief Minor Code for FSP aborts because of stack guard failure
*/
#define PVA_ABORT_FSP_STACK (0xE004U)
/**
* @brief Minor Code for Unknown FSP aborts
*/
#define PVA_ABORT_FSP_UNKNOWN (0xE005U)
/** @} */
#endif

View File

@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright (c) 2016-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
/* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#ifndef PVA_ERRORS_H
#define PVA_ERRORS_H
@@ -154,15 +154,6 @@ typedef uint16_t pva_errors_t;
*/
//! @cond DISABLE_DOCUMENTATION
/**
* @brief Error in case of Floating point NAN.
*/
#define PVA_ERR_PPE_DIVIDE_BY_0 (0x34U)
/**
* @brief Error in case of Floating point NAN.
*/
#define PVA_ERR_PPE_ILLEGAL_DEBUG (0x36U)
#define PVA_ERR_PPE_ILLEGAL_INSTR_ALIGN (0x37U)
/**
@@ -270,40 +261,6 @@ typedef uint16_t pva_errors_t;
* more than HW Seq RAM size.
*/
#define PVA_ERR_DMA_HWSEQ_PROGRAM_TOO_LONG (0x217U)
/** @} */
/**
* @defgroup PVA_MISR_ERRORS
*
* @brief MISR error codes used across PVA.
* @{
*/
/**
* @brief Error status when DMA MISR test is not run.
*/
#define PVA_ERR_MISR_NOT_RUN (0x280U)
/**
* @brief Error status when DMA MISR test did not complete.
*/
#define PVA_ERR_MISR_NOT_DONE (0x281U)
/**
* @brief Error status when DMA MISR test timed out.
*/
#define PVA_ERR_MISR_TIMEOUT (0x282U)
/**
* @brief Error status in case of DMA MISR test address failure.
*/
#define PVA_ERR_MISR_ADDR (0x283U)
/**
* @brief Error status in case of DMA MISR test data failure.
*/
#define PVA_ERR_MISR_DATA (0x284U)
/**
* @brief Error status in case of DMA MISR test data and address failure.
*/
#define PVA_ERR_MISR_ADDR_DATA (0x285U)
/** @} */
/**
* @defgroup PVA_VPU_ISR_ERRORS
*

View File

@@ -6,150 +6,4 @@
#include <stdint.h>
/**
 * @brief Write syscall parameter will be a pointer to this struct
* @{
*/
typedef union {
struct {
uint32_t addr;
uint32_t size;
} in;
struct {
uint32_t written_size;
} out;
} pva_fw_pe_syscall_write;
/** @} */
/**
* @defgroup PVA_VPU_SYSCALL_PERFMON_SAMPLE_PARAM_GROUP
*
* @brief Parameter specification for syscall perfmon_sample
*
* @{
*/
/**
* @brief Perfmon sample syscall parameter will be a pointer to this struct
*/
typedef struct {
/** counter_mask[0] is for ID: 0-31; counter_mask[1] is for ID: 32-63 */
uint32_t counter_mask[2];
uint32_t output_addr;
} pva_fw_pe_syscall_perfmon_sample;
/**
* @brief Index for t26x performance counters for VPU
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T26X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T26X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T26X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T26X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T26X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T26X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T26X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T26X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T26X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T26X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T26X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T26X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T26X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T26X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T26X (19U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_FETCH_REQ_T26X (20U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X (21U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREEMPT_T26X (22U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_LINES_T26X (23U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_DUR_T26X (24U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T26X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T26X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T26X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T26X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T26X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T26X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T26X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T26X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T26X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T26X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T26X (36U)
/**
* @brief Index for t23x performance counters
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T23X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T23X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T23X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T23X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T23X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T23X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T23X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T23X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T23X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T23X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T23X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T23X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T23X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T23X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T23X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T23X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T23X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T23X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T23X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T23X (19U)
#define PERFMON_COUNTER_ID_ICACHE_FETCH_REQ_T23X (20U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_T23X (21U)
#define PERFMON_COUNTER_ID_ICACHE_PREEMP_T23X (22U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_LINES_T23X (23U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_DUR_T23X (24U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_DUR_T23X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T23X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T23X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T23X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T23X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T23X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T23X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T23X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T23X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T23X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T23X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T23X (36U)
/**
* @brief Index for t26x performance counters for PPE
*/
#define PERFMON_COUNTER_ID_PPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_PPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_LD_DEPENDENCY_T26X (5U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_ST_DEPENDENCY_T26X (6U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_DEPENDENCY_T26X (7U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STRM_STORE_FLUSH_T26X (8U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_STORE_FLUSH_T26X (9U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STORE_FLUSH_T26X (10U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_LD_T26X (11U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_ST_T26X (12U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_T26X (13U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LD_T26X (14U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_ST_T26X (15U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LDST_T26X (16U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_PUSHBACK_T26X (17U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STQ_PUSHBACK_T26X (18U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_FLUSH_T26X (19U)
#define PERFMON_COUNTER_ID_PPS_WFE_GPI_EX_STATE_T26X (20U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_FETCH_REQ_T26X (21U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_T26X (22U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREEMPT_T26X (23U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_LINES_T26X (24U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_DUR_T26X (26U)
/** @} */
#endif /*PVA_VPU_SYSCALL_INTERFACE_H*/

View File

@@ -11,6 +11,138 @@
/* The sizes of these structs must be explicitly padded to align to 4 bytes */
#define PVA_CMD_PRIV_OPCODE_FLAG (1U << 7U)
#define PVA_RESOURCE_ID_BASE 1U
struct pva_resource_entry {
uint8_t access_flags : 2; // 1: RO, 2: WO, 3: RW
uint8_t reserved : 4;
#define PVA_RESOURCE_TYPE_INVALID 0U
#define PVA_RESOURCE_TYPE_DRAM 1U
#define PVA_RESOURCE_TYPE_EXEC_BIN 2U
#define PVA_RESOURCE_TYPE_DMA_CONFIG 3U
uint8_t type : 2;
uint8_t smmu_context_id;
uint8_t addr_hi;
uint8_t size_hi;
uint32_t addr_lo;
uint32_t size_lo;
};
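/*
 * Illustrative sketch (not part of the diff): one way a 40-bit IOVA and size
 * could be split into the hi/lo fields of pva_resource_entry. The helper name
 * is hypothetical; the access-flag value follows the encoding in the comment
 * above (1: RO, 2: WO, 3: RW).
 */
static inline void fill_resource_entry_sketch(struct pva_resource_entry *e,
					      uint64_t iova, uint64_t size,
					      uint8_t smmu_ctx_id)
{
	e->access_flags = 1U;                   /* read-only, per "1: RO" */
	e->type = PVA_RESOURCE_TYPE_DRAM;
	e->smmu_context_id = smmu_ctx_id;
	e->addr_hi = (uint8_t)(iova >> 32U);    /* bits 39:32 */
	e->addr_lo = (uint32_t)(iova & 0xFFFFFFFFU); /* bits 31:0 */
	e->size_hi = (uint8_t)(size >> 32U);
	e->size_lo = (uint32_t)(size & 0xFFFFFFFFU);
}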
struct pva_cmd_init_resource_table {
#define PVA_CMD_OPCODE_INIT_RESOURCE_TABLE (0U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
/**< Resource table id is from 0 to 7, 0 is the device's resource table,
* 1-7 are users'. */
uint8_t resource_table_id;
uint8_t resource_table_addr_hi;
uint8_t pad[2];
uint32_t resource_table_addr_lo;
uint32_t max_n_entries;
};
struct pva_cmd_deinit_resource_table {
#define PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE (1U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
};
struct pva_cmd_update_resource_table {
#define PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE (2U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
uint32_t resource_id;
struct pva_resource_entry entry;
};
struct pva_cmd_init_queue {
#define PVA_CMD_OPCODE_INIT_QUEUE (3U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t queue_addr_hi;
uint8_t syncpt_addr_hi;
uint32_t queue_addr_lo;
uint32_t max_n_submits;
uint32_t syncpt_addr_lo;
uint32_t syncpt_id;
};
struct pva_cmd_deinit_queue {
#define PVA_CMD_OPCODE_DEINIT_QUEUE (4U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t pad[2];
};
struct pva_cmd_enable_fw_profiling {
#define PVA_CMD_OPCODE_ENABLE_FW_PROFILING (5U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t timestamp_type;
uint8_t pad[3];
uint32_t filter;
};
struct pva_cmd_disable_fw_profiling {
#define PVA_CMD_OPCODE_DISABLE_FW_PROFILING (6U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_get_tegra_stats {
#define PVA_CMD_OPCODE_GET_TEGRA_STATS (7U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t buffer_offset_hi;
bool enabled;
uint8_t pad[2];
uint32_t buffer_resource_id;
uint32_t buffer_size;
uint32_t buffer_offset_lo;
};
struct pva_cmd_suspend_fw {
#define PVA_CMD_OPCODE_SUSPEND_FW (8U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_resume_fw {
#define PVA_CMD_OPCODE_RESUME_FW (9U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_init_shared_dram_buffer {
#define PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER (10U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t buffer_iova_hi;
uint8_t pad[2];
uint32_t buffer_iova_lo;
uint32_t buffer_size;
};
struct pva_cmd_deinit_shared_dram_buffer {
#define PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER \
(11U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t pad[3];
};
struct pva_cmd_set_debug_log_level {
#define PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL (12U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t log_level;
};
struct pva_cmd_set_profiling_level {
#define PVA_CMD_OPCODE_SET_PROFILING_LEVEL (13U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t level;
};
#define PVA_CMD_PRIV_OPCODE_COUNT 14U
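/*
 * Illustrative sketch (not part of the diff): the MSB of the opcode marks a
 * privileged command, so a dispatcher can separate privileged from regular
 * opcodes with a single bit test. The function name is hypothetical.
 */
static inline bool pva_opcode_is_privileged_sketch(uint8_t opcode)
{
	/* PVA_CMD_PRIV_OPCODE_FLAG is (1U << 7U), i.e. the opcode MSB */
	return (opcode & PVA_CMD_PRIV_OPCODE_FLAG) != 0U;
}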
struct pva_fw_prefence {
uint8_t offset_hi;
uint8_t pad0[3];
@@ -301,7 +433,8 @@ struct pva_fw_shared_buffer_header {
struct pva_kmd_fw_buffer_msg_header {
#define PVA_KMD_FW_BUF_MSG_TYPE_FW_EVENT 0
#define PVA_KMD_FW_BUF_MSG_TYPE_VPU_TRACE 1
#define PVA_KMD_FW_BUF_MSG_TYPE_RES_UNREG 2
#define PVA_KMD_FW_BUF_MSG_TYPE_FENCE_TRACE 2
#define PVA_KMD_FW_BUF_MSG_TYPE_RES_UNREG 3
uint32_t type : 8;
// Size of payload in bytes. Includes the size of the header.
uint32_t size : 24;
@@ -334,6 +467,27 @@ struct pva_kmd_fw_msg_vpu_trace {
uint64_t submit_id;
};
struct pva_kmd_fw_msg_fence_trace {
uint64_t submit_id;
uint64_t timestamp;
// For syncpt fences, fence_id is the syncpt index
// For semaphore fences, fence_id is the serial ID of the semaphore NvRM memory
uint64_t fence_id;
// 'offset' is the offset into the semaphore memory where the value is stored
// This is only valid for semaphore fences
// Note: Trace APIs in KMD only support 32-bit offset
uint32_t offset;
uint32_t value;
uint8_t ccq_id;
uint8_t queue_id;
#define PVA_KMD_FW_BUF_MSG_FENCE_ACTION_WAIT 0U
#define PVA_KMD_FW_BUF_MSG_FENCE_ACTION_SIGNAL 1U
uint8_t action;
#define PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SYNCPT 0U
#define PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SEMAPHORE 1U
uint8_t type;
};
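/*
 * Illustrative sketch (not part of the diff): walking the KMD/FW shared
 * buffer and dispatching on the new FENCE_TRACE message type. The reader
 * helper and the byte-stream layout assumption (payload immediately follows
 * its header, header->size covering both) are hypothetical.
 */
static void consume_fw_messages_sketch(const uint8_t *buf, uint32_t len)
{
	uint32_t off = 0U;

	while (off + sizeof(struct pva_kmd_fw_buffer_msg_header) <= len) {
		const struct pva_kmd_fw_buffer_msg_header *hdr =
			(const void *)(buf + off);

		if (hdr->size == 0U)
			break; /* malformed message; stop parsing */

		if (hdr->type == PVA_KMD_FW_BUF_MSG_TYPE_FENCE_TRACE) {
			const struct pva_kmd_fw_msg_fence_trace *ft =
				(const void *)(hdr + 1);
			/* e.g. forward ft->submit_id, ft->timestamp and
			 * ft->action (WAIT/SIGNAL) to a trace backend */
			(void)ft;
		}
		off += hdr->size; /* size includes the header itself */
	}
}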
// Resource unregister message
struct pva_kmd_fw_msg_res_unreg {
uint32_t resource_id;
@@ -345,4 +499,11 @@ struct pva_kmd_fw_tegrastats {
uint64_t total_utilization[PVA_NUM_PVE];
};
#define PVA_MAX_CMDBUF_CHUNK_LEN 1024
#define PVA_MAX_CMDBUF_CHUNK_SIZE (sizeof(uint32_t) * PVA_MAX_CMDBUF_CHUNK_LEN)
#define PVA_TEST_MODE_MAX_CMDBUF_CHUNK_LEN 256
#define PVA_TEST_MODE_MAX_CMDBUF_CHUNK_SIZE \
(sizeof(uint32_t) * PVA_TEST_MODE_MAX_CMDBUF_CHUNK_LEN)
#endif // PVA_FW_H

View File

@@ -49,7 +49,9 @@
* | 23-21 | Reserved | Reserved for future use |
* | 20 | CG DISABLE | To indicate the PVA R5 FW should disable the clock gating feature |
* | 19 | VMEM RD WAR DISABLE | To disable the VMEM Read fail workaround feature |
* | 18-16 | Reserved | Reserved for future use |
* | 18 | TEST_MODE_ENABLE | To enter test mode. See Documentation. |
* | 17 | USE_XBAR_RAW | Reserved for future use |
* | 16 | Reserved | Reserved for future use |
*
* The table below shows the mapping which is sent by FW to KMD
*
@@ -72,11 +74,20 @@
#define PVA_BOOT_SEMA_CG_DISABLE PVA_BIT(20U)
//! @cond DISABLE_DOCUMENTATION
/** Tell firmware to enter test mode */
#define PVA_BOOT_SEMA_TEST_MODE_ENABLE PVA_BIT(18U)
/** Tell firmware that block linear surfaces are in XBAR_RAW format instead of
* TEGRA_RAW format */
#define PVA_BOOT_SEMA_USE_XBAR_RAW PVA_BIT(17U)
/** Tell firmware to enable test mode */
#define PVA_BOOT_SEMA_TEST_MODE PVA_BIT(16U)
#define PVA_BOOT_SEMA 0U
#define PVA_RO_SYNC_BASE_SEMA 1U
#define PVA_RW_SYNC_BASE_SEMA 2U
#define PVA_RW_SYNC_SIZE_SEMA 3U
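/*
 * Illustrative sketch (not part of the diff): composing the boot semaphore
 * word using the bit definitions above. The function and parameter names are
 * hypothetical.
 */
static inline uint32_t make_boot_flags_sketch(int test_mode, int xbar_raw)
{
	uint32_t flags = 0U;

	if (test_mode != 0)
		flags |= PVA_BOOT_SEMA_TEST_MODE_ENABLE; /* bit 18 */
	if (xbar_raw != 0)
		flags |= PVA_BOOT_SEMA_USE_XBAR_RAW;     /* bit 17 */

	/* The composed word corresponds to shared semaphore index
	 * PVA_BOOT_SEMA (0) in the KMD-to-FW mapping documented above. */
	return flags;
}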
/**
* @brief This macro has the value to be set by KMD in the shared semaphores

View File

@@ -62,8 +62,10 @@ struct pva_fw_dma_slot {
* to block linear surface. */
#define PVA_FW_DMA_SLOT_FLAG_CB (1u << 4u)
#define PVA_FW_DMA_SLOT_FLAG_BOUND (1u << 5u)
uint8_t flags;
uint8_t pad;
#define PVA_FW_DMA_SLOT_FLAG_MASKED (1u << 6u)
#define PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB 7u
#define PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB 8u
uint16_t flags;
/** Bitmask of channels that use this slot */
uint16_t ch_use_mask;
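/*
 * Illustrative sketch (not part of the diff): with flags widened to 16 bits,
 * the 2-bit access field occupies bits 7..8 (ACCESS_LSB/ACCESS_MSB). The
 * helper name is hypothetical.
 */
static inline uint16_t pva_fw_dma_slot_get_access_sketch(uint16_t flags)
{
	uint16_t width = (PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB -
			  PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB) + 1u; /* 2 bits */
	uint16_t mask = (uint16_t)(((1u << width) - 1u)
				   << PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);

	return (uint16_t)((flags & mask) >> PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);
}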

View File

@@ -16,6 +16,7 @@ extern "C" {
/* Core APIs */
#define PVA_MAX_NUM_RESOURCES_PER_CONTEXT (16U * 1024U)
/**
* @brief Create a PVA context.
*
@@ -37,6 +38,18 @@ enum pva_error pva_context_create(uint32_t pva_index,
*/
void pva_context_destroy(struct pva_context *ctx);
/**
* @brief Get the value of a context attribute.
*
* @param[in] ctx Pointer to the context.
* @param[in] attr Attribute to get.
* @param[out] out_value Pointer to the value of the attribute.
 * @param[in] size Size of the attribute structure, in bytes.
*/
enum pva_error pva_get_attribute(struct pva_context *ctx, enum pva_attr attr,
void *out_value, uint64_t size);
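/*
 * Illustrative usage sketch (not part of the diff), assuming the attribute
 * key PVA_CONTEXT_ATTR_MAX_CMDBUF_CHUNK_SIZE and the struct
 * pva_ctx_attr_max_cmdbuf_chunk_size introduced in pva_api_types.h later in
 * this change; use_chunk_size() is a hypothetical consumer.
 *
 *   struct pva_ctx_attr_max_cmdbuf_chunk_size chunk = { 0 };
 *   enum pva_error err;
 *
 *   err = pva_get_attribute(ctx, PVA_CONTEXT_ATTR_MAX_CMDBUF_CHUNK_SIZE,
 *                           &chunk, sizeof(chunk));
 *   if (err == PVA_SUCCESS)
 *       use_chunk_size(chunk.max_size);
 */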
#define PVA_MAX_NUM_SUBMISSIONS_PER_QUEUE (8U * 1024U)
/**
* @brief Create a PVA queue.
*
@@ -97,7 +110,7 @@ void pva_memory_free(struct pva_memory *mem);
* @param[in] ctx Pointer to the context.
* @param[in] syncpiont_id Syncpoint ID to wait on.
* @param[in] value Value to wait for.
* @param[in] timeout_us Timeout in microseconds. PVA_TIMEOUT_INF for infinite.
* @param[in] timeout_us Timeout in microseconds. PVA_SUBMIT_TIMEOUT_INF for infinite.
*/
enum pva_error pva_syncpoint_wait(struct pva_context *ctx,
uint32_t syncpiont_id, uint32_t value,
@@ -109,7 +122,7 @@ enum pva_error pva_syncpoint_wait(struct pva_context *ctx,
* @param[in] queue Pointer to the queue.
* @param[in] submit_infos Array of submit info structures.
* @param[in] count Number of submit info structures.
* @param[in] timeout_us Timeout in microseconds. PVA_TIMEOUT_INF for infinite.
* @param[in] timeout_us Timeout in microseconds. PVA_SUBMIT_TIMEOUT_INF for infinite.
*
* @note Concurrent submission to the same queue needs to be serialized by the
* caller.
@@ -206,26 +219,6 @@ enum pva_error pva_memory_import_id_destroy(uint64_t import_id);
/** \brief Specifies the PVA system software minor version. */
#define PVA_SYSSW_MINOR_VERSION (7U)
/**
* @brief Get PVA system software version.
*
* PVA system software version is defined as the latest version of cuPVA which is fully supported
* by this version of the PVA system software.
*
* @param[out] version version of currently running system SW, computed as:
(PVA_SYSSW_MAJOR_VERSION * 1000) + PVA_SYSSW_MINOR_VERSION
* @return PVA_SUCCESS on success, else error code indicating the failure.
*/
enum pva_error pva_get_version(uint32_t *version);
/**
* @brief Get the hardware characteristics of the PVA.
*
* @param[out] pva_hw_char Pointer to the hardware characteristics.
*/
enum pva_error
pva_get_hw_characteristics(struct pva_characteristics *pva_hw_char);
#ifdef __cplusplus
}
#endif

View File

@@ -5,13 +5,9 @@
#define PVA_API_CMDBUF_H
#include "pva_api_types.h"
//Maximum number of slots for maintaining Timestamps
#define PVA_MAX_QUERY_SLOTS_COUNT 32U
/** The common header for all commands.
*/
struct pva_cmd_header {
#define PVA_CMD_PRIV_OPCODE_FLAG (1U << 7U)
/** Opcode for the command. MSB of opcode indicates whether this command is
* privileged or not */
uint8_t opcode;
@@ -35,6 +31,26 @@ struct pva_cmd_header {
uint8_t len;
};
struct pva_dma_misr_config {
#define PVA_DMA_FLAG_MISR_ENABLE 1u
uint8_t enabled;
uint8_t reserved;
uint16_t channel_mask;
uint32_t seed_crc0;
uint32_t seed_crc1;
uint32_t ref_addr;
uint32_t ref_data_1;
uint32_t ref_data_2;
uint32_t misr_timeout;
};
struct pva_dma_misr {
uint32_t slot_mask_low0;
uint32_t slot_mask_low1;
uint32_t slot_mask_high;
struct pva_dma_misr_config misr_config;
};
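/*
 * Illustrative sketch (not part of the diff): filling the MISR parameters
 * that the new setup_misr command (PVA_CMD_OPCODE_SETUP_MISR below) carries.
 * All concrete values are placeholders and the helper name is hypothetical.
 */
static inline void fill_misr_sketch(struct pva_dma_misr *m)
{
	m->slot_mask_low0 = 0x1U;   /* placeholder: slot 0 only */
	m->slot_mask_low1 = 0U;
	m->slot_mask_high = 0U;
	m->misr_config.enabled = PVA_DMA_FLAG_MISR_ENABLE;
	m->misr_config.reserved = 0U;
	m->misr_config.channel_mask = 0x1U; /* placeholder: channel 0 */
	m->misr_config.seed_crc0 = 0xFFFFFFFFU;
	m->misr_config.seed_crc1 = 0xFFFFFFFFU;
	m->misr_config.ref_addr = 0U;
	m->misr_config.ref_data_1 = 0U;
	m->misr_config.ref_data_2 = 0U;
	m->misr_config.misr_timeout = 1000U;
}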
struct pva_user_dma_allowance {
#define PVA_USER_DMA_ALLOWANCE_ADB_STEP_SIZE 8
uint32_t channel_idx : 4;
@@ -189,11 +205,6 @@ struct pva_cmd_set_vpu_parameter_with_buffer {
uint32_t src_dram_offset_lo;
};
/** For set_vpu_parameter_with_address command, set this flag in header.flags to
* indicate that the target symbol is the legacy pointer symbol type:
* pva_fw_vpu_legacy_ptr_symbol, which only supports 32bit offset and 32bit
* size. */
#define PVA_CMD_FLAGS_USE_LEGACY_POINTER 0x1
/** Copy the address of a DRAM buffer to a VPU variable. The variable must be
* laid out exactly according to pva_fw_vpu_ptr_symbol
*/
@@ -208,7 +219,6 @@ struct pva_cmd_set_vpu_parameter_with_address {
};
#define PVA_MAX_DMA_SETS_PER_DMA_ENGINE 4
#define PVA_DMA_CONFIG_FETCH_BUFFER_PER_DMA_ENGINE 1
/** This command first acquires the TCM scratch and then fetches DMA configuration
* into the scratch. The command does not modify DMA
@@ -291,17 +301,7 @@ struct pva_cmd_run_ppe {
uint32_t entry_point_index;
};
#define PVA_BARRIER_GROUP_0 0U
#define PVA_BARRIER_GROUP_1 1U
#define PVA_BARRIER_GROUP_2 2U
#define PVA_BARRIER_GROUP_3 3U
#define PVA_BARRIER_GROUP_4 4U
#define PVA_BARRIER_GROUP_5 5U
#define PVA_BARRIER_GROUP_6 6U
#define PVA_BARRIER_GROUP_7 7U
#define PVA_MAX_BARRIER_GROUPS 8U
#define PVA_BARRIER_GROUP_INVALID 0xFFU
/**
@@ -464,29 +464,15 @@ struct pva_cmd_set_vpu_instance_parameter {
uint32_t symbol_id;
};
struct pva_cmd_run_unit_tests {
#define PVA_CMD_OPCODE_RUN_UNIT_TESTS 30U
struct pva_cmd_set_vpu_print_buffer {
#define PVA_CMD_OPCODE_SET_VPU_PRINT_BUFFER 30U
struct pva_cmd_header header;
#define PVA_FW_UTESTS_MAX_ARGC 16U
uint8_t argc;
uint8_t pad[3];
uint32_t in_resource_id;
uint32_t in_offset;
uint32_t in_size;
uint32_t out_resource_id;
uint32_t out_offset;
uint32_t out_size;
};
struct pva_cmd_set_vpu_print_cb {
#define PVA_CMD_OPCODE_SET_VPU_PRINT_CB 31U
struct pva_cmd_header header;
uint32_t cb_resource_id;
uint32_t cb_offset;
uint32_t resource_id;
uint32_t offset;
};
struct pva_cmd_invalidate_l2sram {
#define PVA_CMD_OPCODE_INVALIDATE_L2SRAM 32U
#define PVA_CMD_OPCODE_INVALIDATE_L2SRAM 31U
struct pva_cmd_header header;
uint8_t dram_offset_hi;
uint8_t pad[3];
@@ -496,19 +482,18 @@ struct pva_cmd_invalidate_l2sram {
};
struct pva_cmd_flush_l2sram {
#define PVA_CMD_OPCODE_FLUSH_L2SRAM 33U
#define PVA_CMD_OPCODE_FLUSH_L2SRAM 32U
struct pva_cmd_header header;
uint8_t dram_offset_hi;
uint8_t pad[3];
uint32_t dram_resource_id;
uint32_t dram_offset_lo;
uint32_t l2sram_size;
struct pva_user_dma_allowance user_dma;
};
struct pva_cmd_err_inject {
#define PVA_CMD_OPCODE_ERR_INJECT 34U
struct pva_cmd_header header;
enum pva_error_inject_codes err_inject_code;
};
struct pva_cmd_patch_l2sram_offset {
#define PVA_CMD_OPCODE_PATCH_L2SRAM_OFFSET 35U
#define PVA_CMD_OPCODE_PATCH_L2SRAM_OFFSET 33U
struct pva_cmd_header header;
uint8_t dma_set_id;
uint8_t slot_id;
@@ -520,130 +505,16 @@ struct pva_cmd_patch_l2sram_offset {
* mapped to a new logical barrier group. This allows re-using barrier ids within a command buffer.
*/
struct pva_cmd_retire_barrier_group {
#define PVA_CMD_OPCODE_RETIRE_BARRIER_GROUP 36U
#define PVA_CMD_OPCODE_RETIRE_BARRIER_GROUP 34U
struct pva_cmd_header header;
};
struct pva_cmd_gr_check {
#define PVA_CMD_OPCODE_GR_CHECK 37U
struct pva_cmd_setup_misr {
#define PVA_CMD_OPCODE_SETUP_MISR 35U
struct pva_cmd_header header;
struct pva_dma_misr misr_params;
};
#define PVA_CMD_OPCODE_COUNT 38U
struct pva_cmd_init_resource_table {
#define PVA_CMD_OPCODE_INIT_RESOURCE_TABLE (0U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
/**< Resource table id is from 0 to 7, 0 is the device's resource table,
* 1-7 are users'. */
uint8_t resource_table_id;
uint8_t resource_table_addr_hi;
uint8_t pad[2];
uint32_t resource_table_addr_lo;
uint32_t max_n_entries;
};
struct pva_cmd_deinit_resource_table {
#define PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE (1U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
};
struct pva_cmd_update_resource_table {
#define PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE (2U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t resource_table_id;
uint8_t pad[3];
uint32_t resource_id;
struct pva_resource_entry entry;
};
struct pva_cmd_init_queue {
#define PVA_CMD_OPCODE_INIT_QUEUE (3U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t queue_addr_hi;
uint8_t pad;
uint32_t queue_addr_lo;
uint32_t max_n_submits;
};
struct pva_cmd_deinit_queue {
#define PVA_CMD_OPCODE_DEINIT_QUEUE (4U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t ccq_id;
uint8_t queue_id;
uint8_t pad[2];
};
struct pva_cmd_enable_fw_profiling {
#define PVA_CMD_OPCODE_ENABLE_FW_PROFILING (5U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t timestamp_type;
uint8_t pad[3];
uint32_t filter;
};
struct pva_cmd_disable_fw_profiling {
#define PVA_CMD_OPCODE_DISABLE_FW_PROFILING (6U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_get_tegra_stats {
#define PVA_CMD_OPCODE_GET_TEGRA_STATS (7U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t buffer_offset_hi;
bool enabled;
uint8_t pad[2];
uint32_t buffer_resource_id;
uint32_t buffer_size;
uint32_t buffer_offset_lo;
};
struct pva_cmd_suspend_fw {
#define PVA_CMD_OPCODE_SUSPEND_FW (8U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_resume_fw {
#define PVA_CMD_OPCODE_RESUME_FW (9U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
};
struct pva_cmd_init_shared_dram_buffer {
#define PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER (10U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t buffer_iova_hi;
uint8_t pad[2];
uint32_t buffer_iova_lo;
uint32_t buffer_size;
};
struct pva_cmd_deinit_shared_dram_buffer {
#define PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER \
(11U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint8_t interface;
uint8_t pad[3];
};
struct pva_cmd_set_debug_log_level {
#define PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL (12U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t log_level;
};
struct pva_cmd_set_profiling_level {
#define PVA_CMD_OPCODE_SET_PROFILING_LEVEL (13U | PVA_CMD_PRIV_OPCODE_FLAG)
struct pva_cmd_header header;
uint32_t level;
};
#define PVA_CMD_PRIV_OPCODE_COUNT 14U
#define PVA_MAX_CMDBUF_CHUNK_LEN 1024
#define PVA_MAX_CMDBUF_CHUNK_SIZE (sizeof(uint32_t) * PVA_MAX_CMDBUF_CHUNK_LEN)
#define PVA_CMD_OPCODE_MAX 36U
#endif // PVA_API_CMDBUF_H

View File

@@ -11,14 +11,14 @@ extern "C" {
#include "cuda.h"
#include "pva_api_types.h"
/**
* @brief Structure for cuExtend queue data needed for command submission.
*/
struct pva_cuextend_queue_data {
/*! Holds a pointer to pva queue object */
struct pva_queue *queue;
/*! Holds engine affinity for command submission*/
uint32_t affinity;
#define PVA_CUEXTEND_MAX_NUM_PREFENCES 16
#define PVA_CUEXTEND_MAX_NUM_POSTFENCES 16
struct pva_cuextend_submit_events {
struct pva_fence prefences[PVA_CUEXTEND_MAX_NUM_PREFENCES];
struct pva_fence postfences[PVA_CUEXTEND_MAX_NUM_POSTFENCES];
uint32_t num_prefences;
uint32_t num_postfences;
};
/**
@@ -71,27 +71,16 @@ typedef enum pva_error (*pva_cuextend_stream_unregister)(void *callback_args,
uint64_t flags);
/**
* @brief Function type for cuExtend acquire queue callback.
* @brief Function type for submitting a batch of command buffers via a CUDA stream.
*
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
* @param[out] queue_data Output pointer to a pva_cuextend_queue_data object.
* @return \ref pva_error The completion status of acquire queue operation.
* @param[in] submit_payload Pointer to the submit payload.
* @return \ref pva_error The completion status of the submit operation.
*/
typedef enum pva_error (*pva_cuextend_queue_acquire)(
void *callback_args, void *stream_payload,
struct pva_cuextend_queue_data **queue_data);
/**
* @brief Function type for cuExtend release queue callback.
*
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
* @return \ref pva_error The completion status of release queue operation.
*/
typedef enum pva_error (*pva_cuextend_queue_release)(void *callback_args,
void *stream_payload,
void *queue_data);
typedef enum pva_error (*pva_cuextend_stream_submit)(
void *callback_args, void *stream_payload, void *submit_payload,
struct pva_cuextend_submit_events *submit_events);
/**
* @brief Function type for retrieving error code from cuExtend.
@@ -128,12 +117,10 @@ struct pva_cuextend_callbacks {
pva_cuextend_stream_register stream_reg;
/*! Holds the unregister stream callback */
pva_cuextend_stream_unregister stream_unreg;
/*! Holds the acquire queue callback */
pva_cuextend_queue_acquire queue_acquire;
/*! Holds the release queue callback */
pva_cuextend_queue_release queue_release;
/*! Holds the teardown callback */
pva_cuextend_teardown teardown;
/*! Holds the stream submit callback */
pva_cuextend_stream_submit stream_submit;
/*! Pointer to the callback arguments provided by client during cuExtend initialization */
void *args;
};
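/*
 * Illustrative sketch (not part of the diff): a client wiring up the new
 * stream_submit callback in place of the removed queue_acquire/queue_release
 * pair. client_stream_submit, client_stream_register, client_stream_unregister,
 * client_teardown and client_args are all hypothetical.
 *
 *   static enum pva_error client_stream_submit(void *callback_args,
 *                                              void *stream_payload,
 *                                              void *submit_payload,
 *                                              struct pva_cuextend_submit_events *ev)
 *   {
 *       // translate submit_payload into PVA submissions, honoring
 *       // ev->prefences / ev->postfences
 *       return PVA_SUCCESS;
 *   }
 *
 *   struct pva_cuextend_callbacks cbs = {
 *       .stream_reg = client_stream_register,
 *       .stream_unreg = client_stream_unregister,
 *       .stream_submit = client_stream_submit,
 *       .teardown = client_teardown,
 *       .args = client_args,
 *   };
 */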
@@ -188,22 +175,32 @@ enum pva_error pva_cuextend_memory_import(struct pva_context *ctx,
/**
* @brief Submit a batch of command buffers via a CUDA stream.
*
* @param[in] queue Pointer to the queue. If queue is not NULL, this API will try to submit the client tasks to this queue directly.
* Otherwise, it will call queue_acquire callback to query a pva_queue object from stream payload, and then submit
* the tasks to the queried queue.
* @param[in] stream A CUDA stream.
* @param[in] submit_infos Array of submit info structures.
* @param[in] count Number of submit info structures.
* @param[in] timeout_ms Timeout in milliseconds. PVA_TIMEOUT_INF for infinite.
* @param[in] ctx Pointer to the PVA context.
* @param[in] cuStream A CUDA stream.
* @param[in] client_stream A client stream.
* @param[in] submit_payload Pointer to the submit payload.
* @return \ref pva_error The completion status of the submit operation.
*
* @note Concurrent submission to the same queue needs to be serialized by the
* caller.
*/
enum pva_error
pva_cuextend_cmdbuf_batch_submit(struct pva_queue *queue, CUstream stream,
struct pva_cmdbuf_submit_info *submit_infos,
uint32_t count, uint64_t timeout_ms);
enum pva_error pva_cuextend_cmdbuf_batch_submit(struct pva_context *ctx,
CUstream cuStream,
void *client_stream,
void *submit_payload);
/**
* @brief Get the payload associated with a CUDA stream.
*
 * Returns the payload that was associated with the CUDA stream by the registration callback.
*
* @param[in] ctx Pointer to the PVA context.
* @param[in] cuStream A CUDA stream.
* @param[out] stream_payload Pointer to the stream payload.
* @return PVA_SUCCESS if the stream payload is successfully retrieved
* PVA_BAD_PARAMETER_ERROR if any of the parameters are NULL
* PVA_CUDA_INIT_FAILED if the cuExtend was not initialized for the context
*/
enum pva_error pva_cuextend_get_stream_payload(struct pva_context *ctx,
CUstream cuStream,
void **stream_payload);
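/*
 * Illustrative usage sketch (not part of the diff): looking up the payload
 * registered for a CUDA stream before issuing a batch submit with the new
 * signature. Variable names are hypothetical.
 *
 *   void *payload = NULL;
 *   enum pva_error err =
 *       pva_cuextend_get_stream_payload(ctx, cuStream, &payload);
 *   if (err == PVA_SUCCESS)
 *       err = pva_cuextend_cmdbuf_batch_submit(ctx, cuStream, client_stream,
 *                                              submit_payload);
 */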
#ifdef __cplusplus
}

View File

@@ -24,73 +24,8 @@ enum pva_gpio_bit {
GPIO_WRITE6_BIT = 29U
};
enum pva_dma_descriptor_id {
PVA_DMA_DESC_NONE = 0,
PVA_DMA_DESC0 = 1,
PVA_DMA_DESC1 = 2,
PVA_DMA_DESC2 = 3,
PVA_DMA_DESC3 = 4,
PVA_DMA_DESC4 = 5,
PVA_DMA_DESC5 = 6,
PVA_DMA_DESC6 = 7,
PVA_DMA_DESC7 = 8,
PVA_DMA_DESC8 = 9,
PVA_DMA_DESC9 = 10,
PVA_DMA_DESC10 = 11,
PVA_DMA_DESC11 = 12,
PVA_DMA_DESC12 = 13,
PVA_DMA_DESC13 = 14,
PVA_DMA_DESC14 = 15,
PVA_DMA_DESC15 = 16,
PVA_DMA_DESC16 = 17,
PVA_DMA_DESC17 = 18,
PVA_DMA_DESC18 = 19,
PVA_DMA_DESC19 = 20,
PVA_DMA_DESC20 = 21,
PVA_DMA_DESC21 = 22,
PVA_DMA_DESC22 = 23,
PVA_DMA_DESC23 = 24,
PVA_DMA_DESC24 = 25,
PVA_DMA_DESC25 = 26,
PVA_DMA_DESC26 = 27,
PVA_DMA_DESC27 = 28,
PVA_DMA_DESC28 = 29,
PVA_DMA_DESC29 = 30,
PVA_DMA_DESC30 = 31,
PVA_DMA_DESC31 = 32,
PVA_DMA_DESC32 = 33,
PVA_DMA_DESC33 = 34,
PVA_DMA_DESC34 = 35,
PVA_DMA_DESC35 = 36,
PVA_DMA_DESC36 = 37,
PVA_DMA_DESC37 = 38,
PVA_DMA_DESC38 = 39,
PVA_DMA_DESC39 = 40,
PVA_DMA_DESC40 = 41,
PVA_DMA_DESC41 = 42,
PVA_DMA_DESC42 = 43,
PVA_DMA_DESC43 = 44,
PVA_DMA_DESC44 = 45,
PVA_DMA_DESC45 = 46,
PVA_DMA_DESC46 = 47,
PVA_DMA_DESC47 = 48,
PVA_DMA_DESC48 = 49,
PVA_DMA_DESC49 = 50,
PVA_DMA_DESC50 = 51,
PVA_DMA_DESC51 = 52,
PVA_DMA_DESC52 = 53,
PVA_DMA_DESC53 = 54,
PVA_DMA_DESC54 = 55,
PVA_DMA_DESC55 = 56,
PVA_DMA_DESC56 = 57,
PVA_DMA_DESC57 = 58,
PVA_DMA_DESC58 = 59,
PVA_DMA_DESC59 = 60,
PVA_DMA_DESC60 = 61,
PVA_DMA_DESC61 = 62,
PVA_DMA_DESC62 = 63,
PVA_DMA_DESC63 = 64
};
#define PVA_DMA_DESC_ID_NULL 0
#define PVA_DMA_DESC_ID_BASE 1
/**
* The values of the enum members conform to the definitions of DMA descriptors'
@@ -266,8 +201,6 @@ struct pva_dma_config_header {
* means that every allocation of descriptors will start at an alignment of 4. The following
* macros control the alignment/grouping requirement of DMA resources.
*/
// TODO: Add compile time asserts to ensure the following alignment requirements don't result
// in fractional resource partitions?
#define PVA_DMA_CHANNEL_ALIGNMENT 1
#define PVA_DMA_DESCRIPTOR_ALIGNMENT 4
#define PVA_DMA_ADB_ALIGNMENT 16

View File

@@ -11,7 +11,7 @@
*/
struct pva_ops_memory {
uint32_t handle; /**< Memory handle */
uint32_t size; /**< Size of memory */
uint64_t size; /**< Size of memory */
void *va; /**< Virtual address */
};
@@ -27,8 +27,8 @@ struct pva_ops_memory {
*/
struct pva_ops_buffer {
struct pva_ops_memory *memory; /**< Pointer to buffer memory */
uint32_t start_offset; /**< Start offset in buffer memory */
uint32_t end_offset; /**< End offset (exclusive) in buffer memory */
uint64_t start_offset; /**< Start offset in buffer memory */
uint64_t end_offset; /**< End offset (exclusive) in buffer memory */
};
/**
@@ -45,9 +45,9 @@ struct pva_ops_buffer {
* @brief Header structure for PVA operations.
*/
struct pva_ops_header {
uint32_t opcode; /**< Operation code identifying the operation type */
uint64_t opcode; /**< Operation code identifying the operation type */
/** Size of the operation in bytes. This size must be a multiple of 8 bytes. */
uint32_t size;
uint64_t size;
};
/**
@@ -56,8 +56,7 @@ struct pva_ops_header {
struct pva_ops_executable_register {
#define PVA_OPS_OPCODE_EXECUTABLE_REGISTER 1U
struct pva_ops_header header; /**< Operation header */
uint32_t exec_size; /**< Size of executable data */
uint32_t pad; /**< Padding for 8 bytes alignment */
uint64_t exec_size; /**< Size of executable data */
//followed by executable data
};
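/*
 * Illustrative sketch (not part of the diff): sizing an executable-register
 * op now that the header and size fields are 64-bit. The 8-byte rounding
 * mirrors the "multiple of 8 bytes" requirement documented on
 * pva_ops_header.size; the helper name is hypothetical.
 */
static inline uint64_t exec_register_op_size_sketch(uint64_t exec_size)
{
	uint64_t sz = sizeof(struct pva_ops_executable_register) + exec_size;

	return (sz + 7U) & ~7ULL; /* round up to a multiple of 8 bytes */
}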
@@ -144,7 +143,7 @@ enum pva_error pva_ops_parse_unregister_resp(struct pva_ops_buffer *resp_buf);
*
* @return PVA_SUCCESS on success, appropriate error code otherwise.
*/
enum pva_error pva_ops_memory_alloc(struct pva_context *ctx, uint32_t size,
enum pva_error pva_ops_memory_alloc(struct pva_context *ctx, uint64_t size,
struct pva_ops_memory *ops_buf);
/**

View File

@@ -118,6 +118,16 @@
ACT(PVA_ERR_MATH_OP) \
ACT(PVA_ERR_HWSEQ_INVALID) \
ACT(PVA_ERR_FW_ABORTED) \
ACT(PVA_ERR_PPE_DIVIDE_BY_0) \
ACT(PVA_ERR_PPE_FP_NAN) \
ACT(PVA_ERR_INVALID_ACCESS_MODE_COMBINATION) \
ACT(PVA_ERR_CMD_TCM_BUF_OUT_OF_RANGE) \
ACT(PVA_ERR_MISR_NOT_RUN) \
ACT(PVA_ERR_MISR_DATA) \
ACT(PVA_ERR_MISR_ADDR) \
ACT(PVA_ERR_MISR_NOT_DONE) \
ACT(PVA_ERR_MISR_ADDR_DATA) \
ACT(PVA_ERR_MISR_TIMEOUT) \
ACT(PVA_ERR_CODE_COUNT)
enum pva_error {
@@ -207,12 +217,6 @@ struct pva_fw_vpu_ptr_symbol {
uint64_t size;
};
struct pva_fw_vpu_legacy_ptr_symbol {
uint64_t base;
uint32_t offset;
uint32_t size;
};
enum pva_surface_format {
PVA_SURF_FMT_PITCH_LINEAR = 0,
PVA_SURF_FMT_BLOCK_LINEAR
@@ -243,25 +247,6 @@ enum pva_symbol_type {
PVA_SYM_TYPE_MAX,
};
/**
* \brief Holds PVA Sync Client Type.
* Currently NvSciSync supports NvSciSyncFences with syncpoint primitive type only.
*/
enum pva_sync_client_type {
/*! For a given SyncObj PVA acts as a signaler. This type corresponds to
* postfences from PVA. */
PVA_SYNC_CLIENT_TYPE_SIGNALER,
/*! For a given SyncObj PVA acts as a waiter. This type corresponds to
* prefences to PVA. */
PVA_SYNC_CLIENT_TYPE_WAITER,
/*! For a given SyncObj PVA acts as both signaler and waiter. */
PVA_SYNC_CLIENT_TYPE_SIGNALER_WAITER,
/*! Specifies the non inclusive upper bound of valid values. */
PVA_SYNC_CLIENT_TYPE_MAX,
/*! Reserved bound of valid values. */
PVA_SYNC_CLIENT_TYPE_RESERVED = 0x7FFFFFFF,
};
#define PVA_SYMBOL_ID_INVALID 0U
#define PVA_SYMBOL_ID_BASE 1U
#define PVA_MAX_SYMBOL_NAME_LEN 64U
@@ -275,19 +260,6 @@ struct pva_symbol_info {
};
#define PVA_RESOURCE_ID_INVALID 0U
#define PVA_RESOURCE_ID_BASE 1U
struct pva_resource_entry {
#define PVA_RESOURCE_TYPE_INVALID 0U
#define PVA_RESOURCE_TYPE_DRAM 1U
#define PVA_RESOURCE_TYPE_EXEC_BIN 2U
#define PVA_RESOURCE_TYPE_DMA_CONFIG 3U
uint8_t type;
uint8_t smmu_context_id;
uint8_t addr_hi;
uint8_t size_hi;
uint32_t addr_lo;
uint32_t size_lo;
};
/** \brief Maximum number of queues per context */
#define PVA_MAX_QUEUES_PER_CONTEXT (8)
@@ -300,7 +272,8 @@ struct pva_resource_entry {
#define PVA_ACCESS_RW \
(PVA_ACCESS_RO | PVA_ACCESS_WO) /**< Read and write access */
#define PVA_TIMEOUT_INF UINT64_MAX /**< Infinite timeout */
// unify timeout to uint64_t, in microseconds
#define PVA_SUBMIT_TIMEOUT_INF UINT64_MAX /**< Infinite timeout */
#define PVA_MAX_NUM_INPUT_STATUS 2 /**< Maximum number of input statuses */
#define PVA_MAX_NUM_OUTPUT_STATUS 2 /**< Maximum number of output statuses */
@@ -329,8 +302,9 @@ struct pva_cmdbuf_submit_info {
uint64_t submit_id;
/** Offset of the first chunk within the resource */
uint64_t first_chunk_offset;
#define PVA_EXEC_TIMEOUT_REUSE 0xFFFFFFFFU
#define PVA_EXEC_TIMEOUT_INF 0U
/** Execution timeout is in ms */
#define PVA_EXEC_TIMEOUT_INF UINT32_MAX
#define PVA_EXEC_TIMEOUT_REUSE (UINT32_MAX - 1)
/** Execution Timeout */
uint32_t execution_timeout_ms;
struct pva_fence prefences[PVA_MAX_NUM_PREFENCES];
@@ -351,13 +325,13 @@ struct pva_cmdbuf_status {
uint16_t status;
};
/** \brief Holds the PVA capabilities. */
/** @brief Holds the PVA capabilities. */
struct pva_characteristics {
/*! Holds the number of PVA engines. */
/** Holds the number of PVA engines. */
uint32_t pva_engine_count;
/*! Holds the number of VPUs per PVA engine. */
/** Holds the number of VPUs per PVA engine. */
uint32_t pva_pve_count;
/*! Holds the PVA generation information */
/** Holds the PVA generation information */
enum pva_hw_gen hw_version;
uint16_t max_desc_count;
uint16_t max_ch_count;
@@ -370,11 +344,6 @@ struct pva_characteristics {
uint16_t reserved_adb_count;
};
enum pva_error_inject_codes {
PVA_ERR_INJECT_WDT_HW_ERR, // watchdog Hardware error
PVA_ERR_INJECT_WDT_TIMEOUT, // watchdog Timeout error
};
/*
* !!!! DO NOT MODIFY !!!!!!
* These values are defined as per DriveOS guidelines
@@ -382,4 +351,20 @@ enum pva_error_inject_codes {
#define PVA_INPUT_STATUS_SUCCESS (0)
#define PVA_INPUT_STATUS_INVALID (0xFFFF)
/**
* @brief Context attribute keys.
*/
enum pva_attr {
PVA_CONTEXT_ATTR_MAX_CMDBUF_CHUNK_SIZE,
PVA_ATTR_HW_CHARACTERISTICS,
PVA_ATTR_VERSION
};
/**
* @brief Maximum size of a command buffer chunk.
*/
struct pva_ctx_attr_max_cmdbuf_chunk_size {
uint16_t max_size;
};
#endif // PVA_API_TYPES_H

View File

@@ -23,113 +23,10 @@ struct pva_vpu_instance_data {
};
/**
* @defgroup PVA_VPU_SYSCALL
*
* @brief PVA VPU SYS call IDs for each type of
* SYS call.
* @{
*/
//! @cond DISABLE_DOCUMENTATION
/**
* @brief VPU Syscall id for vpu printf write.
*/
#define PVA_FW_PE_SYSCALL_ID_WRITE (1U)
//! @endcond
/**
* @brief VPU Syscall id for Icache prefetch.
*/
#define PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH (2U)
/**
* @brief VPU Syscall id for masking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION (3U)
/**
* @brief VPU Syscall id for unmasking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
//! @cond DISABLE_DOCUMENTATION
/**
* @brief VPU Syscall id for sampling VPU performance counters
*/
#define PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE (5U)
//! @endcond
/** @} */
/**
* @defgroup PVA_PPE_SYSCALL
*
* @brief PVA PPE SYS call IDs for each type of
* SYS call.
* @{
*/
//! @cond DISABLE_DOCUMENTATION
/**
* @brief PPE Syscall id for ppe printf write.
*/
#define PVA_FW_PPE_SYSCALL_ID_WRITE (1U)
/**
* @brief PPE Syscall id for masking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_MASK_EXCEPTION (2U)
/**
* @brief PPE Syscall id for unmasking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_UNMASK_EXCEPTION (3U)
/**
* @brief VPU Syscall id for sampling VPU performance counters
*/
#define PVA_FW_PPE_SYSCALL_ID_PERFMON_SAMPLE (4U)
/**
* @brief PPE Syscall id for Icache prefetch.
*/
#define PVA_FW_PPE_SYSCALL_ID_ICACHE_PREFETCH (5U)
//! @endcond
/** @} */
/**
* @brief Lookup table to convert PPE syscall IDs to VPU syscall IDs
* Index is PPE syscall ID, value is corresponding VPU syscall ID
*/
#define PVA_FW_PPE_TO_VPU_SYSCALL_LUT \
{ \
0U, /* Index 0: Invalid */ \
PVA_FW_PE_SYSCALL_ID_WRITE, /* Index 1: Write */ \
PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION, /* Index 2: Mask Exception */ \
PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION, /* Index 3: Unmask Exception */ \
PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE, /* Index 4: Perfmon Sample */ \
PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH /* Index 5: ICache Prefetch */ \
}
/**
* @brief Maximum valid PPE syscall ID
*/
#define PVA_FW_PPE_SYSCALL_ID_MAX PVA_FW_PPE_SYSCALL_ID_ICACHE_PREFETCH
/**
* @defgroup PVA_VPU_SYSCALL_WRITE_PARAM_GROUP
*
* @brief Parameter specification for syscall write
*/
/**
* @defgroup PVA_VPU_SYSCALL_COMMAND_FIELDS_GROUP
*
* @brief The command format to be used while issuing vpu syscall command from VPU kernel to R5.
 * The fields mentioned in this group are used for submitting the command
* through the Signal_R5 interface from VPU kernel.
*
* @{
* @brief Used to store VPU Syscall IDs, that represent the
* vpu syscall id between FW and VPU kernel.
*/
typedef uint32_t pva_vpu_syscall_id_t;
/**
* @brief The most significant bit of the vpu syscall ID field in
@@ -154,17 +51,56 @@ struct pva_vpu_instance_data {
* the vpu syscall command interface
*/
#define PVA_FW_PE_SYSCALL_PARAM_LSB (0U)
/** @} */
/**
* @defgroup PVA_VPU_SYSCALL_ICACHE_PREFETCH_PARAM_FIELDS_GROUP
*
* @brief The parameter format to be used while issuing vpu syscall command from VPU kernel to R5 for syscall icache prefetch.
 * The fields mentioned in this group are used for submitting the icache prefetch command
* through the Signal_R5 interface from VPU kernel.
*
* @{
* @brief VPU Syscall id for vpu printf write.
*/
#define PVA_FW_PE_SYSCALL_ID_WRITE (1U)
/**
* @brief VPU Syscall id for Icache prefetch.
*/
#define PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH (2U)
/**
* @brief VPU Syscall id for masking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION (3U)
/**
* @brief VPU Syscall id for unmasking exceptions.
*/
#define PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
/**
* @brief VPU Syscall id for sampling VPU performance counters
*/
#define PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE (5U)
/**
* @brief PPE Syscall id for ppe printf write.
*/
#define PVA_FW_PPE_SYSCALL_ID_WRITE (1U)
/**
* @brief PPE Syscall id for Icache prefetch.
*/
#define PVA_FW_PPE_SYSCALL_ID_ICACHE_PREFETCH (2U)
/**
* @brief PPE Syscall id for masking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_MASK_EXCEPTION (3U)
/**
* @brief PPE Syscall id for unmasking exceptions.
*/
#define PVA_FW_PPE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
/**
* @brief PPE Syscall id for sampling PPE performance counters
*/
#define PVA_FW_PPE_SYSCALL_ID_PERFMON_SAMPLE (5U)
/**
* @brief The most significant bit of the prefetch cache line count field in
@@ -189,23 +125,146 @@ struct pva_vpu_instance_data {
* the vpu syscall command interface
*/
#define PVA_FW_PE_SYSCALL_PREFETCH_ADDR_LSB (0U)
/** @} */
/**
* @defgroup PVA_VPU_SYSCALL_MASK_UNMASK_PARAM_FIELDS_GROUP
*
* @brief The parameter format to be used while issuing vpu syscall command from VPU kernel
* to R5 for masking or unmasking FP NaN Exception.
 * The fields mentioned in this group are used for submitting the mask and unmask FP NaN exception command
* through the Signal_R5 interface from VPU kernel.
*
* @{
*/
/**
* @brief Parameter specification for syscall mask/unmask exceptions
*/
#define PVA_FW_PE_MASK_DIV_BY_0 (1U << 1U)
#define PVA_FW_PE_MASK_FP_INV_NAN (1U << 2U)
/** @} */
/**
 * @brief Write syscall parameter will be a pointer to this struct
*/
union pva_fw_pe_syscall_write {
struct {
uint32_t addr;
uint32_t size;
} in;
struct {
uint32_t written_size;
} out;
};
/**
* @brief Perfmon sample syscall parameter will be a pointer to this struct
*/
struct pva_fw_pe_syscall_perfmon_sample {
/** counter_mask[0] is for ID: 0-31; counter_mask[1] is for ID: 32-63 */
uint32_t counter_mask[2];
uint32_t output_addr;
};
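/*
 * Illustrative sketch (not part of the diff): selecting a performance counter
 * by ID in the two-word counter_mask, as described by the comment above
 * (word 0 covers IDs 0-31, word 1 covers IDs 32-63). The helper name is
 * hypothetical.
 */
static inline void perfmon_select_counter_sketch(
	struct pva_fw_pe_syscall_perfmon_sample *s, uint32_t counter_id)
{
	s->counter_mask[counter_id / 32U] |= 1U << (counter_id % 32U);
}
/* e.g. perfmon_select_counter_sketch(&s, PERFMON_COUNTER_ID_DLUT_BUSY_T26X); */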
/**
* @brief Index for t26x performance counters for VPU
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T26X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T26X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T26X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T26X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T26X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T26X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T26X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T26X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T26X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T26X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T26X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T26X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T26X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T26X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T26X (19U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_FETCH_REQ_T26X (20U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X (21U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREEMPT_T26X (22U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_LINES_T26X (23U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_DUR_T26X (24U)
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T26X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T26X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T26X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T26X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T26X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T26X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T26X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T26X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T26X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T26X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T26X (36U)
/**
* @brief Index for t23x performance counters
*/
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T23X (0U)
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T23X (1U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T23X (2U)
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T23X (3U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T23X (4U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T23X (5U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T23X (6U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T23X (7U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T23X (8U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T23X (9U)
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T23X (10U)
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T23X (11U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T23X (12U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T23X (13U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T23X (14U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T23X (15U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T23X (16U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T23X (17U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T23X (18U)
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T23X (19U)
#define PERFMON_COUNTER_ID_ICACHE_FETCH_REQ_T23X (20U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_T23X (21U)
#define PERFMON_COUNTER_ID_ICACHE_PREEMP_T23X (22U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_LINES_T23X (23U)
#define PERFMON_COUNTER_ID_ICACHE_MISS_DUR_T23X (24U)
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_DUR_T23X (25U)
#define PERFMON_COUNTER_ID_DLUT_BUSY_T23X (26U)
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T23X (27U)
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T23X (28U)
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T23X (29U)
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T23X (30U)
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T23X (31U)
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T23X (32U)
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T23X (33U)
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T23X (34U)
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T23X (35U)
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T23X (36U)
/**
* @brief Index for t26x performance counters for PPE
*/
#define PERFMON_COUNTER_ID_PPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
#define PERFMON_COUNTER_ID_PPS_ID_VALID_T26X (1U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_REG_DEPEND_T26X (2U)
#define PERFMON_COUNTER_ID_PPS_STALL_ID_ONLY_T26X (3U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX1_ONLY_T26X (4U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_LD_DEPENDENCY_T26X (5U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_ST_DEPENDENCY_T26X (6U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_DEPENDENCY_T26X (7U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STRM_STORE_FLUSH_T26X (8U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_STORE_FLUSH_T26X (9U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STORE_FLUSH_T26X (10U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_LD_T26X (11U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_ST_T26X (12U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_T26X (13U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LD_T26X (14U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_ST_T26X (15U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LDST_T26X (16U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_PUSHBACK_T26X (17U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STQ_PUSHBACK_T26X (18U)
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_FLUSH_T26X (19U)
#define PERFMON_COUNTER_ID_PPS_WFE_GPI_EX_STATE_T26X (20U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_FETCH_REQ_T26X (21U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_T26X (22U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREEMPT_T26X (23U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_LINES_T26X (24U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_DUR_T26X (25U)
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_DUR_T26X (26U)
#endif // PVA_API_VPU_H

View File

@@ -2,17 +2,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_abort.h"
#include "pva_kmd_shim_init.h"
#include "pva_kmd_device.h"
#include "pva_kmd_regs.h"
#include "pva_kmd_silicon_utils.h"
void pva_kmd_abort(struct pva_kmd_device *pva)
void pva_kmd_abort_fw(struct pva_kmd_device *pva)
{
//TODO: Report to FSI first about the SW error code.
pva_kmd_log_err("Abort: FW Reset Assert");
/* Put the FW in reset assert so user space cannot access the CCQ,
   forcing clients to destroy their contexts. Once all contexts are
   destroyed, KMD powers off the FW; on the first new context
   creation, KMD reloads the firmware image and powers the device
   back on. */
pva_kmd_fw_reset_assert(pva);
// HW watchdog may fire repeatedly if PVA is hung. Therefore, disable all
// interrupts to protect KMD from potential interrupt floods.
pva_kmd_disable_all_interrupts_nosync(pva);
// We will handle firmware reboot after all contexts are closed and a new
// one is re-opened again
pva->recovery = true;
}
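A minimal caller-side sketch of how the recovery flag set here gates later work, using the pva->recovery field and the PVA_ERR_FW_ABORTED code that appear elsewhere in these changes (illustrative only, not part of the diff):

    /* Illustrative: refuse new work once pva_kmd_abort_fw() has run. */
    static enum pva_error example_submit_gate(struct pva_kmd_device *pva)
    {
        if (pva->recovery) {
            /* FW was aborted; no new submissions until all contexts
             * close and firmware is reloaded on the next create. */
            return PVA_ERR_FW_ABORTED;
        }
        /* ...normal CCQ/command submission would follow... */
        return PVA_SUCCESS;
    }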

View File

@@ -5,6 +5,6 @@
#include "pva_kmd_device.h"
#include "pva_kmd_utils.h"
void pva_kmd_abort(struct pva_kmd_device *pva);
void pva_kmd_abort_fw(struct pva_kmd_device *pva);
#endif //PVA_KMD_ABORT_H

View File

@@ -53,13 +53,12 @@ static inline uint32_t next_slot(struct pva_kmd_block_allocator *allocator,
return *next;
}
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
void *pva_kmd_alloc_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id)
{
void *block = NULL;
uint32_t slot = INVALID_ID;
pva_kmd_mutex_lock(&allocator->allocator_lock);
if (allocator->free_slot_head != INVALID_ID) {
slot = allocator->free_slot_head;
allocator->free_slot_head =
@@ -69,18 +68,24 @@ void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
slot = allocator->next_free_slot;
allocator->next_free_slot++;
} else {
goto unlock;
return NULL;
}
}
allocator->slot_in_use[slot] = true;
pva_kmd_mutex_unlock(&allocator->allocator_lock);
*out_id = slot + allocator->base_id;
block = get_block(allocator, slot);
return block;
unlock:
}
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id)
{
void *block = NULL;
pva_kmd_mutex_lock(&allocator->allocator_lock);
block = pva_kmd_alloc_block_unsafe(allocator, out_id);
pva_kmd_mutex_unlock(&allocator->allocator_lock);
return NULL;
return block;
}
static bool is_slot_valid(struct pva_kmd_block_allocator *allocator,
@@ -103,16 +108,15 @@ void *pva_kmd_get_block_unsafe(struct pva_kmd_block_allocator *allocator,
return get_block(allocator, slot);
}
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
enum pva_error
pva_kmd_free_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t id)
{
uint32_t slot = id - allocator->base_id;
uint32_t *next;
enum pva_error err = PVA_SUCCESS;
pva_kmd_mutex_lock(&allocator->allocator_lock);
if (!is_slot_valid(allocator, slot)) {
err = PVA_INVAL;
goto unlock;
return PVA_INVAL;
}
allocator->slot_in_use[slot] = false;
@@ -120,7 +124,16 @@ enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
*next = allocator->free_slot_head;
allocator->free_slot_head = slot;
unlock:
return PVA_SUCCESS;
}
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
uint32_t id)
{
enum pva_error err = PVA_SUCCESS;
pva_kmd_mutex_lock(&allocator->allocator_lock);
err = pva_kmd_free_block_unsafe(allocator, id);
pva_kmd_mutex_unlock(&allocator->allocator_lock);
return err;
}

View File

@@ -24,6 +24,8 @@ pva_kmd_block_allocator_init(struct pva_kmd_block_allocator *allocator,
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id);
void *pva_kmd_alloc_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id);
static inline void *
pva_kmd_zalloc_block(struct pva_kmd_block_allocator *allocator,
uint32_t *out_id)
@@ -47,6 +49,9 @@ void *pva_kmd_get_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t id);
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
uint32_t id);
enum pva_error
pva_kmd_free_block_unsafe(struct pva_kmd_block_allocator *allocator,
uint32_t id);
void pva_kmd_block_allocator_deinit(struct pva_kmd_block_allocator *allocator);
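The new _unsafe variants skip the internal allocator_lock so a caller that already holds it can compose several operations atomically, as the devmem pool added later in this change set does. A sketch of the intended split between the locked wrappers and the unsafe calls (illustrative only):

    /* Two allocations composed under one lock via the _unsafe API.
     * The locked wrappers remain the default for standalone calls. */
    uint32_t id_a, id_b;
    void *a, *b;

    pva_kmd_mutex_lock(&allocator->allocator_lock);
    a = pva_kmd_alloc_block_unsafe(allocator, &id_a);
    b = pva_kmd_alloc_block_unsafe(allocator, &id_b);
    if ((a != NULL) && (b == NULL)) {
        /* Roll back the first allocation while still holding the lock. */
        (void)pva_kmd_free_block_unsafe(allocator, id_a);
        a = NULL;
    }
    pva_kmd_mutex_unlock(&allocator->allocator_lock);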

View File

@@ -143,6 +143,7 @@ static inline void pva_kmd_set_cmd_init_resource_table(
struct pva_cmd_init_resource_table *cmd, uint8_t resource_table_id,
uint64_t iova_addr, uint32_t max_num_entries)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_INIT_RESOURCE_TABLE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_table_id = resource_table_id;
@@ -155,6 +156,7 @@ static inline void
pva_kmd_set_cmd_deinit_resource_table(struct pva_cmd_deinit_resource_table *cmd,
uint8_t resource_table_id)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_table_id = resource_table_id;
@@ -162,22 +164,29 @@ pva_kmd_set_cmd_deinit_resource_table(struct pva_cmd_deinit_resource_table *cmd,
static inline void pva_kmd_set_cmd_init_queue(struct pva_cmd_init_queue *cmd,
uint8_t ccq_id, uint8_t queue_id,
uint64_t iova_addr,
uint32_t max_num_submit)
uint64_t queue_addr,
uint32_t max_num_submit,
uint32_t syncpt_id,
uint64_t syncpt_addr)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_INIT_QUEUE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->ccq_id = ccq_id;
cmd->queue_id = queue_id;
cmd->queue_addr_lo = iova_lo(iova_addr);
cmd->queue_addr_hi = iova_hi(iova_addr);
cmd->queue_addr_lo = iova_lo(queue_addr);
cmd->queue_addr_hi = iova_hi(queue_addr);
cmd->max_n_submits = max_num_submit;
cmd->syncpt_id = syncpt_id;
cmd->syncpt_addr_lo = iova_lo(syncpt_addr);
cmd->syncpt_addr_hi = iova_hi(syncpt_addr);
}
static inline void
pva_kmd_set_cmd_deinit_queue(struct pva_cmd_deinit_queue *cmd, uint8_t ccq_id,
uint8_t queue_id)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_QUEUE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->ccq_id = ccq_id;
@@ -188,6 +197,7 @@ static inline void pva_kmd_set_cmd_update_resource_table(
struct pva_cmd_update_resource_table *cmd, uint32_t resource_table_id,
uint32_t resource_id, struct pva_resource_entry const *entry)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_table_id = resource_table_id;
@@ -199,6 +209,7 @@ static inline void
pva_kmd_set_cmd_unregister_resource(struct pva_cmd_unregister_resource *cmd,
uint32_t resource_id)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_UNREGISTER_RESOURCE;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->resource_id = resource_id;
@@ -208,6 +219,7 @@ static inline void
pva_kmd_set_cmd_enable_fw_profiling(struct pva_cmd_enable_fw_profiling *cmd,
uint32_t filter, uint8_t timestamp_type)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_ENABLE_FW_PROFILING;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->filter = filter;
@@ -217,6 +229,7 @@ pva_kmd_set_cmd_enable_fw_profiling(struct pva_cmd_enable_fw_profiling *cmd,
static inline void
pva_kmd_set_cmd_disable_fw_profiling(struct pva_cmd_disable_fw_profiling *cmd)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DISABLE_FW_PROFILING;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
}
@@ -225,6 +238,7 @@ static inline void pva_kmd_set_cmd_get_tegra_stats(
struct pva_cmd_get_tegra_stats *cmd, uint32_t buffer_resource_id,
uint32_t buffer_size, uint64_t offset, bool enabled)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_GET_TEGRA_STATS;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->buffer_resource_id = buffer_resource_id;
@@ -238,6 +252,7 @@ static inline void
pva_kmd_set_cmd_set_debug_log_level(struct pva_cmd_set_debug_log_level *cmd,
uint32_t log_level)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_SET_DEBUG_LOG_LEVEL;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->log_level = log_level;
@@ -245,24 +260,23 @@ pva_kmd_set_cmd_set_debug_log_level(struct pva_cmd_set_debug_log_level *cmd,
static inline void pva_kmd_set_cmd_suspend_fw(struct pva_cmd_suspend_fw *cmd)
{
uint64_t len = (sizeof(*cmd) / sizeof(uint32_t));
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_SUSPEND_FW;
ASSERT(len <= 255u);
cmd->header.len = (uint8_t)(len);
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
}
static inline void pva_kmd_set_cmd_resume_fw(struct pva_cmd_resume_fw *cmd)
{
uint64_t len = (sizeof(*cmd) / sizeof(uint32_t));
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_RESUME_FW;
ASSERT(len <= 255u);
cmd->header.len = (uint8_t)(len);
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
}
static inline void pva_kmd_set_cmd_init_shared_dram_buffer(
struct pva_cmd_init_shared_dram_buffer *cmd, uint8_t interface,
uint32_t buffer_iova, uint32_t buffer_size)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_INIT_SHARED_DRAM_BUFFER;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->buffer_iova_hi = iova_hi(buffer_iova);
@@ -274,6 +288,7 @@ static inline void pva_kmd_set_cmd_init_shared_dram_buffer(
static inline void pva_kmd_set_cmd_deinit_shared_dram_buffer(
struct pva_cmd_deinit_shared_dram_buffer *cmd, uint8_t interface)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_SHARED_DRAM_BUFFER;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->interface = interface;
@@ -283,8 +298,12 @@ static inline void
pva_kmd_set_cmd_set_profiling_level(struct pva_cmd_set_profiling_level *cmd,
uint32_t level)
{
memset(cmd, 0, sizeof(*cmd));
cmd->header.opcode = PVA_CMD_OPCODE_SET_PROFILING_LEVEL;
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
cmd->level = level;
}
#define CMD_LEN(cmd_type) (sizeof(cmd_type) / sizeof(uint32_t))
#endif // PVA_KMD_CMDBUF_H
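CMD_LEN() expresses a command's size in 32-bit words and, together with the unconditional memset in each setter, supports the stack-scratch submission pattern used by the context code in this change set. For example:

    /* Sizing a scratch buffer for two commands, in 32-bit words. */
    uint32_t cmd_scratch[CMD_LEN(struct pva_cmd_deinit_queue) +
                         CMD_LEN(struct pva_cmd_deinit_resource_table)];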

View File

@@ -23,7 +23,7 @@
// clang-format off
#if PVA_BUILD_MODE == PVA_BUILD_MODE_SIM
#define PVA_KMD_TIMEOUT_FACTOR 100
#elif (PVA_BUILD_MODE == PVA_BUILD_MODE_NATIVE) && (PVA_IS_DEBUG == 1)
#elif (PVA_BUILD_MODE == PVA_BUILD_MODE_NATIVE)
// On native builds, the FW calls the KMD's shared buffer handler in its
// own thread. In debug builds, if there are a large number of messages
// (prints, unregister, etc.), this handler might take a while to execute,
@@ -42,22 +42,16 @@
#define PVA_KMD_WAIT_FW_POLL_INTERVAL_US PVA_KMD_TIMEOUT(100) /*< 100 us*/
#define PVA_KMD_FW_BOOT_TIMEOUT_MS PVA_KMD_TIMEOUT(1000) /*< 1 second */
#define PVA_NUM_RW_SYNCPTS 56
#define PVA_NUM_RW_SYNCPTS (PVA_MAX_NUM_CCQ * PVA_NUM_RW_SYNCPTS_PER_CONTEXT)
// clang-format off
#if PVA_DEV_MAIN_COMPATIBLE == 1
#define PVA_KMD_LOAD_FROM_GSC_DEFAULT true
#if PVA_SAFETY == 1
#define PVA_KMD_APP_AUTH_DEFAULT true
#else
#define PVA_KMD_APP_AUTH_DEFAULT false
#endif
#else
#define PVA_KMD_LOAD_FROM_GSC_DEFAULT false
#define PVA_KMD_APP_AUTH_DEFAULT false
#endif
// clang-format on
#define PVA_KMD_MAX_NUM_USER_DMA_CONFIG 1024
#define PVA_KMD_DMA_CONFIG_POOL_INCR 256
#endif // PVA_KMD_CONSTANTS_H

View File

@@ -67,118 +67,86 @@ err_out:
static enum pva_error notify_fw_context_init(struct pva_kmd_context *ctx)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter;
struct pva_cmd_init_resource_table *res_cmd;
struct pva_cmd_init_queue *queue_cmd;
struct pva_cmd_update_resource_table *update_cmd;
struct pva_resource_entry entry = { 0 };
uint32_t fence_val;
const struct pva_syncpt_rw_info *syncpt_info;
enum pva_error err;
uint32_t current_offset = 0;
uint32_t cmd_scratch[CMD_LEN(struct pva_cmd_init_resource_table) +
CMD_LEN(struct pva_cmd_init_queue) +
CMD_LEN(struct pva_cmd_update_resource_table)];
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
res_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*res_cmd));
ASSERT(res_cmd != NULL);
res_cmd = (struct pva_cmd_init_resource_table *)pva_offset_pointer(
&cmd_scratch[0], current_offset);
current_offset += sizeof(*res_cmd);
queue_cmd = (struct pva_cmd_init_queue *)pva_offset_pointer(
&cmd_scratch[0], current_offset);
current_offset += sizeof(*queue_cmd);
update_cmd = (struct pva_cmd_update_resource_table *)pva_offset_pointer(
&cmd_scratch[0], current_offset);
current_offset += sizeof(*update_cmd);
pva_kmd_set_cmd_init_resource_table(
res_cmd, ctx->resource_table_id,
ctx->ctx_resource_table.table_mem->iova,
ctx->ctx_resource_table.n_entries);
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
ASSERT(queue_cmd != NULL);
syncpt_info = pva_kmd_queue_get_rw_syncpt_info(ctx, ctx->ccq_id);
pva_kmd_set_cmd_init_queue(
queue_cmd, PVA_PRIV_CCQ_ID,
ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/
ctx->ctx_queue.queue_memory->iova,
ctx->ctx_queue.max_num_submit);
update_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*update_cmd));
ASSERT(update_cmd != NULL);
ctx->ctx_queue.max_num_submit, syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
err = pva_kmd_make_resource_entry(&ctx->pva->dev_resource_table,
ctx->submit_memory_resource_id,
&entry);
ASSERT(err == PVA_SUCCESS);
pva_kmd_set_cmd_update_resource_table(update_cmd,
0, /* KMD's resource table ID */
ctx->submit_memory_resource_id,
&entry);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
// Error is either QUEUE_FULL or TIMEDOUT
goto cancel_builder;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
err = pva_kmd_submit_cmd_sync(dev_submitter, cmd_scratch,
sizeof(cmd_scratch),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when initializing context");
goto err_out;
}
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
return err;
}
static enum pva_error notify_fw_context_deinit(struct pva_kmd_context *ctx)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter;
struct pva_cmd_deinit_resource_table *deinit_table_cmd;
struct pva_cmd_deinit_queue *deinit_queue_cmd;
uint32_t fence_val;
uint32_t cmd_scratch[CMD_LEN(struct pva_cmd_deinit_queue) +
CMD_LEN(struct pva_cmd_deinit_resource_table)];
enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
deinit_queue_cmd = (struct pva_cmd_deinit_queue *)pva_offset_pointer(
&cmd_scratch[0], 0);
deinit_table_cmd =
(struct pva_cmd_deinit_resource_table *)pva_offset_pointer(
&cmd_scratch[0], sizeof(struct pva_cmd_deinit_queue));
deinit_queue_cmd =
pva_kmd_reserve_cmd_space(&builder, sizeof(*deinit_queue_cmd));
ASSERT(deinit_queue_cmd != NULL);
pva_kmd_set_cmd_deinit_queue(
deinit_queue_cmd, PVA_PRIV_CCQ_ID,
ctx->ccq_id /* For privileged queues, queue ID == user CCQ ID*/
);
deinit_table_cmd =
pva_kmd_reserve_cmd_space(&builder, sizeof(*deinit_table_cmd));
ASSERT(deinit_table_cmd != NULL);
pva_kmd_set_cmd_deinit_resource_table(deinit_table_cmd,
ctx->resource_table_id);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto cancel_builder;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
err = pva_kmd_submit_cmd_sync(dev_submitter, cmd_scratch,
sizeof(cmd_scratch),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when deinitializing context");
goto err_out;
}
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
return err;
}
@@ -189,20 +157,24 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
uint32_t queue_mem_size;
uint64_t chunk_mem_size;
struct pva_fw_postfence post_fence = { 0 };
struct pva_syncpt_rw_info *syncpts;
uint64_t size;
/* Allocate RW syncpoints for this context */
syncpts = (struct pva_syncpt_rw_info *)pva_kmd_alloc_block(
&ctx->pva->syncpt_allocator, &ctx->syncpt_block_index);
ASSERT(syncpts != NULL);
if (ctx->inited) {
err = PVA_INVAL;
goto err_out;
}
if (res_table_capacity == 0u) {
pva_kmd_log_err("Invalid resource capacity");
err = PVA_BAD_PARAMETER_ERROR;
goto err_out;
}
/* Init resource table for this context */
err = pva_kmd_resource_table_init(&ctx->ctx_resource_table, ctx->pva,
ctx->smmu_ctx_id, res_table_capacity,
PVA_KMD_MAX_NUM_USER_DMA_CONFIG);
ctx->smmu_ctx_id, res_table_capacity);
if (err != PVA_SUCCESS) {
goto drop_device;
goto err_out;
}
/* Init privileged queue for this context */
@@ -225,7 +197,8 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
/* Allocate memory for submission */
chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
PVA_MAX_CMDBUF_CHUNK_SIZE, PVA_KMD_MAX_NUM_PRIV_CHUNKS);
pva_kmd_get_max_cmdbuf_chunk_size(ctx->pva),
PVA_KMD_MAX_NUM_PRIV_CHUNKS);
/* Allocate one post fence at the end. This memory will be added to
* KMD's own resource table. We don't need to explicitly free it. It
* will be freed after we drop the resource. */
@@ -242,6 +215,8 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
ctx->submit_memory,
&ctx->submit_memory_resource_id);
if (err != PVA_SUCCESS) {
// Ownership of submit memory is transferred to KMD's resource table so
// if adding to resource table fails, we need to free it here.
pva_kmd_device_memory_free(ctx->submit_memory);
goto queue_deinit;
}
@@ -249,7 +224,8 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
/* Init chunk pool */
err = pva_kmd_cmdbuf_chunk_pool_init(
&ctx->chunk_pool, ctx->submit_memory_resource_id,
0 /* offset */, chunk_mem_size, PVA_MAX_CMDBUF_CHUNK_SIZE,
0 /* offset */, chunk_mem_size,
pva_kmd_get_max_cmdbuf_chunk_size(ctx->pva),
PVA_KMD_MAX_NUM_PRIV_CHUNKS, ctx->submit_memory->va);
if (err != PVA_SUCCESS) {
goto free_dram_buffer_resource;
@@ -283,13 +259,15 @@ enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
pva_kmd_resource_table_lock,
pva_kmd_resource_table_unlock);
if (err != PVA_SUCCESS) {
goto deinit_submitter;
goto deinit_fw_context;
}
ctx->inited = true;
return PVA_SUCCESS;
deinit_fw_context:
notify_fw_context_deinit(ctx);
deinit_submitter:
pva_kmd_mutex_deinit(&ctx->chunk_pool_lock);
pva_kmd_mutex_deinit(&ctx->submit_lock);
@@ -298,12 +276,10 @@ free_dram_buffer_resource:
pva_kmd_drop_resource(&ctx->pva->dev_resource_table,
ctx->submit_memory_resource_id);
queue_deinit:
pva_kmd_queue_deinit(&ctx->ctx_queue);
pva_kmd_device_memory_free(ctx->ctx_queue_mem);
deinit_table:
pva_kmd_resource_table_deinit(&ctx->ctx_resource_table);
drop_device:
pva_kmd_device_idle(ctx->pva);
err_out:
return err;
}
@@ -312,25 +288,24 @@ void pva_kmd_context_deinit(struct pva_kmd_context *ctx)
enum pva_error err;
if (ctx->inited) {
if (!ctx->pva->recovery) {
err = notify_fw_context_deinit(ctx);
ASSERT(err == PVA_SUCCESS);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to notify FW of context deinit");
}
err = pva_kmd_shared_buffer_deinit(ctx->pva, ctx->ccq_id);
ASSERT(err == PVA_SUCCESS);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to deinit FW buffer");
}
pva_kmd_device_idle(ctx->pva);
pva_kmd_mutex_deinit(&ctx->submit_lock);
pva_kmd_mutex_deinit(&ctx->chunk_pool_lock);
pva_kmd_cmdbuf_chunk_pool_deinit(&ctx->chunk_pool);
pva_kmd_drop_resource(&ctx->pva->dev_resource_table,
ctx->submit_memory_resource_id);
pva_kmd_queue_deinit(&ctx->ctx_queue);
pva_kmd_device_memory_free(ctx->ctx_queue_mem);
pva_kmd_resource_table_deinit(&ctx->ctx_resource_table);
pva_kmd_free_block(&ctx->pva->syncpt_allocator,
ctx->syncpt_block_index);
ctx->inited = false;
}
}
@@ -345,14 +320,13 @@ static void pva_kmd_destroy_all_queues(struct pva_kmd_context *ctx)
pva_kmd_mutex_lock(&ctx->queue_allocator.allocator_lock);
queue = pva_kmd_get_block_unsafe(&ctx->queue_allocator,
queue_id);
pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock);
if (queue != NULL) {
pva_kmd_mutex_unlock(
&ctx->queue_allocator.allocator_lock);
err = pva_kmd_queue_destroy(ctx, queue_id);
ASSERT(err == PVA_SUCCESS);
} else {
pva_kmd_mutex_unlock(
&ctx->queue_allocator.allocator_lock);
if (err != PVA_SUCCESS) {
pva_kmd_log_err_u64(
"Failed to destroy queue %d", queue_id);
}
}
}
}
@@ -363,11 +337,12 @@ void pva_kmd_context_destroy(struct pva_kmd_context *ctx)
pva_kmd_destroy_all_queues(ctx);
pva_kmd_context_deinit(ctx);
pva_kmd_device_idle(ctx->pva);
pva_kmd_block_allocator_deinit(&ctx->queue_allocator);
pva_kmd_free(ctx->queue_allocator_mem);
pva_kmd_mutex_deinit(&ctx->ccq_lock);
err = pva_kmd_free_block(&ctx->pva->context_allocator, ctx->ccq_id);
pva_kmd_mutex_deinit(&ctx->ocb_lock);
err = pva_kmd_free_block(&ctx->pva->context_allocator, ctx->ccq_id);
ASSERT(err == PVA_SUCCESS);
}
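The builder prepare/reserve/submit/wait sequence above is replaced by filling a scratch array and handing it to pva_kmd_submit_cmd_sync(). A condensed sketch of that pattern, assuming the pva_kmd_submit_cmd_sync() signature shown in this diff and with pva and log_level already in scope:

    uint32_t cmd_scratch[CMD_LEN(struct pva_cmd_set_debug_log_level)];
    struct pva_cmd_set_debug_log_level *cmd =
        (struct pva_cmd_set_debug_log_level *)&cmd_scratch[0];
    enum pva_error err;

    /* Build the command in place; setters zero-initialize via memset first. */
    pva_kmd_set_cmd_set_debug_log_level(cmd, log_level);
    /* Submit and poll until the FW consumes it or the timeout expires. */
    err = pva_kmd_submit_cmd_sync(&pva->submitter, cmd_scratch,
                                  sizeof(cmd_scratch),
                                  PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
                                  PVA_KMD_WAIT_FW_TIMEOUT_US);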

View File

@@ -63,9 +63,6 @@ struct pva_kmd_context {
void *plat_data;
uint64_t ccq_shm_handle;
/** Index of block of syncpoints allocated for this context */
uint32_t syncpt_block_index;
uint32_t syncpt_ids[PVA_NUM_RW_SYNCPTS_PER_CONTEXT];
pva_kmd_mutex_t ocb_lock;
};

View File

@@ -27,45 +27,6 @@ static uint64_t read_from_buffer_to_user(void *to, uint64_t count,
return count;
}
static enum pva_error
pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
uint32_t level)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_set_profiling_level *cmd;
uint32_t fence_val;
enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_set_profiling_level(cmd, level);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when setting profiling level");
goto err_out;
}
return PVA_SUCCESS;
err_out:
return err;
}
static int64_t profiling_level_read(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size)
@@ -118,92 +79,20 @@ static int64_t profiling_level_write(struct pva_kmd_device *dev,
"pva_kmd_device_busy failed when submitting set profiling level cmd");
return 0;
}
err = pva_kmd_notify_fw_set_profiling_level(dev, value);
pva_kmd_device_idle(dev);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to notify FW about profiling level change");
return 0;
}
pva_kmd_device_idle(dev);
}
return size;
}
void pva_kmd_debugfs_create_nodes(struct pva_kmd_device *pva)
{
static const char *vpu_ocd_names[NUM_VPU_BLOCKS] = { "ocd_vpu0_v3",
"ocd_vpu1_v3" };
struct pva_kmd_file_ops *profiling_fops;
pva_kmd_debugfs_create_bool(pva, "stats_enabled",
&pva->debugfs_context.stats_enable);
pva_kmd_debugfs_create_bool(pva, "vpu_debug",
&pva->debugfs_context.vpu_debug);
// Create profiling_level file operations
profiling_fops = &pva->debugfs_context.profiling_level_fops;
profiling_fops->read = profiling_level_read;
profiling_fops->write = profiling_level_write;
profiling_fops->open = NULL;
profiling_fops->release = NULL;
profiling_fops->pdev = pva;
pva_kmd_debugfs_create_file(pva, "profiling_level", profiling_fops);
pva->debugfs_context.vpu_fops.read = &get_vpu_stats;
pva->debugfs_context.vpu_fops.write = NULL;
pva->debugfs_context.vpu_fops.pdev = pva;
pva_kmd_debugfs_create_file(pva, "vpu_stats",
&pva->debugfs_context.vpu_fops);
for (uint32_t i = 0; i < NUM_VPU_BLOCKS; i++) {
pva->debugfs_context.vpu_ocd_fops[i].open =
&pva_kmd_vpu_ocd_open;
pva->debugfs_context.vpu_ocd_fops[i].release =
&pva_kmd_vpu_ocd_release;
pva->debugfs_context.vpu_ocd_fops[i].read =
&pva_kmd_vpu_ocd_read;
pva->debugfs_context.vpu_ocd_fops[i].write =
&pva_kmd_vpu_ocd_write;
pva->debugfs_context.vpu_ocd_fops[i].pdev = pva;
pva->debugfs_context.vpu_ocd_fops[i].file_data =
(void *)&pva->regspec.vpu_dbg_instr_reg_offset[i];
pva_kmd_debugfs_create_file(
pva, vpu_ocd_names[i],
&pva->debugfs_context.vpu_ocd_fops[i]);
}
pva->debugfs_context.allowlist_ena_fops.read =
&get_vpu_allowlist_enabled;
pva->debugfs_context.allowlist_ena_fops.write = &update_vpu_allowlist;
pva->debugfs_context.allowlist_ena_fops.pdev = pva;
pva_kmd_debugfs_create_file(pva, "vpu_app_authentication",
&pva->debugfs_context.allowlist_ena_fops);
pva->debugfs_context.allowlist_path_fops.read = &get_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.write =
&update_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.pdev = pva;
pva_kmd_debugfs_create_file(pva, "allowlist_path",
&pva->debugfs_context.allowlist_path_fops);
pva->debugfs_context.fw_debug_log_level_fops.write =
&update_fw_debug_log_level;
pva->debugfs_context.fw_debug_log_level_fops.read = NULL;
pva->debugfs_context.fw_debug_log_level_fops.pdev = pva;
pva_kmd_debugfs_create_file(
pva, "fw_debug_log_level",
&pva->debugfs_context.fw_debug_log_level_fops);
pva_kmd_device_init_profiler(pva);
pva_kmd_device_init_tegra_stats(pva);
}
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *pva)
{
pva_kmd_device_deinit_tegra_stats(pva);
pva_kmd_device_deinit_profiler(pva);
pva_kmd_debugfs_remove_nodes(pva);
}
static int64_t print_vpu_stats(struct pva_kmd_tegrastats *kmd_tegra_stats,
uint8_t *out_buffer, uint64_t offset,
uint64_t len)
@@ -236,8 +125,9 @@ static int64_t print_vpu_stats(struct pva_kmd_tegrastats *kmd_tegra_stats,
formatted_len);
}
int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset, uint64_t size)
static int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size)
{
struct pva_kmd_tegrastats kmd_tegra_stats;
@@ -251,9 +141,9 @@ int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
return print_vpu_stats(&kmd_tegra_stats, out_buffer, offset, size);
}
int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size)
static int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva,
void *file_data, uint8_t *out_buffer,
uint64_t offset, uint64_t size)
{
// 1 byte for '0' or '1' and another 1 byte for the Null character
char out_str[2];
@@ -267,7 +157,7 @@ int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva, void *file_data,
sizeof(out_str));
}
int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
static int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size)
{
@@ -302,9 +192,9 @@ int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
return size;
}
int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size)
static int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva,
void *file_data, uint8_t *out_buffer,
uint64_t offset, uint64_t size)
{
uint64_t len;
pva_kmd_mutex_lock(&(pva->pva_auth->allow_list_lock));
@@ -317,13 +207,18 @@ int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
return len;
}
int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size)
static int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva,
void *file_data,
const uint8_t *in_buffer,
uint64_t offset, uint64_t size)
{
char buffer[ALLOWLIST_FILE_LEN];
unsigned long retval;
if (size == 0) {
return 0;
}
if (size > sizeof(buffer)) {
pva_kmd_log_err_u64(
"Length of allowlist path is too long. It must be less than ",
@@ -338,7 +233,7 @@ int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
}
// Replace the trailing newline with a null terminator
buffer[safe_subu64(size, 1u)] = '\0';
buffer[size - 1u] = '\0';
pva_kmd_mutex_lock(&(pva->pva_auth->allow_list_lock));
pva_kmd_update_allowlist_path(pva, buffer);
@@ -347,9 +242,10 @@ int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
return size;
}
int64_t update_fw_debug_log_level(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size)
static int64_t update_fw_debug_log_level(struct pva_kmd_device *pva,
void *file_data,
const uint8_t *in_buffer,
uint64_t offset, uint64_t size)
{
uint32_t log_level;
unsigned long retval;
@@ -387,10 +283,143 @@ int64_t update_fw_debug_log_level(struct pva_kmd_device *pva, void *file_data,
goto err_end;
}
pva_kmd_notify_fw_set_debug_log_level(pva, log_level);
err = pva_kmd_notify_fw_set_debug_log_level(pva, log_level);
pva_kmd_device_idle(pva);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to notify FW about debug log level change");
}
}
err_end:
return copy_size;
}
static int64_t get_fw_debug_log_level(struct pva_kmd_device *dev,
void *file_data, uint8_t *out_buffer,
uint64_t offset, uint64_t size)
{
char print_buffer[64];
int formatted_len;
formatted_len = snprintf(print_buffer, sizeof(print_buffer), "%u\n",
dev->fw_debug_log_level);
if (formatted_len <= 0) {
return -1;
}
return read_from_buffer_to_user(out_buffer, size, offset, print_buffer,
(uint64_t)formatted_len);
}
enum pva_error pva_kmd_debugfs_create_nodes(struct pva_kmd_device *pva)
{
static const char *vpu_ocd_names[NUM_VPU_BLOCKS] = { "ocd_vpu0_v3",
"ocd_vpu1_v3" };
struct pva_kmd_file_ops *profiling_fops;
enum pva_error err;
pva_kmd_debugfs_create_bool(pva, "stats_enabled",
&pva->debugfs_context.stats_enable);
pva_kmd_debugfs_create_bool(pva, "vpu_debug",
&pva->debugfs_context.vpu_debug);
// Create profiling_level file operations
profiling_fops = &pva->debugfs_context.profiling_level_fops;
profiling_fops->read = profiling_level_read;
profiling_fops->write = profiling_level_write;
profiling_fops->open = NULL;
profiling_fops->release = NULL;
profiling_fops->pdev = pva;
err = pva_kmd_debugfs_create_file(pva, "profiling_level",
profiling_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create profiling_level debugfs file");
return err;
}
pva->debugfs_context.vpu_fops.read = &get_vpu_stats;
pva->debugfs_context.vpu_fops.write = NULL;
pva->debugfs_context.vpu_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(pva, "vpu_stats",
&pva->debugfs_context.vpu_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to create vpu_stats debugfs file");
return err;
}
for (uint32_t i = 0; i < NUM_VPU_BLOCKS; i++) {
pva->debugfs_context.vpu_ocd_fops[i].open =
&pva_kmd_vpu_ocd_open;
pva->debugfs_context.vpu_ocd_fops[i].release =
&pva_kmd_vpu_ocd_release;
pva->debugfs_context.vpu_ocd_fops[i].read =
&pva_kmd_vpu_ocd_read;
pva->debugfs_context.vpu_ocd_fops[i].write =
&pva_kmd_vpu_ocd_write;
pva->debugfs_context.vpu_ocd_fops[i].pdev = pva;
pva->debugfs_context.vpu_ocd_fops[i].file_data =
(void *)&pva->regspec.vpu_dbg_instr_reg_offset[i];
err = pva_kmd_debugfs_create_file(
pva, vpu_ocd_names[i],
&pva->debugfs_context.vpu_ocd_fops[i]);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create vpu_ocd debugfs file");
return err;
}
}
pva->debugfs_context.allowlist_ena_fops.read =
&get_vpu_allowlist_enabled;
pva->debugfs_context.allowlist_ena_fops.write = &update_vpu_allowlist;
pva->debugfs_context.allowlist_ena_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(
pva, "vpu_app_authentication",
&pva->debugfs_context.allowlist_ena_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create vpu_app_authentication debugfs file");
return err;
}
pva->debugfs_context.allowlist_path_fops.read = &get_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.write =
&update_vpu_allowlist_path;
pva->debugfs_context.allowlist_path_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(
pva, "allowlist_path",
&pva->debugfs_context.allowlist_path_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to create allowlist_path debugfs file");
return err;
}
pva->debugfs_context.fw_debug_log_level_fops.write =
&update_fw_debug_log_level;
pva->debugfs_context.fw_debug_log_level_fops.read =
&get_fw_debug_log_level;
pva->debugfs_context.fw_debug_log_level_fops.pdev = pva;
err = pva_kmd_debugfs_create_file(
pva, "fw_debug_log_level",
&pva->debugfs_context.fw_debug_log_level_fops);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Failed to create fw_debug_log_level debugfs file");
return err;
}
pva_kmd_device_init_profiler(pva);
pva_kmd_device_init_tegra_stats(pva);
return PVA_SUCCESS;
}
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *pva)
{
pva_kmd_device_deinit_tegra_stats(pva);
pva_kmd_device_deinit_profiler(pva);
pva_kmd_debugfs_remove_nodes(pva);
}
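pva_kmd_debugfs_create_nodes() now reports failures instead of returning void, so the probe-time caller is expected to check the result. A minimal sketch (the exact caller is outside this diff):

    err = pva_kmd_debugfs_create_nodes(pva);
    if (err != PVA_SUCCESS) {
        pva_kmd_log_err("Failed to create PVA debugfs nodes");
        /* Debugfs is diagnostic-only; continue or propagate as appropriate. */
    }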

View File

@@ -37,24 +37,7 @@ struct pva_kmd_debugfs_context {
struct pva_kmd_file_ops fw_debug_log_level_fops;
};
void pva_kmd_debugfs_create_nodes(struct pva_kmd_device *dev);
enum pva_error pva_kmd_debugfs_create_nodes(struct pva_kmd_device *dev);
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *dev);
int64_t get_vpu_stats(struct pva_kmd_device *dev, void *file_data,
uint8_t *out_buffer, uint64_t offset, uint64_t size);
int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size);
int64_t get_vpu_allowlist_enabled(struct pva_kmd_device *pva, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size);
int64_t update_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size);
int64_t get_vpu_allowlist_path(struct pva_kmd_device *pva, void *file_data,
uint8_t *out_buffer, uint64_t offset,
uint64_t size);
int64_t update_fw_debug_log_level(struct pva_kmd_device *dev, void *file_data,
const uint8_t *in_buffer, uint64_t offset,
uint64_t size);
#endif //PVA_KMD_DEBUGFS_H

View File

@@ -33,10 +33,11 @@
* Initialization through CCQ is only intended for KMD's own resource table (the
* first resource table created).
*/
void pva_kmd_send_resource_table_info_by_ccq(
static enum pva_error pva_kmd_send_resource_table_info_by_ccq(
struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table)
{
enum pva_error err;
uint64_t addr = res_table->table_mem->iova;
uint32_t n_entries = res_table->n_entries;
uint64_t ccq_entry =
@@ -51,8 +52,9 @@ void pva_kmd_send_resource_table_info_by_ccq(
err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
ASSERT(err == PVA_SUCCESS);
pva_kmd_mutex_unlock(&pva->ccq0_lock);
return err;
}
/**
@@ -61,7 +63,8 @@ void pva_kmd_send_resource_table_info_by_ccq(
* Initialization through CCQ is only intended for KMD's own queue (the first
* queue created).
*/
void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
static enum pva_error
pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
struct pva_kmd_queue *queue)
{
enum pva_error err;
@@ -78,8 +81,9 @@ void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
ASSERT(err == PVA_SUCCESS);
pva_kmd_mutex_unlock(&pva->ccq0_lock);
return err;
}
/**
@@ -113,13 +117,13 @@ static void pva_kmd_device_init_submission(struct pva_kmd_device *pva)
/* Init KMD's resource table */
err = pva_kmd_resource_table_init(&pva->dev_resource_table, pva,
PVA_R5_SMMU_CONTEXT_ID,
PVA_KMD_MAX_NUM_KMD_RESOURCES,
PVA_KMD_MAX_NUM_KMD_DMA_CONFIGS);
PVA_KMD_MAX_NUM_KMD_RESOURCES);
ASSERT(err == PVA_SUCCESS);
/* Allocate memory for submission*/
chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
PVA_MAX_CMDBUF_CHUNK_SIZE, PVA_KMD_MAX_NUM_KMD_CHUNKS);
pva_kmd_get_max_cmdbuf_chunk_size(pva),
PVA_KMD_MAX_NUM_KMD_CHUNKS);
size = safe_addu64(chunk_mem_size, (uint64_t)sizeof(uint32_t));
/* Allocate one post fence at the end. We don't need to free this memory
@@ -138,7 +142,7 @@ static void pva_kmd_device_init_submission(struct pva_kmd_device *pva)
/* Init chunk pool */
pva_kmd_cmdbuf_chunk_pool_init(
&pva->chunk_pool, pva->submit_memory_resource_id, 0,
chunk_mem_size, PVA_MAX_CMDBUF_CHUNK_SIZE,
chunk_mem_size, pva_kmd_get_max_cmdbuf_chunk_size(pva),
PVA_KMD_MAX_NUM_KMD_CHUNKS, pva->submit_memory->va);
/* Init fence */
@@ -167,21 +171,25 @@ static void pva_kmd_device_deinit_submission(struct pva_kmd_device *pva)
pva_kmd_drop_resource(&pva->dev_resource_table,
pva->submit_memory_resource_id);
pva_kmd_resource_table_deinit(&pva->dev_resource_table);
pva_kmd_queue_deinit(&pva->dev_queue);
pva_kmd_device_memory_free(pva->queue_memory);
}
struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
uint32_t device_index,
bool app_authenticate)
bool app_authenticate,
bool test_mode)
{
struct pva_kmd_device *pva;
enum pva_error err;
uint32_t chunk_size;
uint32_t size;
if (test_mode) {
pva_kmd_log_err("Test mode is enabled");
}
pva = pva_kmd_zalloc_nofail(sizeof(*pva));
pva->test_mode = test_mode;
pva->device_index = device_index;
pva->load_from_gsc = false;
pva->is_hv_mode = true;
@@ -211,13 +219,6 @@ struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
pva_kmd_device_plat_init(pva);
chunk_size = safe_mulu32((uint32_t)sizeof(struct pva_syncpt_rw_info),
(uint32_t)PVA_NUM_RW_SYNCPTS_PER_CONTEXT);
err = pva_kmd_block_allocator_init(&pva->syncpt_allocator,
pva->syncpt_rw, 0, chunk_size,
PVA_MAX_NUM_USER_CONTEXTS);
ASSERT(err == PVA_SUCCESS);
pva_kmd_device_init_submission(pva);
err = pva_kmd_init_vpu_app_auth(pva, app_authenticate);
@@ -257,7 +258,6 @@ void pva_kmd_device_destroy(struct pva_kmd_device *pva)
pva_kmd_wait_for_active_contexts(pva);
pva_kmd_device_deinit_submission(pva);
pva_kmd_device_plat_deinit(pva);
pva_kmd_block_allocator_deinit(&pva->syncpt_allocator);
pva_kmd_block_allocator_deinit(&pva->context_allocator);
pva_kmd_free(pva->context_mem);
pva_kmd_mutex_deinit(&pva->ccq0_lock);
@@ -266,44 +266,71 @@ void pva_kmd_device_destroy(struct pva_kmd_device *pva)
pva_kmd_free(pva);
}
static enum pva_error
pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
uint32_t level)
static enum pva_error config_fw_by_cmds(struct pva_kmd_device *pva)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_set_profiling_level *cmd;
uint32_t fence_val;
enum pva_error err;
enum pva_error err = PVA_SUCCESS;
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
err = pva_kmd_notify_fw_enable_profiling(pva);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_set_profiling_level(cmd, level);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
/* Set FW debug log level */
err = pva_kmd_notify_fw_set_debug_log_level(pva,
pva->fw_debug_log_level);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
// If the user had set profiling level before power-on, send the update to FW
err = pva_kmd_notify_fw_set_profiling_level(
pva, pva->debugfs_context.profiling_level);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when setting profiling level");
goto err_out;
}
return PVA_SUCCESS;
err_out:
return err;
}
enum pva_error pva_kmd_config_fw_after_boot(struct pva_kmd_device *pva)
{
enum pva_error err = PVA_SUCCESS;
/* Reset KMD queue */
pva->dev_queue.queue_header->cb_head = 0;
pva->dev_queue.queue_header->cb_tail = 0;
err = pva_kmd_send_resource_table_info_by_ccq(pva,
&pva->dev_resource_table);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_shared_buffer_init(pva, PVA_PRIV_CCQ_ID,
PVA_KMD_FW_BUF_ELEMENT_SIZE,
PVA_KMD_FW_PROFILING_BUF_NUM_ELEMENTS,
NULL, NULL);
if (err != PVA_SUCCESS) {
pva_kmd_log_err_u64(
"pva kmd buffer initialization failed for interface ",
PVA_PRIV_CCQ_ID);
goto err_out;
}
err = config_fw_by_cmds(pva);
if (err != PVA_SUCCESS) {
goto err_out;
}
err_out:
return err;
}
enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva)
{
enum pva_error err = PVA_SUCCESS;
@@ -321,36 +348,26 @@ enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva)
if (err != PVA_SUCCESS) {
goto poweroff;
}
/* Reset KMD queue */
pva->dev_queue.queue_header->cb_head = 0;
pva->dev_queue.queue_header->cb_tail = 0;
pva_kmd_send_resource_table_info_by_ccq(
pva, &pva->dev_resource_table);
pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
// TODO: need better error handling here
err = pva_kmd_shared_buffer_init(
pva, PVA_PRIV_CCQ_ID, PVA_KMD_FW_BUF_ELEMENT_SIZE,
PVA_KMD_FW_PROFILING_BUF_NUM_ELEMENTS, NULL, NULL);
err = pva_kmd_config_fw_after_boot(pva);
if (err != PVA_SUCCESS) {
pva_kmd_log_err_u64(
"pva kmd buffer initialization failed for interface ",
PVA_PRIV_CCQ_ID);
goto deinit_fw;
}
pva_kmd_notify_fw_enable_profiling(pva);
/* Set FW debug log level */
pva_kmd_notify_fw_set_debug_log_level(pva,
pva->fw_debug_log_level);
// If the user had set profiling level before power-on, send the update to FW
pva_kmd_notify_fw_set_profiling_level(
pva, pva->debugfs_context.profiling_level);
} else {
// Once firmware is aborted, we no longer allow incrementing PVA
// refcount. This makes sure refcount will eventually reach 0 and allow
// device to be powered off.
if (pva->recovery) {
pva_kmd_log_err_u64(
"PVA firmware aborted. "
"Waiting for active PVA uses to finish. Remaining",
pva->refcount);
err = PVA_ERR_FW_ABORTED;
goto unlock;
}
}
pva->refcount = safe_addu32(pva->refcount, 1U);
pva->refcount = safe_addu32(pva->refcount, 1U);
pva_kmd_mutex_unlock(&pva->powercycle_lock);
return PVA_SUCCESS;
@@ -371,15 +388,15 @@ void pva_kmd_device_idle(struct pva_kmd_device *pva)
ASSERT(pva->refcount > 0);
pva->refcount--;
if (pva->refcount == 0) {
if (!pva->recovery) {
/* Disable FW profiling */
/* TODO: once debugfs is up, move these calls */
pva_kmd_notify_fw_disable_profiling(pva);
err = pva_kmd_notify_fw_disable_profiling(pva);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"pva_kmd_notify_fw_disable_profiling failed during device idle");
}
// TODO: need better error handling here
err = pva_kmd_shared_buffer_deinit(pva, PVA_PRIV_CCQ_ID);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("pva_kmd_shared_buffer_deinit failed");
pva_kmd_log_err(
"pva_kmd_shared_buffer_deinit failed during device idle");
}
pva_kmd_deinit_fw(pva);
pva_kmd_power_off(pva);
@@ -397,9 +414,12 @@ enum pva_error pva_kmd_ccq_push_with_timeout(struct pva_kmd_device *pva,
if (timeout_us == 0) {
pva_kmd_log_err(
"pva_kmd_ccq_push_with_timeout Timed out");
pva_kmd_abort(pva);
pva_kmd_abort_fw(pva);
return PVA_TIMEDOUT;
}
if (pva->recovery) {
return PVA_ERR_FW_ABORTED;
}
pva_kmd_sleep_us(sleep_interval_us);
timeout_us = sat_sub64(timeout_us, sleep_interval_us);
}
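Taken together, pva_kmd_device_busy() and pva_kmd_device_idle() form a refcounted power gate: the first busy call boots the FW and runs pva_kmd_config_fw_after_boot(), and the last idle call deinitializes and powers it off. A minimal caller sketch (illustrative only):

    err = pva_kmd_device_busy(pva);
    if (err != PVA_SUCCESS) {
        /* May be PVA_ERR_FW_ABORTED while recovery is pending. */
        return err;
    }
    /* ...interact with the firmware... */
    pva_kmd_device_idle(pva);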

View File

@@ -4,6 +4,7 @@
#ifndef PVA_KMD_DEVICE_H
#define PVA_KMD_DEVICE_H
#include "pva_constants.h"
#include "pva_fw.h"
#include "pva_kmd_cmdbuf.h"
#include "pva_kmd_utils.h"
#include "pva_kmd_mutex.h"
@@ -26,9 +27,6 @@
#define NV_PVA1_CLASS_ID 0xF2
struct pva_syncpt_rw_info {
/** Don't switch the field order: syncpt_id and syncpt_iova are prefilled during KMD boot,
* and the first field is updated by pva_kmd_allocator every time it is freed */
uint32_t syncpt_value;
uint32_t syncpt_id;
uint64_t syncpt_iova;
};
@@ -127,12 +125,13 @@ struct pva_kmd_device {
uint8_t bl_sector_pack_format;
/** Offset between 2 syncpoints */
uint32_t syncpt_offset;
uint64_t syncpt_ro_iova;
uint64_t syncpt_rw_iova;
uint32_t num_syncpts;
struct pva_syncpt_rw_info syncpt_rw[PVA_NUM_RW_SYNCPTS];
struct pva_kmd_block_allocator syncpt_allocator;
uint32_t syncpt_page_size;
uint64_t ro_syncpt_base_iova;
uint32_t num_ro_syncpts;
uint64_t rw_syncpt_base_iova;
uint32_t rw_syncpt_region_size;
struct pva_syncpt_rw_info rw_syncpts[PVA_NUM_RW_SYNCPTS];
struct vmem_region *vmem_regions_tab;
bool support_hwseq_frame_linking;
@@ -145,11 +144,14 @@ struct pva_kmd_device {
/** Carveout info for FW */
struct pva_co_info fw_carveout;
bool test_mode;
};
struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
uint32_t device_index,
bool app_authenticate);
bool app_authenticate,
bool test_mode);
void pva_kmd_device_destroy(struct pva_kmd_device *pva);
@@ -161,11 +163,7 @@ enum pva_error pva_kmd_ccq_push_with_timeout(struct pva_kmd_device *pva,
uint64_t sleep_interval_us,
uint64_t timeout_us);
void pva_kmd_send_resource_table_info_by_ccq(
struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table);
void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
struct pva_kmd_queue *queue);
enum pva_error pva_kmd_config_fw_after_boot(struct pva_kmd_device *pva);
bool pva_kmd_device_maybe_on(struct pva_kmd_device *pva);
@@ -177,4 +175,14 @@ static inline uint32_t pva_kmd_get_device_class_id(struct pva_kmd_device *pva)
return NV_PVA1_CLASS_ID;
}
}
static inline uint16_t
pva_kmd_get_max_cmdbuf_chunk_size(struct pva_kmd_device *pva)
{
if (pva->test_mode) {
return PVA_TEST_MODE_MAX_CMDBUF_CHUNK_SIZE;
} else {
return PVA_MAX_CMDBUF_CHUNK_SIZE;
}
}
#endif // PVA_KMD_DEVICE_H
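The command-buffer chunk size is now selected per device (smaller chunks in MODS test mode), so pool sizing goes through this helper instead of the PVA_MAX_CMDBUF_CHUNK_SIZE constant. Illustrative use, matching the calls elsewhere in this change set:

    uint64_t chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
        pva_kmd_get_max_cmdbuf_chunk_size(pva), PVA_KMD_MAX_NUM_KMD_CHUNKS);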

View File

@@ -0,0 +1,266 @@
// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_devmem_pool.h"
#include "pva_kmd_utils.h"
#include "pva_api.h"
#include "pva_utils.h"
static uint64_t get_devmem_offset(struct pva_kmd_devmem_element const *devmem)
{
return (uint64_t)safe_mulu32(devmem->ele_idx,
devmem->segment->owner_pool->element_size);
}
uint64_t pva_kmd_get_devmem_iova(struct pva_kmd_devmem_element const *devmem)
{
return safe_addu64(devmem->segment->mem->iova,
get_devmem_offset(devmem));
}
void *pva_kmd_get_devmem_va(struct pva_kmd_devmem_element const *devmem)
{
return pva_offset_pointer(devmem->segment->mem->va,
get_devmem_offset(devmem));
}
static struct pva_kmd_devmem_pool_segment *
allocate_segment(struct pva_kmd_devmem_pool *pool)
{
struct pva_kmd_devmem_pool_segment *segment;
struct pva_kmd_device_memory *mem = NULL;
uint64_t segment_size = safe_mulu64((uint64_t)pool->element_size,
(uint64_t)pool->n_element_incr);
void *va;
enum pva_error err;
/* Allocate the segment structure */
segment = pva_kmd_zalloc(sizeof(*segment));
if (segment == NULL) {
goto err_out;
}
/* Allocate device memory */
mem = pva_kmd_device_memory_alloc_map(
segment_size, pool->pva, PVA_ACCESS_RW, pool->smmu_ctx_idx);
if (mem == NULL) {
goto free_segment;
}
segment->mem = mem;
segment->owner_pool = pool;
segment->n_free_ele =
pool->n_element_incr; /* Initialize all elements as free */
va = mem->va;
/* Initialize the segment allocator */
err = pva_kmd_block_allocator_init(&segment->elem_allocator, va, 0,
pool->element_size,
pool->n_element_incr);
if (err != PVA_SUCCESS) {
goto free_mem;
}
/* Add segment to the pool */
segment->next = pool->segment_list_head;
pool->segment_list_head = segment;
pool->n_free_element =
safe_addu32(pool->n_free_element, pool->n_element_incr);
return segment;
free_mem:
pva_kmd_device_memory_free(mem);
free_segment:
pva_kmd_free(segment);
err_out:
return NULL;
}
enum pva_error pva_kmd_devmem_pool_init(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_device *pva,
uint8_t smmu_ctx_idx,
uint32_t element_size,
uint32_t ele_incr_count)
{
struct pva_kmd_devmem_pool_segment *segment;
enum pva_error err = PVA_SUCCESS;
/* Initialize the pool structure */
memset(pool, 0, sizeof(*pool));
pool->smmu_ctx_idx = smmu_ctx_idx;
pool->element_size =
safe_pow2_roundup_u32(element_size, sizeof(uint64_t));
pool->n_element_incr = ele_incr_count;
pool->n_free_element = 0;
pool->segment_list_head = NULL;
pool->pva = pva;
err = pva_kmd_mutex_init(&pool->pool_lock);
if (err != PVA_SUCCESS) {
goto err_out;
}
/* Allocate the first segment */
segment = allocate_segment(pool);
if (segment == NULL) {
err = PVA_NOMEM;
goto deinit_mutex;
}
return PVA_SUCCESS;
deinit_mutex:
pva_kmd_mutex_deinit(&pool->pool_lock);
err_out:
return err;
}
static enum pva_error
pva_kmd_devmem_pool_alloc(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_element *devmem)
{
struct pva_kmd_devmem_pool_segment *segment = NULL;
struct pva_kmd_devmem_pool_segment *new_segment = NULL;
uint32_t ele_idx = (uint32_t)-1;
enum pva_error err = PVA_SUCCESS;
pva_kmd_mutex_lock(&pool->pool_lock);
/* Check if we have any free elements */
if (pool->n_free_element == 0) {
/* Need to allocate a new segment */
new_segment = allocate_segment(pool);
if (new_segment == NULL) {
err = PVA_NOMEM;
goto unlock;
}
}
/* Try to find a free element in the pool */
segment = pool->segment_list_head;
while (segment != NULL) {
void *va = NULL;
va = pva_kmd_alloc_block_unsafe(&segment->elem_allocator,
&ele_idx);
if (va != NULL) {
/* Found a free element */
break;
}
segment = segment->next;
}
ASSERT(segment != NULL);
devmem->segment = segment;
devmem->ele_idx = ele_idx;
pool->n_free_element = safe_subu32(pool->n_free_element, 1);
segment->n_free_ele = safe_subu32(segment->n_free_ele, 1);
unlock:
pva_kmd_mutex_unlock(&pool->pool_lock);
return err;
}
enum pva_error pva_kmd_devmem_pool_zalloc(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_element *devmem)
{
enum pva_error err = pva_kmd_devmem_pool_alloc(pool, devmem);
if (err != PVA_SUCCESS) {
return err;
}
memset(pva_kmd_get_devmem_va(devmem), 0, pool->element_size);
return PVA_SUCCESS;
}
static void free_segment(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_pool_segment *target_segment)
{
struct pva_kmd_devmem_pool_segment *segment;
struct pva_kmd_devmem_pool_segment *prev_segment = NULL;
/* Find previous segment to update the linked list */
segment = pool->segment_list_head;
while (segment != NULL && segment != target_segment) {
prev_segment = segment;
segment = segment->next;
}
/* The target segment must be present in the list */
ASSERT(segment != NULL);
/* Remove this segment from the list */
if (prev_segment == NULL) {
/* This is the head segment */
pool->segment_list_head = target_segment->next;
} else {
prev_segment->next = target_segment->next;
}
/* Free the segment allocator */
pva_kmd_block_allocator_deinit(&target_segment->elem_allocator);
/* Free the device memory */
pva_kmd_device_memory_free(target_segment->mem);
/* Free the segment structure */
pva_kmd_free(target_segment);
/* Update the free element count */
pool->n_free_element =
safe_subu32(pool->n_free_element, pool->n_element_incr);
}
void pva_kmd_devmem_pool_free(struct pva_kmd_devmem_element *devmem)
{
struct pva_kmd_devmem_pool *pool = devmem->segment->owner_pool;
struct pva_kmd_devmem_pool_segment *current_segment = devmem->segment;
uint32_t threshold;
pva_kmd_mutex_lock(&pool->pool_lock);
/* Free the element */
pva_kmd_free_block_unsafe(&current_segment->elem_allocator,
devmem->ele_idx);
pool->n_free_element = safe_addu32(pool->n_free_element, 1);
current_segment->n_free_ele =
safe_addu32(current_segment->n_free_ele, 1);
/* Check if the current segment is now empty using n_free_ele counter */
if (current_segment->n_free_ele ==
current_segment->elem_allocator.max_num_blocks) {
/* Only release the segment if at least n_element_incr free elements
   remain in the pool afterwards */
threshold = safe_mulu32(pool->n_element_incr, 2);
if (pool->n_free_element >= threshold) {
free_segment(pool, current_segment);
}
}
pva_kmd_mutex_unlock(&pool->pool_lock);
}
void pva_kmd_devmem_pool_deinit(struct pva_kmd_devmem_pool *pool)
{
struct pva_kmd_devmem_pool_segment *segment = pool->segment_list_head;
struct pva_kmd_devmem_pool_segment *next;
/* Free all segments */
while (segment != NULL) {
next = segment->next;
/* Free the segment allocator */
pva_kmd_block_allocator_deinit(&segment->elem_allocator);
/* Free the device memory */
pva_kmd_device_memory_free(segment->mem);
/* Free the segment structure */
pva_kmd_free(segment);
segment = next;
}
pool->segment_list_head = NULL;
pva_kmd_mutex_deinit(&pool->pool_lock);
}

View File

@@ -0,0 +1,100 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#ifndef PVA_KMD_DEVMEM_POOL_H
#define PVA_KMD_DEVMEM_POOL_H
#include "pva_api.h"
#include "pva_kmd_block_allocator.h"
#include "pva_kmd_device_memory.h"
/** @brief A segment of a device memory pool.
*
* It holds a fixed size array of device memory blocks. A pool is a linked list
* of segments.
*/
struct pva_kmd_devmem_pool_segment {
/** The owner pool. */
struct pva_kmd_devmem_pool *owner_pool;
/** The next segment in the pool. */
struct pva_kmd_devmem_pool_segment *next;
/** The device memory for the segment. */
struct pva_kmd_device_memory *mem;
/** The allocator for the elements in the segment. */
struct pva_kmd_block_allocator elem_allocator;
/** The number of free elements in the segment. */
uint32_t n_free_ele;
};
/** @brief A device memory pool that holds fixed size elements.
*
* It allocates memory in segments, each segment contains n_element_incr
* elements.
* - element_size will be rounded up to the nearest 8 bytes for alignment.
* - The pool is initialized with element_size * n_element_incr capacity.
* - Once exhausted, the pool will allocate a new segment of memory and increase
* the capacity by n_element_incr.
* - When an element is freed, the pool does not immediately release the whole
* segment even if the whole segment is empty. However, if there are 2 *
* n_element_incr free elements, the pool will release a whole segment, so
* that there's still at least n_element_incr free elements.
* - The pool is thread safe.
*/
struct pva_kmd_devmem_pool {
/** The SMMU context index for the pool. */
uint8_t smmu_ctx_idx;
/** The size of each element in the pool. */
uint32_t element_size;
/** The number of elements to allocate in each segment. */
uint32_t n_element_incr;
/** The total number of free elements in the pool, across all segments. */
uint32_t n_free_element;
/** The head of the segment list. */
struct pva_kmd_devmem_pool_segment *segment_list_head;
/** The PVA device. */
struct pva_kmd_device *pva;
/** The mutex for the pool. */
pva_kmd_mutex_t pool_lock;
};
/** @brief Device memory from a pool.
*
* It is an element in a segment of a pool.
*/
struct pva_kmd_devmem_element {
/** The segment that contains the element. */
struct pva_kmd_devmem_pool_segment *segment;
/** The index of the element in the segment. */
uint32_t ele_idx;
};
/** @brief Get the IOVA of a device memory element. */
uint64_t pva_kmd_get_devmem_iova(struct pva_kmd_devmem_element const *devmem);
/** @brief Get the virtual address of a device memory element. */
void *pva_kmd_get_devmem_va(struct pva_kmd_devmem_element const *devmem);
/** @brief Initialize a device memory pool.
*
* @param pool The device memory pool to initialize.
* @param pva The PVA device.
* @param smmu_ctx_idx The SMMU context index for the pool.
* @param element_size The size of each element in the pool.
* @param ele_incr_count The number of elements to allocate in each segment.
*/
enum pva_error pva_kmd_devmem_pool_init(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_device *pva,
uint8_t smmu_ctx_idx,
uint32_t element_size,
uint32_t ele_incr_count);
/** @brief Allocate a device memory element from a pool and zero-initialize it. */
enum pva_error
pva_kmd_devmem_pool_zalloc(struct pva_kmd_devmem_pool *pool,
struct pva_kmd_devmem_element *devmem);
/** @brief Free a device memory element from a pool. */
void pva_kmd_devmem_pool_free(struct pva_kmd_devmem_element *devmem);
/** @brief Deinitialize a device memory pool. */
void pva_kmd_devmem_pool_deinit(struct pva_kmd_devmem_pool *pool);
#endif
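A usage sketch for the pool API declared above; the 256-byte element size and the pva and smmu_ctx_idx variables are placeholders for illustration, while PVA_KMD_DMA_CONFIG_POOL_INCR comes from the constants header in this change set:

    struct pva_kmd_devmem_pool pool;
    struct pva_kmd_devmem_element elem;
    enum pva_error err;

    /* 256-byte elements, grown PVA_KMD_DMA_CONFIG_POOL_INCR at a time. */
    err = pva_kmd_devmem_pool_init(&pool, pva, smmu_ctx_idx, 256U,
                                   PVA_KMD_DMA_CONFIG_POOL_INCR);
    if (err == PVA_SUCCESS) {
        err = pva_kmd_devmem_pool_zalloc(&pool, &elem);
        if (err == PVA_SUCCESS) {
            /* IOVA for the FW side, VA for the CPU side. */
            uint64_t iova = pva_kmd_get_devmem_iova(&elem);
            void *va = pva_kmd_get_devmem_va(&elem);
            (void)iova;
            (void)va;
            pva_kmd_devmem_pool_free(&elem);
        }
        pva_kmd_devmem_pool_deinit(&pool);
    }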

View File

@@ -62,42 +62,41 @@ pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
struct pva_dma_config dma_config;
struct pva_fw_dma_slot *dyn_slots;
struct pva_fw_dma_reloc *dyn_relocs;
struct pva_fw_dma_slot *static_slots = dma_aux->static_slots;
struct pva_fw_dma_reloc *static_relocs = dma_aux->static_relocs;
struct pva_kmd_dma_access *access_sizes = dma_aux->access_sizes;
struct pva_kmd_dma_scratch_buffer *scratch_buf;
// Mapping descriptor index to channel index
uint8_t desc_to_ch[PVA_MAX_NUM_DMA_DESC];
scratch_buf = pva_kmd_zalloc(sizeof(*scratch_buf));
if (scratch_buf == NULL) {
err = PVA_NOMEM;
goto err_out;
}
for (uint32_t i = 0; i < PVA_MAX_NUM_DMA_DESC; i++) {
desc_to_ch[i] = PVA_KMD_INVALID_CH_IDX;
}
//set access_sizes to 0 by default
(void)memset(
access_sizes, 0,
(PVA_MAX_NUM_DMA_DESC * sizeof(struct pva_kmd_dma_access)));
err = pva_kmd_parse_dma_config(dma_cfg_hdr, dma_config_size,
&dma_config,
&resource_table->pva->hw_consts);
if (err != PVA_SUCCESS) {
goto err_out;
goto free_scratch_buf;
}
err = pva_kmd_validate_dma_config(&dma_config,
&resource_table->pva->hw_consts,
access_sizes,
dma_aux->hw_dma_descs_mask);
scratch_buf->access_sizes,
scratch_buf->hw_dma_descs_mask);
if (err != PVA_SUCCESS) {
goto err_out;
goto free_scratch_buf;
}
trace_dma_channels(&dma_config, desc_to_ch);
err = pva_kmd_compute_dma_access(&dma_config, access_sizes,
dma_aux->hw_dma_descs_mask);
err = pva_kmd_compute_dma_access(&dma_config, scratch_buf->access_sizes,
scratch_buf->hw_dma_descs_mask);
if (err != PVA_SUCCESS) {
goto err_out;
goto free_scratch_buf;
}
dyn_slots = pva_offset_pointer(fw_dma_cfg,
@@ -107,9 +106,10 @@ pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
dma_config.header.num_dynamic_slots *
sizeof(*dyn_slots));
pva_kmd_collect_relocs(&dma_config, access_sizes, static_slots,
pva_kmd_collect_relocs(&dma_config, scratch_buf->access_sizes,
scratch_buf->static_slots,
dma_config.header.num_static_slots,
static_relocs, dyn_slots,
scratch_buf->static_relocs, dyn_slots,
dma_config.header.num_dynamic_slots, dyn_relocs,
desc_to_ch);
@@ -117,26 +117,27 @@ pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
&dma_config, fw_dma_cfg, &fw_fetch_size,
resource_table->pva->support_hwseq_frame_linking);
dma_aux->res_table = resource_table;
err = pva_kmd_dma_use_resources(&dma_config, dma_aux);
if (err != PVA_SUCCESS) {
goto err_out;
goto free_scratch_buf;
}
err = pva_kmd_bind_static_buffers(fw_dma_cfg, dma_aux, static_slots,
dma_config.header.num_static_slots,
static_relocs,
dma_config.static_bindings,
dma_config.header.num_static_slots);
err = pva_kmd_bind_static_buffers(
fw_dma_cfg, dma_aux, scratch_buf->static_slots,
dma_config.header.num_static_slots, scratch_buf->static_relocs,
dma_config.static_bindings, dma_config.header.num_static_slots);
if (err != PVA_SUCCESS) {
goto drop_res;
}
*out_fw_fetch_size = fw_fetch_size;
pva_kmd_free(scratch_buf);
return PVA_SUCCESS;
drop_res:
pva_kmd_unload_dma_config_unsafe(dma_aux);
free_scratch_buf:
pva_kmd_free(scratch_buf);
err_out:
return err;
}
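
Disentangled from the interleaved before/after lines, the cleanup flow after
this change keeps the heap-allocated scratch buffer alive only for the
duration of the call and releases it on every exit path. A reduced sketch;
the two step functions are placeholders for the real parse/validate/bind
calls:

static enum pva_error parse_and_validate_step(void); /* placeholder */
static enum pva_error bind_step(void);               /* placeholder */

static enum pva_error load_with_scratch_example(void)
{
        struct pva_kmd_dma_scratch_buffer *scratch_buf;
        enum pva_error err;

        scratch_buf = pva_kmd_zalloc(sizeof(*scratch_buf));
        if (scratch_buf == NULL) {
                err = PVA_NOMEM;
                goto err_out;
        }
        err = parse_and_validate_step();
        if (err != PVA_SUCCESS) {
                goto free_scratch_buf;
        }
        err = bind_step();
        if (err != PVA_SUCCESS) {
                goto drop_res;
        }
        pva_kmd_free(scratch_buf);
        return PVA_SUCCESS;

drop_res:
        /* pva_kmd_unload_dma_config_unsafe(dma_aux) in the real function */
free_scratch_buf:
        pva_kmd_free(scratch_buf);
err_out:
        return err;
}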

View File

@@ -50,9 +50,10 @@ struct pva_kmd_dma_resource_aux {
uint32_t dram_res_count;
/** DRAM buffers statically referenced by the DMA configuration */
uint32_t static_dram_res_ids[PVA_KMD_MAX_NUM_DMA_DRAM_SLOTS];
};
/* Below are work buffers need during DMA configuration loading. They
* don't fit on stack. */
/* Scratch buffers needed during DMA configuration loading. They don't fit on stack. */
struct pva_kmd_dma_scratch_buffer {
struct pva_fw_dma_slot static_slots[PVA_KMD_MAX_NUM_DMA_SLOTS];
struct pva_fw_dma_reloc static_relocs[PVA_KMD_MAX_NUM_DMA_SLOTS];
struct pva_kmd_dma_access access_sizes[PVA_MAX_NUM_DMA_DESC];

View File

@@ -94,6 +94,9 @@ bind_static_dram_slot(struct pva_dma_config_resource *dma_config,
int64_t slot_access_end_addr = 0LL;
uint64_t slot_surface_combined_offset = 0ULL;
pva_math_error math_error = MATH_OP_SUCCESS;
uint8_t slot_access_flags =
PVA_EXTRACT16(slot->flags, PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB, uint8_t);
if ((slot->flags & PVA_FW_DMA_SLOT_FLAG_DRAM) == 0) {
pva_kmd_log_err("Binding DRAM buffer to incompatible slot");
@@ -101,6 +104,14 @@ bind_static_dram_slot(struct pva_dma_config_resource *dma_config,
goto out;
}
if ((slot_access_flags & dram_res->mem->iova_access_flags) !=
slot_access_flags) {
pva_kmd_log_err(
"DRAM buffer does not have the required access permissions");
err = PVA_INVALID_BINDING;
goto out;
}
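
The check added here requires every access right demanded by the slot to be
present on the bound buffer. A standalone illustration; the flag values are
examples, not the driver's PVA_ACCESS_* encoding:

#include <stdbool.h>
#include <stdint.h>

#define EX_ACCESS_RO 0x1u /* read permitted */
#define EX_ACCESS_WO 0x2u /* write permitted */
#define EX_ACCESS_RW (EX_ACCESS_RO | EX_ACCESS_WO)

static bool access_flags_ok(uint8_t slot_access, uint8_t buffer_access)
{
        /* Every bit required by the slot must be granted on the buffer. */
        return (slot_access & buffer_access) == slot_access;
}

/* access_flags_ok(EX_ACCESS_WO, EX_ACCESS_RO) == false: a write slot cannot
 * be bound to a read-only buffer.
 * access_flags_ok(EX_ACCESS_RO, EX_ACCESS_RW) == true. */
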
if (is_block_linear) {
if (slot->flags & PVA_FW_DMA_SLOT_FLAG_CB) {
pva_kmd_log_err(

View File

@@ -218,7 +218,7 @@ validate_descriptor(const struct pva_dma_descriptor *desc,
/* DMA_DESC_LDID */
if ((desc->link_desc_id > cfg_hdr->num_descriptors) ||
((desc->link_desc_id != 0) &&
pva_is_reserved_desc(desc->link_desc_id - PVA_DMA_DESC0))) {
pva_is_reserved_desc(desc->link_desc_id - PVA_DMA_DESC_ID_BASE))) {
pva_kmd_log_err("ERR: Invalid linker Desc ID");
return PVA_INVAL;
}
@@ -423,6 +423,8 @@ pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
err = PVA_INVAL;
goto err_out;
}
dma_aux->vpu_bin_res_id = dma_cfg->header.vpu_exec_resource_id;
if (vpu_bin_rec->type != PVA_RESOURCE_TYPE_EXEC_BIN) {
pva_kmd_log_err(
"Invalid VPU exec resource id used by DMA config");
@@ -432,9 +434,6 @@ pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
vpu_bin = &vpu_bin_rec->vpu_bin;
}
dma_aux->vpu_bin_res_id = dma_cfg->header.vpu_exec_resource_id;
dma_aux->dram_res_count = 0;
/* Increment reference count for all static DRAM buffers; For static
* VMEM buffers, check that symbol ID is valid. */
for (i = 0; i < dma_cfg->header.num_static_slots; i++) {
@@ -455,7 +454,8 @@ pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
dma_aux->static_dram_res_ids[dma_aux->dram_res_count] =
slot_buf->dram.resource_id;
dma_aux->dram_res_count += 1;
dma_aux->dram_res_count =
safe_addu32(dma_aux->dram_res_count, 1U);
if (rec->type != PVA_RESOURCE_TYPE_DRAM) {
pva_kmd_log_err(
@@ -505,9 +505,10 @@ static uint16_t get_slot_id(uint16_t slot)
return slot & PVA_DMA_SLOT_ID_MASK;
}
static uint8_t get_slot_flag(uint8_t transfer_mode, bool cb_enable)
static uint16_t get_slot_flag(uint8_t transfer_mode, bool cb_enable,
bool is_dst)
{
uint8_t flags = 0;
uint16_t flags = 0;
if (transfer_mode == PVA_DMA_TRANS_MODE_VMEM) {
flags |= PVA_FW_DMA_SLOT_FLAG_VMEM_DATA;
} else if (transfer_mode == PVA_DMA_TRANS_MODE_L2SRAM) {
@@ -521,6 +522,15 @@ static uint8_t get_slot_flag(uint8_t transfer_mode, bool cb_enable)
if (cb_enable) {
flags |= PVA_FW_DMA_SLOT_FLAG_CB;
}
if (is_dst) {
flags |= PVA_INSERT(PVA_ACCESS_WO,
PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);
} else {
flags |= PVA_INSERT(PVA_ACCESS_RO,
PVA_FW_DMA_SLOT_FLAG_ACCESS_MSB,
PVA_FW_DMA_SLOT_FLAG_ACCESS_LSB);
}
return flags;
}
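
get_slot_flag now also stamps the required access type into the widened
16-bit flags word: write access for destination slots, read access for
source slots. The insert/extract pair below has the same shape; the macro
names, field position and widths are assumptions, while the real PVA_INSERT
and PVA_EXTRACT16 live in the common PVA headers:

#include <stdint.h>

#define EX_FIELD_MASK(msb, lsb) \
        ((uint16_t)(((1u << ((msb) - (lsb) + 1u)) - 1u) << (lsb)))
#define EX_INSERT(val, msb, lsb) \
        ((uint16_t)(((uint16_t)(val) << (lsb)) & EX_FIELD_MASK((msb), (lsb))))
#define EX_EXTRACT(reg, msb, lsb) \
        ((uint16_t)(((uint16_t)(reg) & EX_FIELD_MASK((msb), (lsb))) >> (lsb)))

/* With an assumed access field at bits [9:8]:
 *   flags |= EX_INSERT(access, 9, 8)   when building the slot flags, and
 *   access = EX_EXTRACT(flags, 9, 8)   when binding the buffer later. */
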
@@ -529,7 +539,7 @@ static void update_reloc_count(uint16_t slot, uint8_t transfer_mode,
struct pva_fw_dma_slot *out_static_slots,
uint16_t num_static_slots,
struct pva_fw_dma_slot *out_dyn_slots,
uint16_t num_dyn_slots)
uint16_t num_dyn_slots, bool is_dst)
{
uint8_t slot_id = get_slot_id(slot);
@@ -537,13 +547,12 @@ static void update_reloc_count(uint16_t slot, uint8_t transfer_mode,
out_dyn_slots[slot_id].reloc_count =
safe_addu16(out_dyn_slots[slot_id].reloc_count, 1U);
out_dyn_slots[slot_id].flags |=
get_slot_flag(transfer_mode, cb_enable);
get_slot_flag(transfer_mode, cb_enable, is_dst);
} else if (slot & PVA_DMA_STATIC_SLOT) {
out_static_slots[slot_id].reloc_count =
safe_addu16(out_static_slots[slot_id].reloc_count, 1U);
;
out_static_slots[slot_id].flags |=
get_slot_flag(transfer_mode, cb_enable);
get_slot_flag(transfer_mode, cb_enable, is_dst);
}
}
@@ -567,17 +576,17 @@ static void count_relocs(struct pva_dma_config const *dma_cfg,
update_reloc_count(desc->src.slot, desc->src.transfer_mode,
desc->src.cb_enable, out_static_slots,
num_static_slots, out_dyn_slots,
num_dyn_slots);
num_dyn_slots, false);
update_reloc_count(desc->dst.slot, desc->dst.transfer_mode,
desc->dst.cb_enable, out_static_slots,
num_static_slots, out_dyn_slots,
num_dyn_slots);
num_dyn_slots, true);
update_reloc_count(desc->dst2_slot, desc->dst.transfer_mode,
desc->dst.cb_enable, out_static_slots,
num_static_slots, out_dyn_slots,
num_dyn_slots);
num_dyn_slots, true);
}
}
@@ -867,10 +876,6 @@ void pva_kmd_collect_relocs(struct pva_dma_config const *dma_cfg,
uint8_t static_reloc_off[PVA_MAX_NUM_DMA_DESC * 3];
uint8_t dyn_reloc_off[PVA_MAX_NUM_DMA_DESC * 3];
memset(out_static_slots, 0,
num_static_slots * sizeof(*out_static_slots));
memset(out_dyn_slots, 0, num_dyn_slots * sizeof(*out_dyn_slots));
/* First pass: count the number of relocates for each slot */
count_relocs(dma_cfg, out_static_slots, num_static_slots, out_dyn_slots,
num_dyn_slots);

View File

@@ -16,42 +16,23 @@
enum pva_error pva_kmd_notify_fw_set_debug_log_level(struct pva_kmd_device *pva,
uint32_t log_level)
{
struct pva_kmd_submitter *submitter = &pva->submitter;
struct pva_kmd_cmdbuf_builder builder;
struct pva_cmd_set_debug_log_level *cmd;
uint32_t fence_val;
enum pva_error err;
struct pva_cmd_set_debug_log_level cmd = { 0 };
pva_kmd_set_cmd_set_debug_log_level(&cmd, log_level);
err = pva_kmd_submitter_prepare(submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_set_debug_log_level(cmd, log_level);
err = pva_kmd_submitter_submit(submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("set debug log level cmd submission failed");
goto cancel_builder;
}
err = pva_kmd_submitter_wait(submitter, fence_val,
return pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when setting debug log level");
goto err_out;
}
}
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
enum pva_error pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
uint32_t level)
{
struct pva_cmd_set_profiling_level cmd = { 0 };
pva_kmd_set_cmd_set_profiling_level(&cmd, level);
err_out:
return err;
return pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
}
void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer)
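
After this refactor both notify helpers collapse to the same three steps:
zero-initialize the command on the stack, fill it in, and hand it to the
synchronous submit helper. Disentangled from the interleaved hunk above, the
debug-log-level variant reads roughly as:

enum pva_error pva_kmd_notify_fw_set_debug_log_level(struct pva_kmd_device *pva,
                                                     uint32_t log_level)
{
        struct pva_cmd_set_debug_log_level cmd = { 0 };

        pva_kmd_set_cmd_set_debug_log_level(&cmd, log_level);
        return pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
                                       PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
                                       PVA_KMD_WAIT_FW_TIMEOUT_US);
}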

View File

@@ -15,6 +15,9 @@ struct pva_kmd_fw_print_buffer {
enum pva_error pva_kmd_notify_fw_set_debug_log_level(struct pva_kmd_device *pva,
uint32_t log_level);
enum pva_error pva_kmd_notify_fw_set_profiling_level(struct pva_kmd_device *pva,
uint32_t level);
void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer);
#endif // PVA_KMD_FW_DEBUG_H

View File

@@ -10,6 +10,7 @@
#include "pva_utils.h"
#include "pva_kmd_fw_profiler.h"
#include "pva_kmd_shared_buffer.h"
#include "pva_api_private.h"
// TODO: This is here temporarily just for testing. Should be moved to a common header
#define CMD_ID(x) PVA_EXTRACT(x, 6, 0, uint8_t)
@@ -101,13 +102,11 @@ void pva_kmd_device_deinit_profiler(struct pva_kmd_device *pva)
enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_enable_fw_profiling *cmd;
struct pva_cmd_enable_fw_profiling cmd = { 0 };
uint32_t filter = 0U;
uint8_t timestamp_type = TIMESTAMP_TYPE_CYCLE_COUNT;
uint32_t fence_val;
enum pva_error err;
enum pva_error err = PVA_SUCCESS;
struct pva_kmd_shared_buffer *profiling_buffer =
&pva->kmd_fw_buffers[PVA_PRIV_CCQ_ID];
@@ -123,26 +122,14 @@ enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
return PVA_SUCCESS;
}
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_enable_fw_profiling(cmd, filter, timestamp_type);
pva_kmd_set_cmd_enable_fw_profiling(&cmd, filter, timestamp_type);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
err = pva_kmd_submit_cmd_sync(dev_submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when initializing context");
goto err_out;
pva_kmd_log_err("Failed to submit command");
goto out;
}
pva->debugfs_context.g_fw_profiling_config.enabled = true;
@@ -155,38 +142,22 @@ enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
8 :
4;
return PVA_SUCCESS;
err_out:
out:
return err;
}
enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_disable_fw_profiling *cmd;
uint32_t fence_val;
struct pva_cmd_disable_fw_profiling cmd = { 0 };
enum pva_error err;
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_disable_fw_profiling(cmd);
pva_kmd_set_cmd_disable_fw_profiling(&cmd);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
err = pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when initializing context");
pva_kmd_log_err("Failed to submit command");
goto err_out;
}
@@ -194,6 +165,7 @@ enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva)
pva->debugfs_context.g_fw_profiling_config.filter = 0x0;
return PVA_SUCCESS;
err_out:
return err;
}

View File

@@ -50,7 +50,7 @@ void pva_kmd_handle_hyp_msg(void *pva_dev, uint32_t const *data, uint8_t len)
memcpy(abort_msg + 2, &data[1], size);
abort_msg[PVA_FW_MSG_ABORT_STR_MAX_LEN] = '\0';
pva_kmd_log_err(abort_msg);
pva_kmd_abort(pva);
pva_kmd_abort_fw(pva);
} break;
case PVA_FW_MSG_TYPE_FLUSH_PRINT:
pva_kmd_drain_fw_print(&pva->fw_print_buffer);

View File

@@ -86,7 +86,6 @@ pva_kmd_op_memory_register_async(struct pva_kmd_context *ctx,
err = PVA_NOMEM;
goto err_out;
}
if (args->segment == PVA_MEMORY_SEGMENT_R5) {
smmu_ctx_id = PVA_R5_SMMU_CONTEXT_ID;
} else {
@@ -168,8 +167,8 @@ static enum pva_error pva_kmd_op_executable_register_async(
}
args = (struct pva_ops_executable_register *)input_buffer;
if (args->exec_size + sizeof(struct pva_ops_executable_register) >
size) {
if (args->exec_size >
(size - sizeof(struct pva_ops_executable_register))) {
pva_kmd_log_err("Executable register payload size too small");
return PVA_INVAL;
}
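
The rewritten bound check keeps the addition off the left-hand side, where a
large exec_size could wrap and defeat the comparison; subtracting the header
size from the total instead is overflow-free once total >= header holds. The
same shape in isolation:

#include <stdbool.h>
#include <stdint.h>

static bool payload_fits(uint32_t payload_size, uint32_t total_size,
                         uint32_t header_size)
{
        return (total_size >= header_size) &&
               (payload_size <= (total_size - header_size));
}
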
@@ -404,8 +403,10 @@ exit_loop:
post_fence->flags |= PVA_FW_POSTFENCE_FLAGS_USER_FENCE;
submit_error = pva_kmd_submitter_submit_with_fence(
&ctx->submitter, &cmdbuf_builder, post_fence);
ASSERT(submit_error == PVA_SUCCESS);
if (err == PVA_SUCCESS) {
err = submit_error;
}
out:
return err;
}
@@ -434,97 +435,14 @@ pva_kmd_op_context_init(struct pva_kmd_context *ctx, const void *input_buffer,
err = pva_kmd_context_init(ctx, ctx_init_args->resource_table_capacity);
ctx_init_out.error = err;
ctx_init_out.ccq_shm_hdl = (uint64_t)ctx->ccq_shm_handle;
ctx_init_out.max_cmdbuf_chunk_size =
pva_kmd_get_max_cmdbuf_chunk_size(ctx->pva);
produce_data(out_buffer, &ctx_init_out, sizeof(ctx_init_out));
return PVA_SUCCESS;
}
static enum pva_error pva_kmd_op_syncpt_register_async(
struct pva_kmd_context *ctx, const void *input_buffer,
uint32_t input_buffer_size, struct pva_kmd_ops_buffer *out_buffer,
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
{
enum pva_error err;
struct pva_syncpt_rw_info *syncpts;
struct pva_kmd_device_memory dev_mem;
uint32_t resource_id = 0;
struct pva_cmd_update_resource_table *update_cmd;
struct pva_resource_entry entry = { 0 };
struct pva_ops_response_syncpt_register syncpt_register_out = { 0 };
if (input_buffer_size != sizeof(struct pva_ops_syncpt_register)) {
pva_kmd_log_err("Syncpt register size is not correct");
return PVA_INVAL;
}
if (!access_ok(out_buffer,
sizeof(struct pva_ops_response_syncpt_register))) {
return PVA_INVAL;
}
/* Register RO syncpts */
dev_mem.iova = ctx->pva->syncpt_ro_iova;
dev_mem.va = 0;
dev_mem.size = ctx->pva->syncpt_offset * ctx->pva->num_syncpts;
dev_mem.pva = ctx->pva;
dev_mem.smmu_ctx_idx = PVA_R5_SMMU_CONTEXT_ID;
err = pva_kmd_add_syncpt_resource(&ctx->ctx_resource_table, &dev_mem,
&resource_id);
if (err != PVA_SUCCESS) {
goto err_out;
}
syncpt_register_out.syncpt_ro_res_id = resource_id;
syncpt_register_out.num_ro_syncpoints = ctx->pva->num_syncpts;
update_cmd =
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
ASSERT(update_cmd != NULL);
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
&entry);
ASSERT(err == PVA_SUCCESS);
pva_kmd_set_cmd_update_resource_table(
update_cmd, ctx->resource_table_id, resource_id, &entry);
/* Register RW syncpts */
pva_kmd_mutex_lock(&ctx->pva->syncpt_allocator.allocator_lock);
syncpts = (struct pva_syncpt_rw_info *)pva_kmd_get_block_unsafe(
&ctx->pva->syncpt_allocator, ctx->syncpt_block_index);
ASSERT(syncpts != NULL);
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS_PER_CONTEXT; i++) {
ctx->syncpt_ids[i] = syncpts[i].syncpt_id;
syncpt_register_out.synpt_ids[i] = syncpts[i].syncpt_id;
}
dev_mem.iova = syncpts[0].syncpt_iova;
pva_kmd_mutex_unlock(&ctx->pva->syncpt_allocator.allocator_lock);
dev_mem.va = 0;
dev_mem.size = ctx->pva->syncpt_offset * PVA_NUM_RW_SYNCPTS_PER_CONTEXT;
dev_mem.pva = ctx->pva;
dev_mem.smmu_ctx_idx = PVA_R5_SMMU_CONTEXT_ID;
err = pva_kmd_add_syncpt_resource(&ctx->ctx_resource_table, &dev_mem,
&resource_id);
if (err != PVA_SUCCESS) {
goto err_out;
}
syncpt_register_out.syncpt_rw_res_id = resource_id;
syncpt_register_out.synpt_size = ctx->pva->syncpt_offset;
update_cmd =
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
ASSERT(update_cmd != NULL);
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
&entry);
ASSERT(err == PVA_SUCCESS);
pva_kmd_set_cmd_update_resource_table(
update_cmd, ctx->resource_table_id, resource_id, &entry);
err_out:
syncpt_register_out.error = err;
produce_data(out_buffer, &syncpt_register_out,
sizeof(syncpt_register_out));
return PVA_SUCCESS;
}
static enum pva_error
pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer,
uint32_t input_buffer_size,
@@ -532,6 +450,7 @@ pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer,
{
const struct pva_ops_queue_create *queue_create_args;
struct pva_ops_response_queue_create queue_out_args = { 0 };
const struct pva_syncpt_rw_info *syncpt_info;
uint32_t queue_id = PVA_INVALID_QUEUE_ID;
enum pva_error err = PVA_SUCCESS;
@@ -553,10 +472,12 @@ pva_kmd_op_queue_create(struct pva_kmd_context *ctx, const void *input_buffer,
goto out;
}
syncpt_info = pva_kmd_queue_get_rw_syncpt_info(ctx, queue_id);
queue_out_args.error = err;
queue_out_args.queue_id = queue_id;
pva_kmd_read_syncpt_val(ctx->pva, ctx->syncpt_ids[queue_id],
&queue_out_args.syncpt_fence_counter);
queue_out_args.syncpt_id = syncpt_info->syncpt_id;
pva_kmd_read_syncpt_val(ctx->pva, syncpt_info->syncpt_id,
&queue_out_args.syncpt_current_value);
out:
produce_data(out_buffer, &queue_out_args,
@@ -687,15 +608,16 @@ pva_kmd_op_synced_submit(struct pva_kmd_context *ctx, const void *input_buffer,
err = pva_kmd_submitter_submit(&ctx->submitter, &cmdbuf_builder,
&fence_val);
/* TODO: handle this error */
ASSERT(err == PVA_SUCCESS);
if (err != PVA_SUCCESS) {
goto cancel_submit;
}
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
goto err_out;
goto cancel_submit;
}
return PVA_SUCCESS;
@@ -758,11 +680,6 @@ pva_kmd_sync_ops_handler(struct pva_kmd_context *ctx,
ctx, input_buffer, input_buffer_size, out_arg,
pva_kmd_op_memory_register_async);
break;
case PVA_OPS_OPCODE_SYNCPT_REGISTER:
err = pva_kmd_op_synced_submit(
ctx, input_buffer, input_buffer_size, out_arg,
pva_kmd_op_syncpt_register_async);
break;
case PVA_OPS_OPCODE_EXECUTABLE_REGISTER:
err = pva_kmd_op_synced_submit(
ctx, input_buffer, input_buffer_size, out_arg,
@@ -798,11 +715,6 @@ enum pva_error pva_kmd_ops_handler(struct pva_kmd_context *ctx,
struct pva_kmd_ops_buffer in_buffer = { 0 }, out_buffer = { 0 };
enum pva_error err = PVA_SUCCESS;
if (ctx->pva->recovery) {
pva_kmd_log_err("PVA firmware aborted. No KMD ops allowed.");
return PVA_ERR_FW_ABORTED;
}
in_buffer.base = ops_buffer;
in_buffer.size = ops_size;

View File

@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_utils.h"
#include "pva_fw.h"
#include "pva_kmd_device_memory.h"
@@ -14,11 +15,8 @@
enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
enum pva_error err = PVA_SUCCESS;
struct pva_cmd_suspend_fw *fw_suspend;
uint32_t fence_val;
struct pva_cmd_suspend_fw cmd = { 0 };
pva_kmd_mutex_lock(&pva->powercycle_lock);
if (pva->refcount == 0u) {
@@ -27,44 +25,16 @@ enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva)
goto err_out;
}
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"PVA: Prepare submitter for FW suspend command failed\n");
goto err_out;
}
pva_kmd_set_cmd_suspend_fw(&cmd);
//Build args
fw_suspend = pva_kmd_reserve_cmd_space(&builder, sizeof(*fw_suspend));
if (fw_suspend == NULL) {
pva_kmd_log_err(
"PVA: Memory alloc for FW suspend command failed\n");
err = PVA_NOMEM;
goto cancel_submit;
}
pva_kmd_set_cmd_suspend_fw(fw_suspend);
//Submit
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"PVA: Submission for FW suspend command failed\n");
goto cancel_submit;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
err = pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"PVA: Waiting for FW timed out when preparing for suspend state\n");
pva_kmd_log_err("PVA: Failed to submit FW suspend command\n");
goto err_out;
}
cancel_submit:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
pva_kmd_mutex_unlock(&pva->powercycle_lock);
return err;
@@ -77,9 +47,11 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
struct pva_cmd_init_resource_table *res_cmd;
struct pva_cmd_init_queue *queue_cmd;
struct pva_cmd_resume_fw *fw_resume;
struct pva_cmd_init_shared_dram_buffer *shared_buf_cmd;
enum pva_error err;
uint32_t fence_val;
struct pva_kmd_queue *queue;
const struct pva_syncpt_rw_info *syncpt_info;
pva_kmd_mutex_lock(&pva->powercycle_lock);
if (pva->refcount == 0u) {
@@ -89,8 +61,10 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
goto err_out;
}
pva_kmd_send_resource_table_info_by_ccq(pva, &pva->dev_resource_table);
pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
err = pva_kmd_config_fw_after_boot(pva);
if (err != PVA_SUCCESS) {
goto err_out;
}
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
@@ -140,14 +114,38 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
goto cancel_builder;
}
/* Initialize shared buffer */
shared_buf_cmd = pva_kmd_reserve_cmd_space(
&builder, sizeof(*shared_buf_cmd));
if (shared_buf_cmd == NULL) {
pva_kmd_log_err(
"PVA: Memory alloc for shared buffer registration in FW resume command failed\n");
err = PVA_NOMEM;
goto cancel_builder;
}
pva_dbg_printf(
"PVA: Resume shared buffer for context %d\n",
ctx->ccq_id);
pva_kmd_set_cmd_init_shared_dram_buffer(
shared_buf_cmd, ctx->ccq_id,
pva->kmd_fw_buffers[ctx->ccq_id]
.resource_memory->iova,
pva->kmd_fw_buffers[ctx->ccq_id]
.resource_memory->size);
pva_dbg_printf(
"PVA: Resume priv queue for context %d\n",
ctx->ccq_id);
syncpt_info = pva_kmd_queue_get_rw_syncpt_info(
PVA_PRIV_CCQ_ID, ctx->ccq_id);
pva_kmd_set_cmd_init_queue(
queue_cmd, PVA_PRIV_CCQ_ID,
ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/
ctx->ctx_queue.queue_memory->iova,
ctx->ctx_queue.max_num_submit);
ctx->ctx_queue.max_num_submit,
syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
/**Initialize resource table */
for (uint32_t j = 0; j < ctx->max_n_queues; j++) {
@@ -168,11 +166,16 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
goto cancel_builder;
}
syncpt_info =
pva_kmd_queue_get_rw_syncpt_info(
ctx, queue->queue_id);
pva_kmd_set_cmd_init_queue(
queue_cmd, queue->ccq_id,
queue->queue_id,
queue->queue_memory->iova,
queue->max_num_submit);
queue->max_num_submit,
syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
}
pva_kmd_mutex_unlock(
&ctx->queue_allocator.allocator_lock);
@@ -194,9 +197,12 @@ enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when resuming from suspend state");
goto err_out;
goto cancel_builder;
}
pva_kmd_mutex_unlock(&pva->powercycle_lock);
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);

View File

@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_constants.h"
#include "pva_kmd.h"
#include "pva_kmd_utils.h"
#include "pva_fw.h"
@@ -74,48 +75,23 @@ pva_kmd_queue_submit(struct pva_kmd_queue *queue,
return err;
}
void pva_kmd_queue_deinit(struct pva_kmd_queue *queue)
{
queue->queue_memory = NULL;
queue->ccq_id = PVA_INVALID_QUEUE_ID;
queue->max_num_submit = 0;
}
static enum pva_error notify_fw_queue_deinit(struct pva_kmd_context *ctx,
struct pva_kmd_queue *queue)
{
enum pva_error err = PVA_SUCCESS;
struct pva_kmd_cmdbuf_builder builder;
struct pva_cmd_deinit_queue *queue_cmd;
uint32_t fence_val;
struct pva_cmd_deinit_queue cmd = { 0 };
enum pva_error err;
err = pva_kmd_submitter_prepare(&ctx->submitter, &builder);
if (err != PVA_SUCCESS) {
goto end;
}
pva_kmd_set_cmd_deinit_queue(&cmd, queue->ccq_id, queue->queue_id);
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
if (queue_cmd == NULL) {
err = PVA_NOMEM;
goto cancel_submitter;
}
pva_kmd_set_cmd_deinit_queue(queue_cmd, queue->ccq_id, queue->queue_id);
err = pva_kmd_submitter_submit(&ctx->submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto cancel_submitter;
}
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
err = pva_kmd_submit_cmd_sync(&ctx->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
goto end;
}
return PVA_SUCCESS;
cancel_submitter:
pva_kmd_cmdbuf_builder_cancel(&builder);
end:
return err;
}
@@ -126,10 +102,9 @@ enum pva_error pva_kmd_queue_create(struct pva_kmd_context *ctx,
{
struct pva_kmd_device_memory *submission_mem_kmd = NULL;
struct pva_kmd_queue *queue = NULL;
struct pva_kmd_cmdbuf_builder builder;
struct pva_cmd_init_queue *queue_cmd;
uint32_t fence_val;
struct pva_cmd_init_queue cmd = { 0 };
enum pva_error err, tmperr;
const struct pva_syncpt_rw_info *syncpt_info;
queue = pva_kmd_zalloc_block(&ctx->queue_allocator, queue_id);
if (queue == NULL) {
@@ -160,42 +135,26 @@ enum pva_error pva_kmd_queue_create(struct pva_kmd_context *ctx,
goto err_free_kmd_memory;
}
err = pva_kmd_submitter_prepare(&ctx->submitter, &builder);
syncpt_info = pva_kmd_queue_get_rw_syncpt_info(ctx, queue->queue_id);
pva_kmd_set_cmd_init_queue(&cmd, queue->ccq_id, queue->queue_id,
queue->queue_memory->iova,
queue->max_num_submit,
syncpt_info->syncpt_id,
syncpt_info->syncpt_iova);
err = pva_kmd_submit_cmd_sync(&ctx->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
goto unmap_iova;
}
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
if (queue_cmd == NULL) {
err = PVA_NOMEM;
goto cancel_submitter;
}
ASSERT(queue_cmd != NULL);
pva_kmd_set_cmd_init_queue(queue_cmd, queue->ccq_id, queue->queue_id,
queue->queue_memory->iova,
queue->max_num_submit);
err = pva_kmd_submitter_submit(&ctx->submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto cancel_submitter;
}
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
goto cancel_submitter;
}
return PVA_SUCCESS;
cancel_submitter:
pva_kmd_cmdbuf_builder_cancel(&builder);
unmap_iova:
pva_kmd_device_memory_iova_unmap(submission_mem_kmd);
err_free_kmd_memory:
pva_kmd_device_memory_free(queue->queue_memory);
pva_kmd_queue_deinit(queue);
err_free_queue:
tmperr = pva_kmd_free_block(&ctx->queue_allocator, *queue_id);
ASSERT(tmperr == PVA_SUCCESS);
@@ -210,35 +169,40 @@ enum pva_error pva_kmd_queue_destroy(struct pva_kmd_context *ctx,
{
struct pva_kmd_queue *queue;
enum pva_error err = PVA_SUCCESS;
enum pva_error tmp_err;
/*
* TODO :
* Send command to FW to stop queue usage. Wait for ack.
* This call needs to be added after syncpoint and ccq functions are ready.
*/
pva_kmd_mutex_lock(&ctx->queue_allocator.allocator_lock);
queue = pva_kmd_get_block_unsafe(&ctx->queue_allocator, queue_id);
if (queue == NULL) {
pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock);
return PVA_INVAL;
pva_kmd_log_err("Destroying non-existent queue");
err = PVA_INVAL;
goto unlock;
}
if (!ctx->pva->recovery) {
err = notify_fw_queue_deinit(ctx, queue);
if (err != PVA_SUCCESS) {
pva_kmd_mutex_unlock(
&ctx->queue_allocator.allocator_lock);
return err;
}
//Might happen if FW is aborted. It's safe to keep going.
pva_kmd_log_err("Failed to notify FW to destroy queue");
}
pva_kmd_device_memory_iova_unmap(queue->queue_memory);
pva_kmd_device_memory_free(queue->queue_memory);
pva_kmd_queue_deinit(queue);
tmp_err = pva_kmd_free_block_unsafe(&ctx->queue_allocator, queue_id);
// This cannot fail as we have already checked for queue existence and we
// are still holding the lock
ASSERT(tmp_err == PVA_SUCCESS);
unlock:
pva_kmd_mutex_unlock(&ctx->queue_allocator.allocator_lock);
err = pva_kmd_free_block(&ctx->queue_allocator, queue_id);
ASSERT(err == PVA_SUCCESS);
return PVA_SUCCESS;
return err;
}
const struct pva_syncpt_rw_info *
pva_kmd_queue_get_rw_syncpt_info(struct pva_kmd_context *ctx, uint8_t queue_id)
{
uint8_t ctx_offset =
safe_mulu32(ctx->ccq_id, PVA_NUM_RW_SYNCPTS_PER_CONTEXT);
uint32_t syncpt_index = safe_addu32(ctx_offset, queue_id);
ASSERT(syncpt_index < PVA_NUM_RW_SYNCPTS);
return &ctx->pva->rw_syncpts[syncpt_index];
}
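
pva_kmd_queue_get_rw_syncpt_info maps a (context, queue) pair to a flat index
into the device-wide RW syncpoint array: each context owns a contiguous block
of PVA_NUM_RW_SYNCPTS_PER_CONTEXT entries. With an illustrative value of 8
per context (the real constant is defined elsewhere in the tree), context 2 /
queue 3 lands on entry 19:

#include <stdint.h>

static uint32_t rw_syncpt_index_example(uint8_t ccq_id, uint8_t queue_id,
                                        uint32_t syncpts_per_context)
{
        /* Mirrors safe_mulu32(ccq_id, per_context) followed by safe_addu32. */
        return ((uint32_t)ccq_id * syncpts_per_context) + (uint32_t)queue_id;
}

/* rw_syncpt_index_example(2U, 3U, 8U) == 19U */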

View File

@@ -33,6 +33,8 @@ enum pva_error
pva_kmd_queue_submit(struct pva_kmd_queue *queue,
struct pva_fw_cmdbuf_submit_info const *submit_info);
uint32_t pva_kmd_queue_space(struct pva_kmd_queue *queue);
void pva_kmd_queue_deinit(struct pva_kmd_queue *queue);
const struct pva_syncpt_rw_info *
pva_kmd_queue_get_rw_syncpt_info(struct pva_kmd_context *ctx, uint8_t queue_id);
#endif // PVA_KMD_QUEUE_H

View File

@@ -46,8 +46,7 @@ static uint32_t get_max_dma_config_size(struct pva_kmd_device *pva)
enum pva_error
pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
struct pva_kmd_device *pva,
uint8_t user_smmu_ctx_id, uint32_t n_entries,
uint32_t max_num_dma_configs)
uint8_t user_smmu_ctx_id, uint32_t n_entries)
{
uint32_t max_dma_config_size = get_max_dma_config_size(pva);
enum pva_error err;
@@ -56,45 +55,55 @@ pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
res_table->pva = pva;
res_table->n_entries = n_entries;
res_table->user_smmu_ctx_id = user_smmu_ctx_id;
pva_kmd_sema_init(&res_table->resource_semaphore, n_entries);
pva_kmd_mutex_init(&res_table->resource_table_lock);
size = (uint64_t)safe_mulu32(
n_entries, (uint32_t)sizeof(struct pva_resource_entry));
res_table->table_mem = pva_kmd_device_memory_alloc_map(
size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
ASSERT(res_table->table_mem != NULL);
pva_kmd_sema_init(&res_table->resource_semaphore, n_entries);
pva_kmd_mutex_init(&res_table->resource_table_lock);
if (res_table->table_mem == NULL) {
err = PVA_NOMEM;
goto deinit_locks;
}
size = (uint64_t)safe_mulu32(sizeof(struct pva_kmd_resource_record),
n_entries);
res_table->records_mem = pva_kmd_zalloc(size);
ASSERT(res_table->records_mem != NULL);
if (res_table->records_mem == NULL) {
err = PVA_NOMEM;
goto free_table_mem;
}
err = pva_kmd_block_allocator_init(
&res_table->resource_record_allocator, res_table->records_mem,
PVA_RESOURCE_ID_BASE, sizeof(struct pva_kmd_resource_record),
n_entries);
ASSERT(err == PVA_SUCCESS);
if (err != PVA_SUCCESS) {
goto free_records_mem;
}
size = (uint64_t)safe_mulu32(max_num_dma_configs, max_dma_config_size);
res_table->dma_config_mem = pva_kmd_device_memory_alloc_map(
size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
ASSERT(res_table->dma_config_mem != NULL);
err = pva_kmd_block_allocator_init(&res_table->dma_config_allocator,
res_table->dma_config_mem->va, 0,
err = pva_kmd_devmem_pool_init(&res_table->dma_config_pool, pva,
PVA_R5_SMMU_CONTEXT_ID,
max_dma_config_size,
max_num_dma_configs);
ASSERT(err == PVA_SUCCESS);
res_table->dma_aux = pva_kmd_zalloc(
safe_mulu32((uint32_t)sizeof(struct pva_kmd_dma_resource_aux),
max_num_dma_configs));
ASSERT(res_table->dma_aux != NULL);
PVA_KMD_DMA_CONFIG_POOL_INCR);
if (err != PVA_SUCCESS) {
goto free_resource_record_allocator;
}
return PVA_SUCCESS;
free_resource_record_allocator:
pva_kmd_block_allocator_deinit(&res_table->resource_record_allocator);
free_records_mem:
pva_kmd_free(res_table->records_mem);
free_table_mem:
pva_kmd_device_memory_free(res_table->table_mem);
deinit_locks:
pva_kmd_mutex_deinit(&res_table->resource_table_lock);
pva_kmd_sema_deinit(&res_table->resource_semaphore);
return err;
}
static struct pva_kmd_resource_record *
@@ -118,7 +127,7 @@ pva_kmd_alloc_resource_id(struct pva_kmd_resource_table *resource_table,
goto out;
}
rec = (struct pva_kmd_resource_record *)pva_kmd_alloc_block(
rec = (struct pva_kmd_resource_record *)pva_kmd_zalloc_block(
&resource_table->resource_record_allocator, out_resource_id);
ASSERT(rec != NULL);
@@ -141,9 +150,8 @@ pva_kmd_free_resource_id(struct pva_kmd_resource_table *resource_table,
static void
pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
uint32_t resource_id)
uint32_t resource_id, bool drop_dma_reference)
{
enum pva_error err;
struct pva_kmd_resource_record *rec = pva_kmd_get_block_unsafe(
&resource_table->resource_record_allocator, resource_id);
@@ -151,9 +159,7 @@ pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
switch (rec->type) {
case PVA_RESOURCE_TYPE_DRAM:
if (rec->dram.syncpt != true) {
pva_kmd_device_memory_free(rec->dram.mem);
}
break;
case PVA_RESOURCE_TYPE_EXEC_BIN:
pva_kmd_unload_executable(&rec->vpu_bin.symbol_table,
@@ -161,12 +167,12 @@ pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
rec->vpu_bin.sections_mem);
break;
case PVA_RESOURCE_TYPE_DMA_CONFIG: {
struct pva_kmd_dma_resource_aux *dma_aux;
dma_aux = &resource_table->dma_aux[rec->dma_config.block_index];
pva_kmd_unload_dma_config_unsafe(dma_aux);
err = pva_kmd_free_block(&resource_table->dma_config_allocator,
rec->dma_config.block_index);
ASSERT(err == PVA_SUCCESS);
if (drop_dma_reference) {
pva_kmd_unload_dma_config_unsafe(
rec->dma_config.aux_mem);
}
pva_kmd_free(rec->dma_config.aux_mem);
pva_kmd_devmem_pool_free(&rec->dma_config.devmem);
break;
}
@@ -177,33 +183,6 @@ pva_kmd_release_resource(struct pva_kmd_resource_table *resource_table,
pva_kmd_free_resource_id(resource_table, resource_id);
}
enum pva_error
pva_kmd_add_syncpt_resource(struct pva_kmd_resource_table *resource_table,
struct pva_kmd_device_memory *dev_mem,
uint32_t *out_resource_id)
{
struct pva_kmd_resource_record *rec =
pva_kmd_alloc_resource_id(resource_table, out_resource_id);
if (rec == NULL) {
pva_kmd_log_err("No more resource id");
return PVA_NO_RESOURCE_ID;
}
pva_kmd_mutex_lock(&resource_table->resource_table_lock);
if (*out_resource_id > resource_table->curr_max_resource_id) {
resource_table->curr_max_resource_id = *out_resource_id;
}
pva_kmd_mutex_unlock(&resource_table->resource_table_lock);
rec->type = PVA_RESOURCE_TYPE_DRAM;
rec->dram.mem = dev_mem;
rec->dram.syncpt = true;
rec->ref_count = 1;
return PVA_SUCCESS;
}
enum pva_error
pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
struct pva_kmd_device_memory *dev_mem,
@@ -225,7 +204,6 @@ pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
rec->type = PVA_RESOURCE_TYPE_DRAM;
rec->dram.mem = dev_mem;
rec->dram.syncpt = false;
rec->ref_count = 1;
return PVA_SUCCESS;
@@ -271,6 +249,7 @@ void pva_kmd_update_fw_resource_table(struct pva_kmd_resource_table *res_table)
entry->size_lo = iova_lo(rec->dram.mem->size);
entry->size_hi = iova_hi(rec->dram.mem->size);
entry->smmu_context_id = rec->dram.mem->smmu_ctx_idx;
entry->access_flags = rec->dram.mem->iova_access_flags;
break;
case PVA_RESOURCE_TYPE_INVALID:
break;
@@ -349,7 +328,7 @@ void pva_kmd_drop_resource_unsafe(struct pva_kmd_resource_table *resource_table,
rec->ref_count = safe_subu32(rec->ref_count, 1U);
if (rec->ref_count == 0) {
pva_kmd_release_resource(resource_table, resource_id);
pva_kmd_release_resource(resource_table, resource_id, true);
}
}
@@ -414,6 +393,7 @@ pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
entry->size_lo = iova_lo(rec->dram.mem->size);
entry->size_hi = iova_hi(rec->dram.mem->size);
entry->smmu_context_id = rec->dram.mem->smmu_ctx_idx;
entry->access_flags = rec->dram.mem->iova_access_flags;
break;
case PVA_RESOURCE_TYPE_EXEC_BIN:
entry->type = rec->type;
@@ -423,6 +403,7 @@ pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
entry->size_hi = iova_hi(rec->vpu_bin.metainfo_mem->size);
entry->smmu_context_id =
rec->vpu_bin.metainfo_mem->smmu_ctx_idx;
entry->access_flags = PVA_ACCESS_RO;
break;
case PVA_RESOURCE_TYPE_DMA_CONFIG:
entry->type = rec->type;
@@ -431,6 +412,7 @@ pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
entry->size_lo = iova_lo(rec->dma_config.size);
entry->size_hi = iova_hi(rec->dma_config.size);
entry->smmu_context_id = PVA_R5_SMMU_CONTEXT_ID;
entry->access_flags = PVA_ACCESS_RO;
break;
default:
pva_kmd_log_err("Unsupported resource type");
@@ -447,24 +429,30 @@ enum pva_error pva_kmd_add_dma_config_resource(
uint32_t dma_config_size, uint32_t *out_resource_id)
{
enum pva_error err = PVA_SUCCESS;
uint32_t block_idx, fw_fetch_size;
uint32_t fw_fetch_size;
void *fw_dma_cfg;
struct pva_kmd_dma_resource_aux *dma_aux;
struct pva_kmd_resource_record *rec;
uint32_t res_id;
struct pva_kmd_devmem_element dma_cfg_mem = { 0 };
fw_dma_cfg = pva_kmd_zalloc_block(&resource_table->dma_config_allocator,
&block_idx);
if (fw_dma_cfg == NULL) {
err = PVA_NOMEM;
err = pva_kmd_devmem_pool_zalloc(&resource_table->dma_config_pool,
&dma_cfg_mem);
if (err != PVA_SUCCESS) {
goto err_out;
}
fw_dma_cfg = pva_kmd_get_devmem_va(&dma_cfg_mem);
// Must satisfy alignment requirement for converting to struct
// pva_dma_config_resource*
ASSERT(((uintptr_t)fw_dma_cfg) % sizeof(uint64_t) == 0);
dma_aux = &resource_table->dma_aux[block_idx];
dma_aux = pva_kmd_zalloc(sizeof(struct pva_kmd_dma_resource_aux));
if (dma_aux == NULL) {
err = PVA_NOMEM;
goto free_dma_cfg_mem;
}
dma_aux->res_table = resource_table;
pva_kmd_mutex_lock(&resource_table->resource_table_lock);
err = pva_kmd_load_dma_config(resource_table, dma_cfg_hdr,
@@ -472,7 +460,7 @@ enum pva_error pva_kmd_add_dma_config_resource(
&fw_fetch_size);
pva_kmd_mutex_unlock(&resource_table->resource_table_lock);
if (err != PVA_SUCCESS) {
goto free_block;
goto free_dma_aux;
}
rec = pva_kmd_alloc_resource_id(resource_table, &res_id);
@@ -489,12 +477,9 @@ enum pva_error pva_kmd_add_dma_config_resource(
rec->type = PVA_RESOURCE_TYPE_DMA_CONFIG;
rec->ref_count = 1;
rec->dma_config.block_index = block_idx;
rec->dma_config.iova_addr = safe_addu64(
resource_table->dma_config_mem->iova,
(uint64_t)safe_mulu32(
block_idx,
resource_table->dma_config_allocator.block_size));
rec->dma_config.devmem = dma_cfg_mem;
rec->dma_config.aux_mem = dma_aux;
rec->dma_config.iova_addr = pva_kmd_get_devmem_iova(&dma_cfg_mem);
rec->dma_config.size = fw_fetch_size;
*out_resource_id = res_id;
@@ -504,8 +489,10 @@ unload_dma:
pva_kmd_mutex_lock(&resource_table->resource_table_lock);
pva_kmd_unload_dma_config_unsafe(dma_aux);
pva_kmd_mutex_unlock(&resource_table->resource_table_lock);
free_block:
pva_kmd_free_block(&resource_table->dma_config_allocator, block_idx);
free_dma_aux:
pva_kmd_free(dma_aux);
free_dma_cfg_mem:
pva_kmd_devmem_pool_free(&dma_cfg_mem);
err_out:
return err;
}
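
Skeleton of the new allocation/cleanup pairing in
pva_kmd_add_dma_config_resource, with the load, validation and record setup
elided; on success the resource record takes ownership of both allocations,
and on failure each label releases exactly what was acquired before it:

static enum pva_error
add_dma_config_skeleton(struct pva_kmd_resource_table *rt)
{
        struct pva_kmd_devmem_element dma_cfg_mem = { 0 };
        struct pva_kmd_dma_resource_aux *dma_aux;
        enum pva_error err;

        err = pva_kmd_devmem_pool_zalloc(&rt->dma_config_pool, &dma_cfg_mem);
        if (err != PVA_SUCCESS) {
                goto err_out;
        }
        dma_aux = pva_kmd_zalloc(sizeof(*dma_aux));
        if (dma_aux == NULL) {
                err = PVA_NOMEM;
                goto free_dma_cfg_mem;
        }
        /* ... pva_kmd_load_dma_config() and record setup go here; later
         * failures also pass through the unload_dma and free_dma_aux
         * labels in the real function ... */
        return PVA_SUCCESS;

free_dma_cfg_mem:
        pva_kmd_devmem_pool_free(&dma_cfg_mem);
err_out:
        return err;
}
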
@@ -523,7 +510,7 @@ pva_kmd_release_all_resources(struct pva_kmd_resource_table *res_table)
struct pva_kmd_resource_record *rec =
pva_kmd_peek_resource(res_table, id);
if (rec != NULL) {
pva_kmd_release_resource(res_table, id);
pva_kmd_release_resource(res_table, id, false);
}
}
pva_kmd_mutex_unlock(&res_table->resource_table_lock);
@@ -533,11 +520,9 @@ pva_kmd_release_all_resources(struct pva_kmd_resource_table *res_table)
void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table)
{
pva_kmd_release_all_resources(res_table);
pva_kmd_free(res_table->dma_aux);
pva_kmd_block_allocator_deinit(&res_table->dma_config_allocator);
pva_kmd_device_memory_free(res_table->dma_config_mem);
pva_kmd_block_allocator_deinit(&res_table->resource_record_allocator);
pva_kmd_free(res_table->records_mem);
pva_kmd_devmem_pool_deinit(&res_table->dma_config_pool);
pva_kmd_mutex_deinit(&res_table->resource_table_lock);
pva_kmd_sema_deinit(&res_table->resource_semaphore);
pva_kmd_device_memory_free(res_table->table_mem);

View File

@@ -14,12 +14,12 @@
#include "pva_kmd_dma_cfg.h"
#include "pva_kmd_mutex.h"
#include "pva_kmd_thread_sema.h"
#include "pva_kmd_devmem_pool.h"
struct pva_kmd_device;
struct pva_kmd_dram_resource {
struct pva_kmd_device_memory *mem;
bool syncpt;
};
struct pva_kmd_vpu_bin_resource {
@@ -29,7 +29,8 @@ struct pva_kmd_vpu_bin_resource {
};
struct pva_kmd_dma_config_resource {
uint32_t block_index;
struct pva_kmd_devmem_element devmem;
struct pva_kmd_dma_resource_aux *aux_mem;
uint64_t size;
uint64_t iova_addr;
};
@@ -70,13 +71,8 @@ struct pva_kmd_resource_table {
/** Memory for resource table entries, in R5 segment */
struct pva_kmd_device_memory *table_mem;
/** Memory for fw dma configs, in DMA segment */
struct pva_kmd_device_memory *dma_config_mem;
struct pva_kmd_block_allocator dma_config_allocator;
/** Memory for tracking resources used by DMA configuration. Single
* allocation shared by all DMA configs */
struct pva_kmd_dma_resource_aux *dma_aux;
/** Pool for FW DMA configurations */
struct pva_kmd_devmem_pool dma_config_pool;
/** Memory for resource records */
void *records_mem;
@@ -88,8 +84,7 @@ struct pva_kmd_resource_table {
enum pva_error
pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
struct pva_kmd_device *pva,
uint8_t user_smmu_ctx_id, uint32_t n_entries,
uint32_t max_num_dma_configs);
uint8_t user_smmu_ctx_id, uint32_t n_entries);
void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table);
/** KMD only writes to FW resource table during init time. Once the address of
@@ -97,11 +92,6 @@ void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table);
*/
void pva_kmd_update_fw_resource_table(struct pva_kmd_resource_table *res_table);
enum pva_error
pva_kmd_add_syncpt_resource(struct pva_kmd_resource_table *resource_table,
struct pva_kmd_device_memory *dev_mem,
uint32_t *out_resource_id);
enum pva_error
pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
struct pva_kmd_device_memory *memory,

View File

@@ -2,6 +2,7 @@
// SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_sha256.h"
#include "pva_math_utils.h"
#define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
#define ROTRIGHT(a, b) (((a) >> (b)) | ((a) << (32 - (b))))
@@ -58,9 +59,11 @@ static void sha256_transform(struct sha256_ctx *ctx, const void *data_in)
m[i] = SWAP32(data[i]);
}
for (i = 0; i < U32(64) - U32(16); ++i) {
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
m[i + U32(16)] = SIG1(m[U32(14) + i]) + m[U32(9) + i] +
SIG0(m[U32(1) + i]) + m[i];
m[i + U32(16)] = safe_wrap_add_u32(
safe_wrap_add_u32(safe_wrap_add_u32(SIG1(m[U32(14) + i]),
m[U32(9) + i]),
SIG0(m[U32(1) + i])),
m[i]);
}
a = ctx->state[0];
@@ -73,38 +76,32 @@ static void sha256_transform(struct sha256_ctx *ctx, const void *data_in)
h = ctx->state[7];
for (i = 0; i < U32(64); ++i) {
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
t1 = h + SHA_EP1(e) + CH(e, f, g) + k[i] + m[i];
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
t2 = SHA_EP0(a) + MAJ(a, b, c);
t1 = safe_wrap_add_u32(
safe_wrap_add_u32(
safe_wrap_add_u32(safe_wrap_add_u32(h,
SHA_EP1(e)),
CH(e, f, g)),
k[i]),
m[i]);
t2 = safe_wrap_add_u32(SHA_EP0(a), MAJ(a, b, c));
h = g;
g = f;
f = e;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
e = d + t1;
e = safe_wrap_add_u32(d, t1);
d = c;
c = b;
b = a;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
a = t1 + t2;
a = safe_wrap_add_u32(t1, t2);
}
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[0] += a;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[1] += b;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[2] += c;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[3] += d;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[4] += e;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[5] += f;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[6] += g;
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
ctx->state[7] += h;
ctx->state[0] = safe_wrap_add_u32(ctx->state[0], a);
ctx->state[1] = safe_wrap_add_u32(ctx->state[1], b);
ctx->state[2] = safe_wrap_add_u32(ctx->state[2], c);
ctx->state[3] = safe_wrap_add_u32(ctx->state[3], d);
ctx->state[4] = safe_wrap_add_u32(ctx->state[4], e);
ctx->state[5] = safe_wrap_add_u32(ctx->state[5], f);
ctx->state[6] = safe_wrap_add_u32(ctx->state[6], g);
ctx->state[7] = safe_wrap_add_u32(ctx->state[7], h);
}
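
The rewrite drops the per-line Coverity deviation comments in favour of
safe_wrap_add_u32, whose intent is plain modulo-2^32 addition made explicit.
The helper is presumably provided by pva_math_utils.h (included above); a
possible equivalent, shown here only to document the assumed semantics:

#include <stdint.h>

static inline uint32_t wrap_add_u32_example(uint32_t a, uint32_t b)
{
        /* Widen, add, truncate: the wraparound is intentional, which is
         * exactly what SHA-256 word arithmetic requires. */
        return (uint32_t)(((uint64_t)a + (uint64_t)b) & 0xffffffffu);
}
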
void sha256_init(struct sha256_ctx *ctx)
@@ -127,7 +124,8 @@ void sha256_update(struct sha256_ctx *ctx, const void *data, size_t len)
for (i = 0; i < len; i += U32(64)) {
ctx->bitlen &= U32(0xffffffff);
sha256_transform(ctx, ((const uint8_t *)data) + i);
ctx->bitlen += U32(512);
ctx->bitlen =
safe_wrap_add_u32((uint32_t)ctx->bitlen, U32(512));
}
}
@@ -148,7 +146,9 @@ void sha256_finalize(struct sha256_ctx *ctx, const void *input,
/* the false of this condition is illegal for this API agreement */
/* this check is here only for Coverity INT30-C */
ctx->bitlen += input_size * U32(8);
ctx->bitlen = safe_wrap_add_u32((uint32_t)ctx->bitlen,
safe_wrap_mul_u32((uint32_t)input_size,
U32(8)));
(void)memcpy(p, input, input_size);
data[input_size] = 0x80;

View File

@@ -7,82 +7,6 @@
#include "pva_kmd_shim_trace_event.h"
#include "pva_kmd_shared_buffer.h"
static void
setup_cmd_init_shared_dram_buffer(void *cmd, uint8_t interface,
struct pva_kmd_shared_buffer *fw_buffer)
{
struct pva_cmd_init_shared_dram_buffer *init_cmd =
(struct pva_cmd_init_shared_dram_buffer *)cmd;
pva_kmd_set_cmd_init_shared_dram_buffer(
init_cmd, interface, fw_buffer->resource_memory->iova,
fw_buffer->resource_memory->size);
}
static void
setup_cmd_deinit_shared_dram_buffer(void *cmd, uint8_t interface,
struct pva_kmd_shared_buffer *fw_buffer)
{
struct pva_cmd_deinit_shared_dram_buffer *deinit_cmd =
(struct pva_cmd_deinit_shared_dram_buffer *)cmd;
pva_kmd_set_cmd_deinit_shared_dram_buffer(deinit_cmd, interface);
}
static enum pva_error
notify_fw(struct pva_kmd_device *pva, uint8_t interface,
void (*setup_cmd_cb)(void *cmd, uint8_t interface,
struct pva_kmd_shared_buffer *fw_buffer),
size_t cmd_size)
{
enum pva_error err;
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_kmd_shared_buffer *fw_buffer;
void *cmd_space;
uint32_t fence_val;
ASSERT(interface < PVA_MAX_NUM_CCQ);
fw_buffer = &pva->kmd_fw_buffers[interface];
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
// Make sure FW buffer was allocated
ASSERT(fw_buffer->header != NULL);
cmd_space = pva_kmd_reserve_cmd_space(&builder, cmd_size);
ASSERT(cmd_space != NULL);
// Let the setup callback configure the specific command
setup_cmd_cb(cmd_space, interface, fw_buffer);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
// Error is either QUEUE_FULL or TIMEDOUT
goto cancel_builder;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out while processing buffer command");
goto err_out;
}
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
return err;
}
enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
uint8_t interface,
uint32_t element_size,
@@ -95,17 +19,24 @@ enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
struct pva_kmd_device_memory *device_memory;
struct pva_kmd_shared_buffer *buffer;
uint64_t buffer_size;
struct pva_cmd_init_shared_dram_buffer init_cmd = { 0 };
ASSERT(interface < PVA_MAX_NUM_CCQ);
buffer = &pva->kmd_fw_buffers[interface];
// If the buffer is already initialized, skip buffer allocation and just notify FW.
// This is needed to support suspend/resume.
if (buffer->header == NULL) {
// Ensure that the buffer body is a multiple of 'element size'
buffer_size = safe_mulu64(num_entries, element_size);
buffer_size = safe_addu64(buffer_size,
buffer_size =
safe_addu64(buffer_size,
sizeof(struct pva_fw_shared_buffer_header));
device_memory = pva_kmd_device_memory_alloc_map(
buffer_size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
device_memory =
pva_kmd_device_memory_alloc_map(buffer_size, pva,
PVA_ACCESS_RW,
PVA_R5_SMMU_CONTEXT_ID);
if (device_memory == NULL) {
return PVA_NOMEM;
}
@@ -116,8 +47,8 @@ enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
buffer->header->element_size = element_size;
buffer->header->head = 0U;
buffer->header->tail = 0U;
buffer->body =
(pva_offset_pointer(buffer->header, sizeof(*buffer->header)));
buffer->body = (pva_offset_pointer(buffer->header,
sizeof(*buffer->header)));
buffer->lock_cb = lock_cb;
buffer->unlock_cb = unlock_cb;
buffer->resource_offset = 0U;
@@ -125,12 +56,24 @@ enum pva_error pva_kmd_shared_buffer_init(struct pva_kmd_device *pva,
err = pva_kmd_bind_shared_buffer_handler(pva, interface, pva);
if (err != PVA_SUCCESS) {
pva_kmd_log_err_u64(
"Failed to bind shared buffer handler for interface",
interface);
goto free_buffer_memory;
}
} else {
device_memory = buffer->resource_memory;
}
err = notify_fw(pva, interface, setup_cmd_init_shared_dram_buffer,
sizeof(struct pva_cmd_init_shared_dram_buffer));
pva_kmd_set_cmd_init_shared_dram_buffer(
&init_cmd, interface, device_memory->iova, device_memory->size);
err = pva_kmd_submit_cmd_sync(&pva->submitter, &init_cmd,
sizeof(init_cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to submit command");
goto release_handler;
}
@@ -140,6 +83,8 @@ release_handler:
pva_kmd_release_shared_buffer_handler(pva, interface);
free_buffer_memory:
pva_kmd_device_memory_free(device_memory);
buffer->header = NULL;
buffer->resource_memory = NULL;
return err;
}
@@ -148,22 +93,26 @@ enum pva_error pva_kmd_shared_buffer_deinit(struct pva_kmd_device *pva,
{
enum pva_error err = PVA_SUCCESS;
struct pva_kmd_shared_buffer *buffer;
struct pva_cmd_deinit_shared_dram_buffer deinit_cmd = { 0 };
ASSERT(interface < PVA_MAX_NUM_CCQ);
buffer = &pva->kmd_fw_buffers[interface];
if (!pva->recovery) {
err = notify_fw(
pva, interface, setup_cmd_deinit_shared_dram_buffer,
sizeof(struct pva_cmd_deinit_shared_dram_buffer));
pva_kmd_set_cmd_deinit_shared_dram_buffer(&deinit_cmd, interface);
err = pva_kmd_submit_cmd_sync(&pva->submitter, &deinit_cmd,
sizeof(deinit_cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("Failed to deinit FW buffer");
}
// This might happen if FW is aborted. It's safe to keep going.
pva_kmd_log_err("Failed to notify FW of buffer deinit");
}
pva_kmd_release_shared_buffer_handler(pva, interface);
pva_kmd_shared_buffer_process(pva, interface);
buffer->header = NULL;
pva_kmd_device_memory_free(buffer->resource_memory);
buffer->resource_memory = NULL;
@@ -176,6 +125,7 @@ static void shared_buffer_process_msg(struct pva_kmd_device *pva,
enum pva_error err = PVA_SUCCESS;
struct pva_kmd_fw_buffer_msg_header header;
struct pva_kmd_fw_msg_vpu_trace vpu_trace;
struct pva_kmd_fw_msg_fence_trace fence_trace;
struct pva_kmd_fw_msg_res_unreg unreg_data;
struct pva_kmd_context *ctx = NULL;
void *msg_body;
@@ -214,6 +164,12 @@ static void shared_buffer_process_msg(struct pva_kmd_device *pva,
pva_kmd_shim_add_trace_vpu_exec(pva, &vpu_trace);
break;
}
case PVA_KMD_FW_BUF_MSG_TYPE_FENCE_TRACE: {
ASSERT(msg_size == sizeof(struct pva_kmd_fw_msg_fence_trace));
memcpy(&fence_trace, msg_body, sizeof(fence_trace));
pva_kmd_shim_add_trace_fence(pva, &fence_trace);
break;
}
case PVA_KMD_FW_BUF_MSG_TYPE_RES_UNREG: {
ASSERT(msg_size == sizeof(struct pva_kmd_fw_msg_res_unreg));
memcpy(&unreg_data, msg_body, sizeof(unreg_data));
@@ -281,7 +237,7 @@ void pva_kmd_shared_buffer_process(void *pva_dev, uint8_t interface)
// Note that ideally this should never happen as the buffer is expected to be
// the same size as the resource table.
// TODO: abort only the user context, not the device.
pva_kmd_abort(pva);
pva_kmd_abort_fw(pva);
}
// Buffer corresponding to CCQ 0 is used for sending messages common to a VM.
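
The KMD/FW shared buffer managed in this file is a ring of fixed-size
elements behind a small header (element_size, head, tail, as set up during
init above). A simplified consumer loop under that assumption; the struct
layout, producer/consumer roles and wrap handling are guesses made for
illustration, and the lock/unlock callbacks are omitted:

#include <stddef.h>
#include <stdint.h>

struct example_shared_buffer_header {
        uint32_t element_size;
        uint32_t head; /* advanced by the producer (FW) */
        uint32_t tail; /* advanced by the consumer (KMD) */
};

static void drain_example(struct example_shared_buffer_header *hdr,
                          uint8_t *body, uint32_t num_entries,
                          void (*handle)(const void *msg, uint32_t size))
{
        while (hdr->tail != hdr->head) {
                handle(body + ((size_t)hdr->tail * hdr->element_size),
                       hdr->element_size);
                hdr->tail = (hdr->tail + 1U) % num_entries;
        }
}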

View File

@@ -4,6 +4,7 @@
#include "pva_kmd_device.h"
#include "pva_fw_address_map.h"
#include "pva_fw_hyp.h"
#include "pva_kmd_shim_init.h"
#include "pva_kmd_thread_sema.h"
#include "pva_kmd_constants.h"
#include "pva_kmd_silicon_isr.h"
@@ -153,27 +154,12 @@ void pva_kmd_config_sid(struct pva_kmd_device *pva)
}
}
static uint32_t pva_kmd_get_syncpt_ro_offset(struct pva_kmd_device *pva)
static uint32_t get_syncpt_offset(struct pva_kmd_device *pva,
uint64_t syncpt_iova)
{
if (pva->num_syncpts > 0U) {
if (pva->num_ro_syncpts > 0U) {
uint64_t offset;
offset = safe_subu64(pva->syncpt_ro_iova,
pva_kmd_get_r5_iova_start());
ASSERT(offset <= UINT32_MAX);
return (uint32_t)offset;
} else {
// This is only for SIM mode where syncpoints are not supported.
return PVA_R5_SYNCPT_REGION_IOVA_OFFSET_NOT_SET;
}
}
static uint32_t pva_kmd_get_syncpt_rw_offset(struct pva_kmd_device *pva)
{
if (pva->num_syncpts > 0U) {
uint64_t offset;
offset = safe_subu64(pva->syncpt_rw_iova,
pva_kmd_get_r5_iova_start());
offset = safe_subu64(syncpt_iova, pva_kmd_get_r5_iova_start());
ASSERT(offset <= UINT32_MAX);
return (uint32_t)offset;
@@ -249,12 +235,17 @@ enum pva_error pva_kmd_init_fw(struct pva_kmd_device *pva)
if (pva->bl_sector_pack_format == PVA_BL_XBAR_RAW) {
boot_sema = PVA_BOOT_SEMA_USE_XBAR_RAW;
}
if (pva->test_mode) {
boot_sema |= PVA_BOOT_SEMA_TEST_MODE;
}
pva_kmd_set_sema(pva, PVA_BOOT_SEMA, boot_sema);
pva_kmd_write(pva, PVA_REG_HSP_SS2_SET_ADDR,
pva_kmd_get_syncpt_ro_offset(pva));
pva_kmd_write(pva, PVA_REG_HSP_SS3_SET_ADDR,
pva_kmd_get_syncpt_rw_offset(pva));
pva_kmd_set_sema(pva, PVA_RO_SYNC_BASE_SEMA,
get_syncpt_offset(pva, pva->ro_syncpt_base_iova));
pva_kmd_set_sema(pva, PVA_RW_SYNC_BASE_SEMA,
get_syncpt_offset(pva, pva->rw_syncpt_base_iova));
pva_kmd_set_sema(pva, PVA_RW_SYNC_SIZE_SEMA,
pva->rw_syncpt_region_size);
pva_kmd_config_sid_regs(pva);
@@ -290,6 +281,7 @@ free_sec_lic:
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
free_fw_debug_mem:
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
pva_kmd_freeze_fw(pva);
pva_kmd_device_memory_free(pva->fw_debug_mem);
free_fw_mem:
if (!pva->load_from_gsc) {
@@ -299,17 +291,14 @@ out:
return err;
}
void pva_kmd_deinit_fw(struct pva_kmd_device *pva)
void pva_kmd_freeze_fw(struct pva_kmd_device *pva)
{
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
/*
* Before powering off PVA, disable SEC error reporting.
* While powering off, PVA might generate (unexplained) error interrupts
* This causes HSM to read some PVA SEC registers. However, since PVA might
* already be powergated by this time, access to PVA SEC registers from HSM
* fails. This was discussed in Bug 3785498.
* Before freezing PVA, disable SEC error reporting.
* While setting the reset line, PVA might generate (unexplained) error
* interrupts This causes HSM to read some PVA SEC registers. However,
* since PVA might already be powergated by this time, access to PVA SEC
* registers from HSM fails. This was discussed in Bug 3785498.
*
* Note: we do not explicitly enable these errors during power on since
* 'enable' is their reset value
@@ -317,6 +306,17 @@ void pva_kmd_deinit_fw(struct pva_kmd_device *pva)
disable_sec_mission_error_reporting(pva);
disable_sec_latent_error_reporting(pva);
pva_kmd_set_reset_line(pva);
}
void pva_kmd_deinit_fw(struct pva_kmd_device *pva)
{
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
// Freeze FW so that we can free memory
pva_kmd_freeze_fw(pva);
pva_kmd_device_memory_free(pva->fw_debug_mem);
if (!pva->load_from_gsc) {
pva_kmd_device_memory_free(pva->fw_bin_mem);

View File

@@ -748,9 +748,11 @@ load_metainfo(struct pva_kmd_device *pva, uint64_t section_iova,
metainfo->num_vmem_buffers = n_symbols;
data_sections_mem = pva_offset_pointer(metainfo, sizeof(*metainfo));
if (n_data_sections > 0U && section_infos != NULL) {
memcpy(data_sections_mem, section_infos,
mulu32(n_data_sections, (uint32_t)sizeof(*section_infos),
&math_err));
}
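
The added guard keeps memcpy from being invoked with a NULL source pointer
when there are no data sections, which is undefined behaviour and a common
static-analysis finding. In isolation:

#include <stdint.h>
#include <string.h>

static void copy_sections_example(void *dst, const void *src, uint32_t count,
                                  uint32_t elem_size)
{
        if ((count > 0U) && (src != NULL)) {
                (void)memcpy(dst, src, (size_t)count * elem_size);
        }
}
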
vmem_buffers_mem = pva_offset_pointer(
data_sections_mem,

View File

@@ -42,6 +42,7 @@ int pva_kmd_hwpm_ip_pm(void *ip_dev, bool disable)
{
struct pva_kmd_device *dev = ip_dev;
enum pva_error err = PVA_SUCCESS;
int ret = 0;
if (disable) {
err = pva_kmd_device_busy(dev);
@@ -51,5 +52,10 @@ int pva_kmd_hwpm_ip_pm(void *ip_dev, bool disable)
} else {
pva_kmd_device_idle(dev);
}
return err;
if (err != PVA_SUCCESS) {
ret = -1;
}
return ret;
}

View File

@@ -45,22 +45,16 @@ void pva_kmd_hyp_isr(void *data, enum pva_kmd_intr_line intr_line)
if (wdt_val != 0) {
/* Clear interrupt status */
pva_kmd_write(pva, pva->regspec.sec_lic_intr_status,
intr_status &
PVA_MASK(PVA_REG_SEC_LIC_INTR_WDT_MSB,
PVA_REG_SEC_LIC_INTR_WDT_LSB));
pva_kmd_write(pva, pva->regspec.sec_lic_intr_status, wdt_val);
pva_kmd_log_err("PVA watchdog timeout!");
pva_kmd_abort(pva);
pva_kmd_abort_fw(pva);
}
if (h1x_val != 0) {
pva_kmd_log_err_u64("Host1x errors", h1x_val);
/* Clear interrupt status */
pva_kmd_write(pva, pva->regspec.sec_lic_intr_status,
intr_status &
PVA_MASK(PVA_REG_SEC_LIC_INTR_H1X_MSB,
PVA_REG_SEC_LIC_INTR_H1X_LSB));
pva_kmd_abort(pva);
pva_kmd_write(pva, pva->regspec.sec_lic_intr_status, h1x_val);
pva_kmd_abort_fw(pva);
}
if (hsp_val != 0) {

View File

@@ -23,3 +23,10 @@ uint32_t pva_kmd_get_ccq_space(struct pva_kmd_device *pva, uint8_t ccq_id)
PVA_REG_CCQ_STATUS2_NUM_ENTRIES_LSB, uint32_t);
return safe_subu32((uint32_t)PVA_CCQ_DEPTH, len) / 2U;
}
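/* For illustration (assumed values, not part of this change): with
 * PVA_CCQ_DEPTH raised to 14, a CCQ that already holds len == 4 entries
 * makes pva_kmd_get_ccq_space() return (14 - 4) / 2 == 5. */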
void pva_kmd_disable_all_interrupts_nosync(struct pva_kmd_device *pva)
{
for (int i = 0; i < PVA_KMD_INTR_LINE_COUNT; i++) {
pva_kmd_disable_intr_nosync(pva, (enum pva_kmd_intr_line)i);
}
}

View File

@@ -2,6 +2,7 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include "pva_kmd_submitter.h"
#include "pva_api_types.h"
#include "pva_kmd_utils.h"
#include "pva_kmd_abort.h"
@@ -70,6 +71,7 @@ pva_kmd_submitter_submit_with_fence(struct pva_kmd_submitter *submitter,
submit_info.first_chunk_offset_lo = iova_lo(first_chunk_offset);
submit_info.first_chunk_offset_hi = iova_hi(first_chunk_offset);
submit_info.first_chunk_size = first_chunk_size;
submit_info.execution_timeout_ms = PVA_EXEC_TIMEOUT_INF;
pva_kmd_mutex_lock(submitter->submit_lock);
err = pva_kmd_queue_submit(submitter->queue, &submit_info);
@@ -108,6 +110,7 @@ enum pva_error pva_kmd_submitter_submit(struct pva_kmd_submitter *submitter,
submit_info.first_chunk_offset_lo = iova_lo(first_chunk_offset);
submit_info.first_chunk_offset_hi = iova_hi(first_chunk_offset);
submit_info.first_chunk_size = first_chunk_size;
submit_info.execution_timeout_ms = PVA_EXEC_TIMEOUT_INF;
/* TODO: remove these flags after FW execute command buffer with no engines. */
submit_info.flags =
PVA_INSERT8(0x3, PVA_CMDBUF_FLAGS_ENGINE_AFFINITY_MSB,
@@ -137,16 +140,63 @@ enum pva_error pva_kmd_submitter_wait(struct pva_kmd_submitter *submitter,
{
uint32_t volatile *fence_addr = submitter->post_fence_va;
uint32_t time_spent = 0;
struct pva_kmd_device *pva = submitter->queue->pva;
while (*fence_addr < fence_val) {
if (pva->recovery) {
return PVA_ERR_FW_ABORTED;
}
pva_kmd_sleep_us(poll_interval_us);
time_spent = safe_addu32(time_spent, poll_interval_us);
if (time_spent >= timeout_us) {
pva_kmd_log_err("pva_kmd_submitter_wait Timed out");
pva_kmd_abort(submitter->queue->pva);
pva_kmd_abort_fw(submitter->queue->pva);
return PVA_TIMEDOUT;
}
}
return PVA_SUCCESS;
}
enum pva_error pva_kmd_submit_cmd_sync(struct pva_kmd_submitter *submitter,
void *cmds, uint32_t size,
uint32_t poll_interval_us,
uint32_t timeout_us)
{
struct pva_kmd_cmdbuf_builder builder = { 0 };
enum pva_error err;
void *cmd_dst = NULL;
uint32_t fence_val = 0;
err = pva_kmd_submitter_prepare(submitter, &builder);
if (err != PVA_SUCCESS) {
goto err_out;
}
cmd_dst = pva_kmd_reserve_cmd_space(&builder, size);
if (cmd_dst == NULL) {
err = PVA_INVAL;
pva_kmd_log_err(
"Trying to submit too many commands using pva_kmd_submit_cmd_sync.");
goto cancel_builder;
}
memcpy(cmd_dst, cmds, size);
err = pva_kmd_submitter_submit(submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
goto cancel_builder;
}
err = pva_kmd_submitter_wait(submitter, fence_val, poll_interval_us,
timeout_us);
if (err != PVA_SUCCESS) {
goto cancel_builder;
}
return err;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
return err;
}

View File

@@ -57,4 +57,11 @@ pva_kmd_submitter_submit_with_fence(struct pva_kmd_submitter *submitter,
/* add cmd */
/* do submit with fence (provide a fence) */
/* Helper function to submit several commands and wait for them to complete.
Total size must be smaller than a chunk. */
enum pva_error pva_kmd_submit_cmd_sync(struct pva_kmd_submitter *submitter,
void *cmds, uint32_t size,
uint32_t poll_interval_us,
uint32_t timeout_us);
#endif // PVA_KMD_SUBMITTER_H
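A minimal usage sketch for the new helper (illustrative only, modeled on the tegra-stats refactor further below; the wrapper function is hypothetical and the get-tegra-stats command is just an example payload):
static enum pva_error example_sync_cmd(struct pva_kmd_device *pva,
                                       uint32_t resource_id, uint32_t size)
{
        /* Build a small command on the stack, then copy + submit + wait. */
        struct pva_cmd_get_tegra_stats cmd = { 0 };
        pva_kmd_set_cmd_get_tegra_stats(&cmd, resource_id, size,
                                        0U /* buffer offset */, true);
        /* The total command size must stay below one cmdbuf chunk. */
        return pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
                                       PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
                                       PVA_KMD_WAIT_FW_TIMEOUT_US);
}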

View File

@@ -59,11 +59,8 @@ enum pva_error
pva_kmd_notify_fw_get_tegra_stats(struct pva_kmd_device *pva,
struct pva_kmd_tegrastats *kmd_tegra_stats)
{
struct pva_kmd_cmdbuf_builder builder;
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
struct pva_cmd_get_tegra_stats *cmd;
struct pva_cmd_get_tegra_stats cmd = { 0 };
uint64_t buffer_offset = 0U;
uint32_t fence_val;
enum pva_error err = PVA_SUCCESS;
struct pva_kmd_fw_tegrastats fw_tegra_stats = { 0 };
bool stats_enabled = pva->debugfs_context.stats_enable;
@@ -86,29 +83,15 @@ pva_kmd_notify_fw_get_tegra_stats(struct pva_kmd_device *pva,
goto err_out;
}
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
if (err != PVA_SUCCESS) {
goto dev_idle;
}
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
ASSERT(cmd != NULL);
pva_kmd_set_cmd_get_tegra_stats(cmd, pva->tegra_stats_resource_id,
pva_kmd_set_cmd_get_tegra_stats(&cmd, pva->tegra_stats_resource_id,
pva->tegra_stats_buf_size,
buffer_offset, stats_enabled);
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
if (err != PVA_SUCCESS) {
pva_kmd_log_err("tegra stats cmd submission failed");
goto cancel_builder;
}
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
err = pva_kmd_submit_cmd_sync(&pva->submitter, &cmd, sizeof(cmd),
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
PVA_KMD_WAIT_FW_TIMEOUT_US);
if (err != PVA_SUCCESS) {
pva_kmd_log_err(
"Waiting for FW timed out when getting tegra stats");
pva_kmd_log_err("tegra stats cmd submission failed");
goto dev_idle;
}
@@ -129,8 +112,7 @@ out:
kmd_tegra_stats->window_end_time = fw_tegra_stats.window_end_time;
return PVA_SUCCESS;
cancel_builder:
pva_kmd_cmdbuf_builder_cancel(&builder);
dev_idle:
pva_kmd_device_idle(pva);
err_out:

View File

@@ -21,6 +21,7 @@ struct pva_kmd_device_memory {
uint64_t size; /**< Size of the mapping. */
struct pva_kmd_device *pva; /**< The PVA this memory is mapped to. */
uint32_t smmu_ctx_idx; /**< The SMMU context this memory is mapped to. */
uint32_t iova_access_flags; /**< Access flags for the memory. RO - 1/WO - 2/RW - 3 */
};
/**

View File

@@ -9,7 +9,8 @@ void pva_kmd_debugfs_create_bool(struct pva_kmd_device *pva, const char *name,
bool *val);
void pva_kmd_debugfs_create_u32(struct pva_kmd_device *pva, const char *name,
uint32_t *val);
void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
enum pva_error pva_kmd_debugfs_create_file(struct pva_kmd_device *pva,
const char *name,
struct pva_kmd_file_ops *fops);
void pva_kmd_debugfs_remove_nodes(struct pva_kmd_device *pva);
unsigned long pva_kmd_copy_data_from_user(void *dst, const void *src,

View File

@@ -14,9 +14,6 @@ void pva_kmd_device_plat_deinit(struct pva_kmd_device *pva);
void pva_kmd_read_syncpt_val(struct pva_kmd_device *pva, uint32_t syncpt_id,
uint32_t *syncpt_value);
void pva_kmd_get_syncpt_iova(struct pva_kmd_device *pva, uint32_t syncpt_id,
uint64_t *syncpt_iova);
void pva_kmd_allocate_syncpts(struct pva_kmd_device *pva);
/**
@@ -34,7 +31,7 @@ void pva_kmd_power_off(struct pva_kmd_device *pva);
* user submission halted. This is required for host1x
* watchdog, or kmd submission timeout failures.
*/
void pva_kmd_fw_reset_assert(struct pva_kmd_device *pva);
void pva_kmd_freeze_fw(struct pva_kmd_device *pva);
/**
* @brief Initialize firmware.
@@ -60,4 +57,18 @@ enum pva_error pva_kmd_init_fw(struct pva_kmd_device *pva);
* @param pva pointer to the PVA device to de-initialize
*/
void pva_kmd_deinit_fw(struct pva_kmd_device *pva);
/**
* @brief Disable all interrupts without waiting for running interrupt handlers
* to complete.
*
* We don't wait for running interrupt handlers to complete because we want to
* be able to call this function from interrupt handlers themselves.
*
* This function is to be called when PVA enters a bad state and we want to
* protect KMD from potential interrupt floods from PVA (particularly the
* watchdog interrupt, which HW will trigger repeatedly).
*/
void pva_kmd_disable_all_interrupts_nosync(struct pva_kmd_device *pva);
#endif // PVA_KMD_SHIM_INIT_H
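A hedged sketch of the intended call pattern for the new nosync helper (the surrounding recovery routine is hypothetical; only pva_kmd_disable_all_interrupts_nosync comes from this change):
static void example_enter_recovery(struct pva_kmd_device *pva)
{
        /* May be reached from an interrupt handler, so use the _nosync
         * variant: it must not wait for in-flight handlers (including the
         * one we are running in). */
        pva_kmd_disable_all_interrupts_nosync(pva);
        /* ...then mark the device as in recovery, freeze FW, fail waiters... */
}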

View File

@@ -81,9 +81,9 @@ void pva_kmd_enable_intr(struct pva_kmd_device *pva,
enum pva_kmd_intr_line intr_line);
/**
* @brief Disable an interrupt line.
* @brief Disable an interrupt line without waiting for running interrupt handlers to complete.
*/
void pva_kmd_disable_intr(struct pva_kmd_device *pva,
void pva_kmd_disable_intr_nosync(struct pva_kmd_device *pva,
enum pva_kmd_intr_line intr_line);
/**
@@ -104,13 +104,6 @@ void pva_kmd_free_intr(struct pva_kmd_device *pva,
*/
enum pva_error pva_kmd_read_fw_bin(struct pva_kmd_device *pva);
/**
* @brief Reset assert FW so it can be in recovery and
* user submission halted. This is required for host1x
* watchdog, or kmd submission timeout failures.
*/
void pva_kmd_fw_reset_assert(struct pva_kmd_device *pva);
/**
* @brief Get starting IOVA of the memory shared by R5 and KMD.
*
@@ -141,4 +134,9 @@ void pva_kmd_config_evp_seg_scr_regs(struct pva_kmd_device *pva);
*/
void pva_kmd_config_sid_regs(struct pva_kmd_device *pva);
/**
* @brief Set the PVA HW reset line.
*/
void pva_kmd_set_reset_line(struct pva_kmd_device *pva);
#endif // PVA_KMD_SHIM_SILICON_H

View File

@@ -9,4 +9,8 @@ void pva_kmd_shim_add_trace_vpu_exec(
struct pva_kmd_device *pva,
struct pva_kmd_fw_msg_vpu_trace const *trace_info);
void pva_kmd_shim_add_trace_fence(
struct pva_kmd_device *pva,
struct pva_kmd_fw_msg_fence_trace const *trace_info);
#endif // PVA_KMD_SHIM_TRACE_EVENT_H

View File

@@ -20,24 +20,10 @@ struct pva_ops_context_init {
struct pva_ops_response_context_init {
enum pva_error error;
uint16_t max_cmdbuf_chunk_size;
uint64_t ccq_shm_hdl;
};
struct pva_ops_syncpt_register {
#define PVA_OPS_OPCODE_SYNCPT_REGISTER (2U | PVA_OPS_PRIVATE_OPCODE_FLAG)
struct pva_ops_header header;
};
struct pva_ops_response_syncpt_register {
enum pva_error error;
uint32_t syncpt_ro_res_id;
uint32_t syncpt_rw_res_id;
uint32_t synpt_size;
uint32_t synpt_ids[PVA_NUM_RW_SYNCPTS_PER_CONTEXT];
uint32_t num_ro_syncpoints;
uint32_t pad;
};
/**
* Calculates the total memory size required for a PVA submission queue.
* This includes the size of the queue header and the combined size of all command buffer submission info structures.
@@ -67,7 +53,8 @@ struct pva_ops_queue_create {
struct pva_ops_response_queue_create {
enum pva_error error;
uint32_t queue_id;
uint32_t syncpt_fence_counter;
uint32_t syncpt_id;
uint32_t syncpt_current_value;
};
/* KMD API: queue destroy */

View File

@@ -130,7 +130,8 @@ void pva_kmd_debugfs_create_u32(struct pva_kmd_device *pva, const char *name,
debugfs_create_u32(name, 0644, de, pdata);
}
void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
enum pva_error pva_kmd_debugfs_create_file(struct pva_kmd_device *pva,
const char *name,
struct pva_kmd_file_ops *pvafops)
{
struct pva_kmd_linux_device_data *device_data =
@@ -142,7 +143,12 @@ void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
struct dentry *file;
file = debugfs_create_file(name, 0644, de, pvafops, fops);
ASSERT(file != NULL);
if (file == NULL) {
pva_kmd_log_err("Failed to create debugfs file");
return PVA_INVAL;
}
return PVA_SUCCESS;
}
void pva_kmd_debugfs_remove_nodes(struct pva_kmd_device *pva)

View File

@@ -53,15 +53,6 @@ void pva_kmd_read_syncpt_val(struct pva_kmd_device *pva, uint32_t syncpt_id,
}
}
void pva_kmd_get_syncpt_iova(struct pva_kmd_device *pva, uint32_t syncpt_id,
uint64_t *syncpt_iova)
{
uint32_t offset = 0;
offset = nvpva_syncpt_unit_interface_get_byte_offset_ext(syncpt_id);
*syncpt_iova = safe_addu64(pva->syncpt_ro_iova, (uint64_t)offset);
}
void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
{
phys_addr_t base;
@@ -69,7 +60,6 @@ void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
int err = 0;
uint32_t stride, num_syncpts;
uint32_t syncpt_page_size;
uint32_t syncpt_offset[PVA_NUM_RW_SYNCPTS];
dma_addr_t sp_start;
struct device *dev;
struct pva_kmd_linux_device_data *device_data =
@@ -92,53 +82,38 @@ void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
syncpt_page_size = nvpva_syncpt_unit_interface_get_byte_offset_ext(1);
dev = &device_data->smmu_contexts[PVA_R5_SMMU_CONTEXT_ID]->dev;
if (iommu_get_domain_for_dev(dev)) {
sp_start = dma_map_resource(dev, base, size, DMA_TO_DEVICE,
sp_start = dma_map_resource(dev, base, size, DMA_BIDIRECTIONAL,
DMA_ATTR_SKIP_CPU_SYNC);
if (dma_mapping_error(dev, sp_start)) {
FAULT("Failed to pin RO syncpoints\n");
FAULT("Failed to pin syncpoints\n");
}
} else {
FAULT("Failed to pin RO syncpoints\n");
FAULT("Failed to pin syncpoints\n");
}
pva->syncpt_ro_iova = sp_start;
pva->syncpt_offset = syncpt_page_size;
pva->num_syncpts = (size / syncpt_page_size);
pva->ro_syncpt_base_iova = sp_start;
pva->syncpt_page_size = syncpt_page_size;
pva->num_ro_syncpts = num_syncpts;
// The same region is also used for RW syncpts...
pva->rw_syncpt_base_iova = sp_start;
pva->rw_syncpt_region_size = size;
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS; i++) {
pva->syncpt_rw[i].syncpt_id = nvpva_get_syncpt_client_managed(
props->pdev, "pva_syncpt");
if (pva->syncpt_rw[i].syncpt_id == 0) {
uint32_t syncpt_id;
uint64_t syncpt_iova;
syncpt_id = nvpva_get_syncpt_client_managed(props->pdev,
"pva_syncpt");
if (syncpt_id == 0) {
FAULT("Failed to get syncpt\n");
}
syncpt_offset[i] =
syncpt_iova = safe_addu64(
sp_start,
nvpva_syncpt_unit_interface_get_byte_offset_ext(
pva->syncpt_rw[i].syncpt_id);
err = nvpva_syncpt_read_ext_check(
props->pdev, pva->syncpt_rw[i].syncpt_id,
&pva->syncpt_rw[i].syncpt_value);
if (err < 0) {
FAULT("Failed to read syncpoint value\n");
}
}
syncpt_id));
pva->syncpt_rw_iova =
dma_map_resource(dev,
safe_addu64(base, (uint64_t)syncpt_offset[0]),
safe_mulu64((uint64_t)pva->syncpt_offset,
(uint64_t)PVA_NUM_RW_SYNCPTS),
DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
if (dma_mapping_error(dev, pva->syncpt_rw_iova)) {
FAULT("Failed to pin RW syncpoints\n");
}
pva->syncpt_rw[0].syncpt_iova = pva->syncpt_rw_iova;
for (uint32_t i = 1; i < PVA_NUM_RW_SYNCPTS; i++) {
if (safe_addu32(syncpt_offset[i - 1], pva->syncpt_offset) !=
syncpt_offset[i]) {
FAULT("RW syncpts are not contiguous\n");
}
pva->syncpt_rw[i].syncpt_iova = safe_addu64(
pva->syncpt_rw_iova,
safe_mulu64((uint64_t)pva->syncpt_offset, (uint64_t)i));
pva->rw_syncpts[i].syncpt_iova = syncpt_iova;
pva->rw_syncpts[i].syncpt_id = syncpt_id;
}
}
@@ -166,25 +141,19 @@ void pva_kmd_linux_host1x_deinit(struct pva_kmd_device *pva)
dev = &device_data->smmu_contexts[PVA_R5_SMMU_CONTEXT_ID]->dev;
if (iommu_get_domain_for_dev(dev)) {
dma_unmap_resource(dev, pva->syncpt_ro_iova, size,
DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
dma_unmap_resource(dev, pva->syncpt_rw_iova,
safe_mulu64((uint64_t)pva->syncpt_offset,
(uint64_t)PVA_NUM_RW_SYNCPTS),
dma_unmap_resource(dev, pva->ro_syncpt_base_iova, size,
DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
} else {
FAULT("Failed to unmap syncpts\n");
}
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS; i++) {
nvpva_syncpt_put_ref_ext(props->pdev,
pva->syncpt_rw[i].syncpt_id);
pva->syncpt_rw[i].syncpt_id = 0;
pva->syncpt_rw[i].syncpt_iova = 0;
pva->syncpt_rw[i].syncpt_value = 0;
pva->rw_syncpts[i].syncpt_id);
pva->rw_syncpts[i].syncpt_id = 0;
pva->rw_syncpts[i].syncpt_iova = 0;
}
pva->syncpt_ro_iova = 0;
pva->syncpt_rw_iova = 0;
pva->syncpt_offset = 0;
pva->ro_syncpt_base_iova = 0;
pva->syncpt_page_size = 0;
nvpva_syncpt_unit_interface_deinit(props->pdev);
}
@@ -235,21 +204,11 @@ void pva_kmd_power_off(struct pva_kmd_device *pva)
pva_kmd_linux_device_get_data(pva);
struct nvpva_device_data *props = device_data->pva_device_properties;
// Set reset line before cutting off power
/* Power management operation is asynchronous. We don't control when PVA
* will really be powered down. However, we need to free memory after
* this call. Therefore, we assert the reset line to stop PVA from any
* further activity. */
reset_control_acquire(props->reset_control);
reset_control_assert(props->reset_control);
reset_control_release(props->reset_control);
pm_runtime_mark_last_busy(&props->pdev->dev);
pm_runtime_put(&props->pdev->dev);
}
void pva_kmd_fw_reset_assert(struct pva_kmd_device *pva)
void pva_kmd_set_reset_line(struct pva_kmd_device *pva)
{
struct pva_kmd_linux_device_data *device_data =
pva_kmd_linux_device_get_data(pva);

View File

@@ -23,7 +23,7 @@ struct pva_kmd_device_memory_impl {
struct pva_kmd_device_memory dev_mem;
struct dma_buf *dmabuf;
struct iosys_map iosysmap;
struct dma_buf_attachment *dmabuf_attch;
struct dma_buf_attachment *dmabuf_attach;
struct sg_table *sgt;
uint64_t offset;
};
@@ -36,11 +36,20 @@ pva_kmd_device_memory_alloc_map(uint64_t size, struct pva_kmd_device *pva,
struct device *dev = get_context_device(pva, smmu_ctx_idx);
dma_addr_t pa = 0U;
void *va = NULL;
struct pva_kmd_device_memory_impl *mem_impl;
mem_impl = pva_kmd_zalloc(sizeof(struct pva_kmd_device_memory_impl));
if (mem_impl == NULL) {
goto err_out;
}
if (size == 0u) {
pva_kmd_log_err("Invalid allocation size");
goto free_mem;
}
struct pva_kmd_device_memory_impl *mem_impl =
pva_kmd_zalloc(sizeof(struct pva_kmd_device_memory_impl));
va = dma_alloc_coherent(dev, size, &pa, GFP_KERNEL);
if (va == NULL) {
if (IS_ERR_OR_NULL(va)) {
pva_kmd_log_err("dma_alloc_coherent failed");
goto free_mem;
}
@@ -49,12 +58,13 @@ pva_kmd_device_memory_alloc_map(uint64_t size, struct pva_kmd_device *pva,
mem_impl->dev_mem.size = size;
mem_impl->dev_mem.pva = pva;
mem_impl->dev_mem.smmu_ctx_idx = smmu_ctx_idx;
mem_impl->dev_mem.iova_access_flags = iova_access_flags;
mem_impl->dmabuf = NULL;
return &mem_impl->dev_mem;
free_mem:
pva_kmd_free(mem_impl);
err_out:
return NULL;
}
@@ -66,13 +76,16 @@ struct pva_kmd_device_memory *
pva_kmd_device_memory_acquire(uint64_t memory_handle, uint64_t offset,
uint64_t size, struct pva_kmd_context *ctx)
{
struct pva_kmd_device_memory_impl *mem_impl =
(struct pva_kmd_device_memory_impl *)pva_kmd_zalloc(
sizeof(struct pva_kmd_device_memory_impl));
struct dma_buf *dma_buf;
struct pva_kmd_device_memory_impl *mem_impl;
mem_impl = pva_kmd_zalloc(sizeof(struct pva_kmd_device_memory_impl));
if (mem_impl == NULL) {
goto err_out;
}
dma_buf = dma_buf_get(memory_handle);
if (dma_buf == NULL) {
if (IS_ERR_OR_NULL(dma_buf)) {
pva_kmd_log_err("Failed to acquire memory");
goto free_mem;
}
@@ -92,6 +105,7 @@ put_dmabuf:
dma_buf_put(dma_buf);
free_mem:
pva_kmd_free(mem_impl);
err_out:
return NULL;
}
@@ -103,7 +117,7 @@ void pva_kmd_device_memory_free(struct pva_kmd_device_memory *mem)
if (mem_impl->dmabuf != NULL) {
/* This memory comes from dma_buf_get */
if (mem->iova != 0U) {
if (mem_impl->dmabuf_attach != NULL) {
pva_kmd_device_memory_iova_unmap(mem);
}
@@ -160,14 +174,28 @@ pva_kmd_device_memory_iova_map(struct pva_kmd_device_memory *memory,
pva_math_error math_err = MATH_OP_SUCCESS;
struct pva_kmd_device_memory_impl *mem_impl = container_of(
memory, struct pva_kmd_device_memory_impl, dev_mem);
// struct pva_kmd_linux_device_plat_data *plat_data =
// pva_kmd_linux_device_get_plat_data(pva);
// struct device *dev = plat_data->dev[smmu_ctx_idx];
struct device *dev = get_context_device(pva, smmu_ctx_idx);
struct dma_buf_attachment *attach;
struct sg_table *sgt;
enum pva_error err = PVA_SUCCESS;
enum dma_data_direction dma_direction;
uint64_t iova;
switch (access_flags) {
case PVA_ACCESS_RO: // Read-Only
dma_direction = DMA_TO_DEVICE;
break;
case PVA_ACCESS_WO: // Write-Only
dma_direction = DMA_FROM_DEVICE;
break;
case PVA_ACCESS_RW: // Read-Write
dma_direction = DMA_BIDIRECTIONAL;
break;
default:
pva_kmd_log_err("Invalid access flags\n");
err = PVA_INVAL;
goto err_out;
}
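/* Note (illustrative): the DMA direction is named from the CPU's point of
 * view -- a buffer PVA only reads (PVA_ACCESS_RO) carries data to the
 * device, hence DMA_TO_DEVICE; a write-only buffer carries data back from
 * the device, hence DMA_FROM_DEVICE. */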
attach = dma_buf_attach(mem_impl->dmabuf, dev);
if (IS_ERR_OR_NULL(attach)) {
@@ -176,28 +204,32 @@ pva_kmd_device_memory_iova_map(struct pva_kmd_device_memory *memory,
goto err_out;
}
mem_impl->dmabuf_attch = attach;
sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
sgt = dma_buf_map_attachment(attach, dma_direction);
if (IS_ERR_OR_NULL(sgt)) {
err = PVA_INVAL;
pva_kmd_log_err("Failed to map attachment\n");
goto detach;
}
mem_impl->sgt = sgt;
mem_impl->dev_mem.iova =
addu64(sg_dma_address(sgt->sgl), mem_impl->offset, &math_err);
iova = addu64(sg_dma_address(sgt->sgl), mem_impl->offset, &math_err);
if (math_err != MATH_OP_SUCCESS) {
err = PVA_INVAL;
pva_kmd_log_err(
"pva_kmd_device_memory_iova_map Invalid DMA address\n");
goto detach;
goto unmap;
}
mem_impl->sgt = sgt;
mem_impl->dmabuf_attach = attach;
mem_impl->dev_mem.iova = iova;
mem_impl->dev_mem.pva = pva;
mem_impl->dev_mem.smmu_ctx_idx = smmu_ctx_idx;
mem_impl->dev_mem.iova_access_flags = access_flags;
return PVA_SUCCESS;
unmap:
dma_buf_unmap_attachment(attach, sgt, dma_direction);
detach:
dma_buf_detach(mem_impl->dmabuf, mem_impl->dmabuf_attch);
dma_buf_detach(mem_impl->dmabuf, attach);
err_out:
return err;
}
@@ -209,10 +241,11 @@ void pva_kmd_device_memory_iova_unmap(struct pva_kmd_device_memory *memory)
ASSERT(mem_impl->dmabuf != NULL);
dma_buf_unmap_attachment(mem_impl->dmabuf_attch, mem_impl->sgt,
dma_buf_unmap_attachment(mem_impl->dmabuf_attach, mem_impl->sgt,
DMA_BIDIRECTIONAL);
dma_buf_detach(mem_impl->dmabuf, mem_impl->dmabuf_attch);
memory->iova = 0;
dma_buf_detach(mem_impl->dmabuf, mem_impl->dmabuf_attach);
mem_impl->sgt = NULL;
mem_impl->dmabuf_attach = NULL;
}
uint64_t pva_kmd_get_r5_iova_start(void)

View File

@@ -50,13 +50,13 @@
extern struct platform_driver pva_kmd_linux_smmu_context_driver;
extern atomic_t g_num_smmu_ctxs;
static bool load_from_gsc = PVA_KMD_LOAD_FROM_GSC_DEFAULT;
static bool app_authenticate = PVA_KMD_APP_AUTH_DEFAULT;
static bool pva_test_mode; //false by default
module_param(load_from_gsc, bool, 0);
MODULE_PARM_DESC(load_from_gsc, "Load V3 FW from GSC");
module_param(app_authenticate, bool, 0);
MODULE_PARM_DESC(app_authenticate, "Enable app authentication");
module_param(pva_test_mode, bool, 0);
MODULE_PARM_DESC(pva_test_mode, "Enable test mode");
struct nvpva_device_data t23x_pva0_props = {
.version = PVA_CHIP_T23X,
@@ -112,11 +112,15 @@ static int pva_get_gsc_priv_hwid(struct platform_device *pdev)
return fwspec->ids[0] & 0xffff;
}
static void pva_kmd_linux_register_hwpm(struct pva_kmd_device *pva)
static int pva_kmd_linux_register_hwpm(struct pva_kmd_device *pva)
{
struct tegra_soc_hwpm_ip_ops *hwpm_ip_ops =
pva_kmd_zalloc(sizeof(*hwpm_ip_ops));
if (hwpm_ip_ops == NULL) {
return -ENOMEM;
}
hwpm_ip_ops->ip_dev = pva;
hwpm_ip_ops->ip_base_address = safe_addu64(
pva->reg_phy_base[0], (uint64_t)pva->regspec.cfg_perf_mon);
@@ -125,6 +129,7 @@ static void pva_kmd_linux_register_hwpm(struct pva_kmd_device *pva)
hwpm_ip_ops->hwpm_ip_reg_op = &pva_kmd_hwpm_ip_reg_op;
tegra_soc_hwpm_ip_register(hwpm_ip_ops);
pva->debugfs_context.data_hwpm = hwpm_ip_ops;
return 0;
}
static void pva_kmd_linux_unregister_hwpm(struct pva_kmd_device *pva)
@@ -256,10 +261,57 @@ static void pva_kmd_free_co_mem(struct platform_device *pdev)
}
}
static bool pva_kmd_in_test_mode(struct device *dev, bool param_test_mode)
{
const char *dt_test_mode = NULL;
if (of_property_read_string(dev->of_node, "nvidia,test_mode_enable",
&dt_test_mode)) {
return param_test_mode;
}
if (strcmp(dt_test_mode, "true")) {
return param_test_mode;
}
return true;
}
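/* Resolution order (illustrative): a DT entry nvidia,test_mode_enable =
 * "true" forces test mode on; if the property is absent or holds any other
 * string, the pva_test_mode module parameter decides. */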
static struct kobj_type nvpva_kobj_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
};
/**
* Read VPU authentication property from device tree
*
* @param dev Pointer to the device structure
* @return true if authentication should be enabled, false otherwise
*/
static bool pva_kmd_linux_read_vpu_auth(const struct device *dev)
{
bool auth_enabled = false;
int len;
const __be32 *val;
val = of_get_property(dev->of_node, "nvidia,vpu-auth", &len);
if ((val != NULL) && (len >= (int)sizeof(__be32))) {
u32 value = (u32)be32_to_cpu(*val);
if (value != 0U) {
auth_enabled = true;
dev_dbg(dev, "VPU authentication enabled\n");
} else {
auth_enabled = false;
dev_dbg(dev, "VPU authentication disabled\n");
}
} else {
dev_dbg(dev,
"No VPU authentication property found, using default: %d\n",
auth_enabled);
}
return auth_enabled;
}
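/* Example (illustrative): nvidia,vpu-auth = <1>; in the device tree enables
 * authentication, <0> disables it, and a missing property falls back to the
 * default (disabled). */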
static int pva_probe(struct platform_device *pdev)
{
int err = 0U;
@@ -273,6 +325,9 @@ static int pva_probe(struct platform_device *pdev)
struct clk_bulk_data *clks;
struct clk *c;
bool pva_enter_test_mode = false;
bool app_authenticate;
device_id = of_match_device(tegra_pva_of_match, dev);
if (!device_id) {
dev_err(dev, "no match for pva dev\n");
@@ -286,6 +341,8 @@ static int pva_probe(struct platform_device *pdev)
return -ENODATA;
}
app_authenticate = pva_kmd_linux_read_vpu_auth(dev);
/* Create devices for child nodes of this device */
of_platform_default_populate(dev->of_node, NULL, dev);
@@ -300,17 +357,12 @@ static int pva_probe(struct platform_device *pdev)
pva_props->pdev = pdev;
mutex_init(&pva_props->lock);
pva_device =
pva_kmd_device_create(pva_props->version, 0, app_authenticate);
pva_enter_test_mode = pva_kmd_in_test_mode(dev, pva_test_mode);
pva_device = pva_kmd_device_create(
pva_props->version, 0, app_authenticate, pva_enter_test_mode);
pva_device->is_hv_mode = is_tegra_hypervisor_mode();
/* On L4T, forcing boot from file */
/* If needed to load from GSC, remove the below block */
if (!pva_device->is_hv_mode) {
load_from_gsc = false;
}
pva_device->load_from_gsc = load_from_gsc;
pva_device->stream_ids[pva_device->r5_image_smmu_context_id] =
pva_get_gsc_priv_hwid(pdev);
@@ -352,8 +404,17 @@ static int pva_probe(struct platform_device *pdev)
pva_kmd_linux_host1x_init(pva_device);
pva_kmd_debugfs_create_nodes(pva_device);
pva_kmd_linux_register_hwpm(pva_device);
err = pva_kmd_debugfs_create_nodes(pva_device);
if (err != PVA_SUCCESS) {
dev_err(dev, "debugfs creation failed\n");
goto err_cdev_init;
}
err = pva_kmd_linux_register_hwpm(pva_device);
if (err != PVA_SUCCESS) {
dev_err(dev, "pva_kmd_linux_register_hwpm failed\n");
goto err_cdev_init;
}
if (!pva_device->is_hv_mode && pva_device->load_from_gsc) {
err = pva_kmd_get_co_info(pdev);

View File

@@ -6,6 +6,11 @@
#include "trace/events/nvpva_ftrace.h"
#include <linux/nvhost.h>
static uint32_t get_job_id(uint32_t queue_id, uint64_t submit_id)
{
return (queue_id & 0x000000FF) << 24 | (submit_id & 0xFFFFFFU);
}
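/* For illustration: get_job_id(0x12U, 0x345678U) packs to 0x12345678 --
 * queue ID in the top 8 bits, the low 24 bits of the submit ID below it. */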
void pva_kmd_shim_add_trace_vpu_exec(
struct pva_kmd_device *pva,
struct pva_kmd_fw_msg_vpu_trace const *trace_info)
@@ -38,7 +43,8 @@ void pva_kmd_shim_add_trace_vpu_exec(
// In V2, Job ID is a 32-bit value with the top 8 bits being the queue ID
// and the bottom 24 bits being a per-task counter. In V3, we only use the
// queue ID.
uint32_t job_id = (trace_info->queue_id & 0x000000FF) << 24;
uint32_t job_id =
get_job_id(trace_info->queue_id, trace_info->submit_id);
trace_pva_job_ext_event(job_id, trace_info->ccq_id,
0, // syncpt_thresh,
@@ -50,3 +56,42 @@ void pva_kmd_shim_add_trace_vpu_exec(
trace_info->num_prefences, trace_info->prog_id,
trace_info->submit_id, vpu_start);
}
void pva_kmd_shim_add_trace_fence(
struct pva_kmd_device *pva,
struct pva_kmd_fw_msg_fence_trace const *trace_info)
{
uint32_t job_id;
// We want to log events only for user workloads
if (trace_info->ccq_id == PVA_PRIV_CCQ_ID) {
return;
}
job_id = get_job_id(trace_info->queue_id, trace_info->submit_id);
if (trace_info->action == PVA_KMD_FW_BUF_MSG_FENCE_ACTION_WAIT) {
if (trace_info->type == PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SYNCPT) {
trace_job_prefence(job_id, trace_info->fence_id,
trace_info->value);
} else if (trace_info->type ==
PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SEMAPHORE) {
trace_job_prefence_semaphore(job_id,
trace_info->fence_id,
trace_info->offset,
trace_info->value);
}
} else if (trace_info->action ==
PVA_KMD_FW_BUF_MSG_FENCE_ACTION_SIGNAL) {
if (trace_info->type == PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SYNCPT) {
trace_job_postfence(job_id, trace_info->fence_id,
trace_info->value);
} else if (trace_info->type ==
PVA_KMD_FW_BUF_MSG_FENCE_TYPE_SEMAPHORE) {
trace_job_postfence_semaphore(job_id,
trace_info->fence_id,
trace_info->offset,
trace_info->value);
}
}
}

View File

@@ -14,9 +14,13 @@ static struct pva_kmd_isr_data *get_isr(struct pva_kmd_device *pva,
struct pva_kmd_isr_data *isr_data;
ASSERT(intr_line < PVA_KMD_INTR_LINE_COUNT);
isr_data = &plat_data->isr[intr_line];
ASSERT(isr_data->binded);
if (!isr_data->binded) {
return NULL;
}
return isr_data;
}
static irqreturn_t pva_isr(int irq, void *dev_id)
{
struct pva_kmd_isr_data *isr_data = (struct pva_kmd_isr_data *)dev_id;
@@ -35,40 +39,60 @@ enum pva_error pva_kmd_bind_intr_handler(struct pva_kmd_device *pva,
pva_kmd_linux_device_get_data(pva);
struct pva_kmd_isr_data *isr_data = &plat_data->isr[intr_line];
struct nvpva_device_data *props = plat_data->pva_device_properties;
enum pva_error pva_err = PVA_SUCCESS;
int irq;
isr_data->irq = platform_get_irq(props->pdev, intr_line);
ASSERT(isr_data->binded == false);
irq = platform_get_irq(props->pdev, intr_line);
if (irq < 0) {
pva_kmd_log_err("Failed to get irq number");
pva_err = kernel_err2pva_err(irq);
goto err_out;
}
isr_data->irq = irq;
isr_data->handler = handler;
isr_data->handler_data = data;
isr_data->binded = true;
isr_data->intr_line = intr_line;
err = request_threaded_irq(isr_data->irq, NULL, pva_isr, IRQF_ONESHOT,
"pva-isr", isr_data);
if (err != 0) {
pva_kmd_log_err("Failed to bind interrupt handler");
pva_err = kernel_err2pva_err(err);
goto err_out;
}
return kernel_err2pva_err(err);
isr_data->binded = true;
return PVA_SUCCESS;
err_out:
return pva_err;
}
void pva_kmd_enable_intr(struct pva_kmd_device *pva,
enum pva_kmd_intr_line intr_line)
{
struct pva_kmd_isr_data *isr_data = get_isr(pva, intr_line);
if (isr_data != NULL) {
enable_irq(isr_data->irq);
}
}
void pva_kmd_disable_intr(struct pva_kmd_device *pva,
void pva_kmd_disable_intr_nosync(struct pva_kmd_device *pva,
enum pva_kmd_intr_line intr_line)
{
struct pva_kmd_isr_data *isr_data = get_isr(pva, intr_line);
disable_irq(isr_data->irq);
if (isr_data != NULL) {
disable_irq_nosync(isr_data->irq);
}
}
void pva_kmd_free_intr(struct pva_kmd_device *pva,
enum pva_kmd_intr_line intr_line)
{
struct pva_kmd_isr_data *isr_data = get_isr(pva, intr_line);
free_irq(isr_data->irq, isr_data);
ASSERT(isr_data != NULL);
(void)free_irq(isr_data->irq, isr_data);
isr_data->binded = false;
}

View File

@@ -11,7 +11,12 @@
void *pva_kmd_zalloc(uint64_t size)
{
return kvzalloc(size, GFP_KERNEL);
void *ptr = kvzalloc(size, GFP_KERNEL);
if (IS_ERR_OR_NULL(ptr)) {
return NULL;
}
return ptr;
}
void pva_kmd_free(void *ptr)

View File

@@ -132,12 +132,16 @@ void pva_kmd_linux_device_smmu_contexts_init(struct pva_kmd_device *pva_device)
sid_idx < safe_subu32(pva_device->hw_consts.n_smmu_contexts, 2U);
sid_idx++) {
uint32_t smmu_ctx_idx = safe_addu32(sid_idx, 1U);
pva_device->stream_ids[smmu_ctx_idx] = g_smmu_ctxs[sid_idx].sid;
device_data->smmu_contexts[smmu_ctx_idx] =
g_smmu_ctxs[sid_idx].pdev;
dma_set_mask_and_coherent(
&device_data->smmu_contexts[smmu_ctx_idx]->dev,
struct pva_kmd_linux_smmu_ctx *smmu_ctx = &g_smmu_ctxs[sid_idx];
pva_device->stream_ids[smmu_ctx_idx] = smmu_ctx->sid;
device_data->smmu_contexts[smmu_ctx_idx] = smmu_ctx->pdev;
dma_set_mask_and_coherent(&smmu_ctx->pdev->dev,
DMA_BIT_MASK(39));
//set max segment size to UINT_MAX to avoid creating scatterlist >= 4GB
//during IOVA mapping, which will overflow the scatterlist length field,
//causing IOVA leak
dma_set_max_seg_size(&smmu_ctx->pdev->dev, UINT_MAX);
}
/* Configure SMMU contexts for privileged operations */

View File

@@ -6,7 +6,7 @@
#define PVA_NUM_ENGINES 2U
#define PVA_MAX_NUM_CCQ 8
#define PVA_CCQ_DEPTH 8U
#define PVA_CCQ_DEPTH 14U
#define PVA_USER_CCQ_BASE 1
#define PVA_INVALID_CCQ_ID 0xFF
#define PVA_INVALID_ENGINE_ID 0xFFU
@@ -138,4 +138,6 @@
#define PVA_KMD_CHIP_ID_T26X "GEN3"
#define PVA_KMD_CHIP_ID_DEFAULT PVA_KMD_CHIP_ID_T23X
#define PVA_KMD_TEST_MODE_ENV_VAR "PVA_TEST_MODE"
#endif // PVA_CONSTANTS_H

View File

@@ -689,6 +689,21 @@ static inline uint32_t safe_wraparound_dec_u32(uint32_t counter)
return result;
}
static inline uint32_t safe_wrap_add_u32(uint32_t a, uint32_t b)
{
return (uint32_t)(((uint64_t)a + (uint64_t)b) & 0xFFFFFFFFU);
}
static inline uint32_t safe_wrap_sub_u32(uint32_t a, uint32_t b)
{
return (uint32_t)(((uint64_t)a - (uint64_t)b) & 0xFFFFFFFFU);
}
static inline uint32_t safe_wrap_mul_u32(uint32_t a, uint32_t b)
{
return (uint32_t)(((uint64_t)a * (uint64_t)b) & 0xFFFFFFFFU);
}
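/* For illustration: these helpers make the modulo-2^32 wraparound explicit,
 * e.g. safe_wrap_add_u32(0xFFFFFFFFU, 2U) == 1U and
 * safe_wrap_sub_u32(0U, 1U) == 0xFFFFFFFFU. */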
#define SAT_ADD_DEFINE(a, b, name, type) \
static inline type sat_add##name(type a, type b) \
{ \

View File

@@ -0,0 +1,51 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#ifndef PVA_API_PRIVATE_H
#define PVA_API_PRIVATE_H
#include "pva_api.h"
//For legacy support not exposed by public API
#define PVA_CMD_FLAGS_USE_LEGACY_POINTER 0x1
struct pva_fw_vpu_legacy_ptr_symbol {
uint64_t base;
uint32_t offset;
uint32_t size;
};
enum pva_error_inject_codes {
PVA_ERR_INJECT_WDT_HW_ERR, // watchdog Hardware error
PVA_ERR_INJECT_WDT_TIMEOUT, // watchdog Timeout error
PVA_ERR_INJECT_VMEM_CLEAR, // vmem clear
PVA_ERR_INJECT_ASSERT_CHECK, // assert check
PVA_ERR_INJECT_ARMV7_EXCEPTION, // ARMv7 exception
};
struct pva_cmd_run_unit_tests {
#define PVA_CMD_OPCODE_RUN_UNIT_TESTS (PVA_CMD_OPCODE_MAX + 0U)
struct pva_cmd_header header;
#define PVA_FW_UTESTS_MAX_ARGC 16U
uint8_t argc;
uint8_t pad[3];
uint32_t in_resource_id;
uint32_t in_offset;
uint32_t in_size;
uint32_t out_resource_id;
uint32_t out_offset;
uint32_t out_size;
};
struct pva_cmd_err_inject {
#define PVA_CMD_OPCODE_ERR_INJECT (PVA_CMD_OPCODE_MAX + 1U)
struct pva_cmd_header header;
uint32_t err_inject_code; // enum pva_error_inject_codes
};
struct pva_cmd_gr_check {
#define PVA_CMD_OPCODE_GR_CHECK (PVA_CMD_OPCODE_MAX + 2U)
struct pva_cmd_header header;
};
#define PVA_CMD_OPCODE_COUNT (PVA_CMD_OPCODE_MAX + 3U)
#endif // PVA_API_PRIVATE_H