mirror of
git://nv-tegra.nvidia.com/linux-nv-oot.git
synced 2025-12-22 09:11:26 +03:00
pva: deploy V3 KMD
Jira PVAAS-17785
Change-Id: I8ebc4c49aec209c5f82c6725605b62742402500a
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3299880
Tested-by: Nan Wang <nanwa@nvidia.com>
Reviewed-by: Vishwas M <vishwasm@nvidia.com>
Reviewed-by: Mohnish Jain <mohnishj@nvidia.com>
GVS: buildbot_gerritrpt <buildbot_gerritrpt@nvidia.com>
Reviewed-by: Nan Wang <nanwa@nvidia.com>
drivers/video/tegra/host/pva/Makefile | 104 lines (new file)
@@ -0,0 +1,104 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: GPL-2.0-only
#
# This program is free software; you can redistribute it and/or modify it
# under the terms and conditions of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.

ifndef CONFIG_TEGRA_SYSTEM_TYPE_ACK
ifeq ($(CONFIG_TEGRA_OOT_MODULE),m)

ifeq ($(findstring ack_src,$(NV_BUILD_KERNEL_OPTIONS)),)

obj-m := nvhost-pva.o

PVA_SYS_DIR := .
PVA_SYS_ABSDIR := $(srctree.nvidia-oot)/drivers/video/tegra/host/pva

###### Begin generated section ######
pva_objs += \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_block_allocator.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_cmdbuf.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_context.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_debugfs.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_device.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_binding.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_validate.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_dma_cfg_write.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_fw_debug.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_fw_profiler.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_hwseq_validate.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_msg.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_op_handler.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_pm.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_queue.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_resource_table.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_sha256.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_silicon_boot.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_silicon_elf_parser.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_silicon_executable.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_silicon_hwpm.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_silicon_isr.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_silicon_misc.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_submitter.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_t23x.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_t26x.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_tegra_stats.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_utils.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_vpu_app_auth.o \
	$(PVA_SYS_DIR)/src/kmd/common/pva_kmd_vpu_ocd.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_debugfs.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_device.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_device_memory.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_driver.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_ioctl.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_isr.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_misc.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_smmu.o \
	$(PVA_SYS_DIR)/src/kmd/linux/pva_kmd_linux_vpu_app_auth.o \

pva_inc_flags += \
	-I$(PVA_SYS_ABSDIR)/src/fw/baremetal/include \
	-I$(PVA_SYS_ABSDIR)/src/fw/include \
	-I$(PVA_SYS_ABSDIR)/src/include \
	-I$(PVA_SYS_ABSDIR)/src/kmd/common \
	-I$(PVA_SYS_ABSDIR)/src/kmd/common/shim \
	-I$(PVA_SYS_ABSDIR)/src/kmd/include \
	-I$(PVA_SYS_ABSDIR)/src/kmd/linux/include \
	-I$(PVA_SYS_ABSDIR)/src/libs/pva/include \

pva_def_flags += \
	-DPVA_BUILD_MODE=PVA_BUILD_MODE_L4T \
	-DPVA_BUILD_MODE_BAREMETAL=5 \
	-DPVA_BUILD_MODE_L4T=3 \
	-DPVA_BUILD_MODE_NATIVE=1 \
	-DPVA_BUILD_MODE_QNX=2 \
	-DPVA_BUILD_MODE_SIM=4 \
	-DPVA_DEV_MAIN_COMPATIBLE=1 \
	-DPVA_ENABLE_CUDA=1 \
	-DPVA_IS_DEBUG=0 \
	-DPVA_SAFETY=0 \
	-DPVA_SKIP_SYMBOL_TYPE_CHECK \
	-DPVA_SUPPORT_XBAR_RAW=1 \
	-Dpva_kmd_linux_dummy_EXPORTS \

###### End generated section ######

nvhost-pva-objs += $(pva_objs)
ccflags-y += $(pva_inc_flags)
ccflags-y += $(pva_def_flags)
ccflags-y += -std=gnu11

endif
endif
endif
drivers/video/tegra/host/pva/src/fw/baremetal/include/pva-bit.h | 196 lines (new file)
@@ -0,0 +1,196 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

/*
 * Unit: Utility Unit
 * SWUD Document:
 * p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
 */
#ifndef PVA_BIT_H
#define PVA_BIT_H

/*
 * Bit manipulation macros
 */

/**
 * @brief Number of bits per byte.
 */
#define PVA_BITS_PER_BYTE (8UL)

/**
 * @defgroup PVA_BIT8_HELPER
 *
 * @brief Bit manipulation macros for numbers of type uint8_t.
 * Parameters that convey a bit position must be in the range
 * 0 to 7 inclusive.
 * Parameters for MSB and LSB must both be in the range
 * 0 to 7 inclusive, with MSB greater than LSB.
 * @{
 */
/**
 * @brief Macro to set a given bit position in a variable of type uint8_t.
 */
#define PVA_BIT8(_b_) ((uint8_t)(((uint8_t)1U << (_b_)) & 0xffu))

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Macro used to generate a bit-mask from MSB to LSB in a uint8_t variable.
 * This macro sets all the bits from MSB to LSB.
 */
#define PVA_MASK8(_msb_, _lsb_) \
	((uint8_t)((((PVA_BIT8(_msb_) - 1U) | PVA_BIT8(_msb_)) & \
		    ~(PVA_BIT8(_lsb_) - 1U)) & \
		   0xffu))
//! @endcond
/** @} */

/**
 * @defgroup PVA_BIT16_HELPER
 *
 * @brief Bit manipulation macros for numbers of type uint16_t.
 * Parameters that convey a bit position must be in the range
 * 0 to 15 inclusive.
 * Parameters for MSB and LSB must both be in the range
 * 0 to 15 inclusive, with MSB greater than LSB.
 * @{
 */
/**
 * @brief Macro to set a given bit position in a 16 bit number.
 */
#define PVA_BIT16(_b_) ((uint16_t)(((uint16_t)1U << (_b_)) & 0xffffu))

/**
 * @brief Macro to mask a range (MSB to LSB) of bit positions in a 16 bit number.
 * This will set all the bit positions in the specified range.
 */
#define PVA_MASK16(_msb_, _lsb_) \
	((uint16_t)((((PVA_BIT16(_msb_) - 1U) | PVA_BIT16(_msb_)) & \
		     ~(PVA_BIT16(_lsb_) - 1U)) & \
		    0xffffu))

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Macro to extract bits from a 16 bit number.
 * The bits are extracted from the range provided and the extracted
 * number is finally cast to the type provided as argument.
 */
#define PVA_EXTRACT16(_x_, _msb_, _lsb_, _type_) \
	((_type_)(((_x_)&PVA_MASK16((_msb_), (_lsb_))) >> (_lsb_)))
//! @endcond

/**
 * @brief Macro to insert a range of bits (MSB to LSB) into a 16 bit number.
 * The value to insert is shifted up to LSB and masked to the range.
 */
#define PVA_INSERT16(_x_, _msb_, _lsb_) \
	((((uint16_t)(_x_)) << (_lsb_)) & PVA_MASK16((_msb_), (_lsb_)))
/** @} */

/**
 * @defgroup PVA_BIT32_HELPER
 *
 * @brief Bit manipulation macros for numbers of type uint32_t.
 * Parameters that convey a bit position must be in the range
 * 0 to 31 inclusive.
 * Parameters for MSB and LSB must both be in the range
 * 0 to 31 inclusive, with MSB greater than LSB.
 * @{
 */

/**
 * @brief Macro to set a given bit position in a 32 bit number.
 */
#define PVA_BIT(_b_) ((uint32_t)(((uint32_t)1U << (_b_)) & 0xffffffffu))

/**
 * @brief Macro to mask a range (MSB to LSB) of bit positions in a 32 bit number.
 * This will set all the bit positions in the specified range.
 */
#define PVA_MASK(_msb_, _lsb_) \
	(((PVA_BIT(_msb_) - 1U) | PVA_BIT(_msb_)) & ~(PVA_BIT(_lsb_) - 1U))

/**
 * @brief Macro to extract bits from a 32 bit number.
 * The bits are extracted from the range provided and the extracted
 * number is finally cast to the type provided as argument.
 */
#define PVA_EXTRACT(_x_, _msb_, _lsb_, _type_) \
	((_type_)(((_x_)&PVA_MASK((_msb_), (_lsb_))) >> (_lsb_)))

/**
 * @brief Macro to insert a range of bits into a 32 bit number.
 * The bits are derived from the number passed as argument.
 */
#define PVA_INSERT(_x_, _msb_, _lsb_) \
	((((uint32_t)(_x_)) << (_lsb_)) & PVA_MASK((_msb_), (_lsb_)))
/** @} */
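The 32-bit helpers above are the ones used most widely across the KMD and FW sources. As a hedged illustration (not part of the original header), the sketch below shows how PVA_MASK, PVA_EXTRACT and PVA_INSERT compose to update one field of a register word; the [23:16] field position is invented for the example.

/* Illustrative sketch only; the field position is an arbitrary example. */
#include <stdint.h>

static inline uint32_t example_replace_field(uint32_t word, uint8_t value)
{
	/* Clear bits [23:16], then insert the new value into that range. */
	word &= ~PVA_MASK(23U, 16U);
	word |= PVA_INSERT((uint32_t)value, 23U, 16U);
	return word;
}

static inline uint8_t example_read_field(uint32_t word)
{
	/* Extract bits [23:16] and cast the result to uint8_t. */
	return PVA_EXTRACT(word, 23U, 16U, uint8_t);
}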
/**
 * @defgroup PVA_BIT64_HELPER
 *
 * @brief Bit manipulation macros for numbers of type uint64_t.
 * Parameters that convey a bit position must be in the range
 * 0 to 63 inclusive.
 * Parameters for MSB and LSB must both be in the range
 * 0 to 63 inclusive, with MSB greater than LSB.
 * @{
 */
/**
 * @brief Macro to set a given bit position in a 64 bit number.
 */
#define PVA_BIT64(_b_) \
	((uint64_t)(((uint64_t)1UL << (_b_)) & 0xffffffffffffffffu))

/**
 * @brief Macro used to generate a bit-mask from MSB to LSB in a uint64_t variable.
 * This macro sets all the bits from MSB to LSB.
 */
#define PVA_MASK64(_msb_, _lsb_) \
	(((PVA_BIT64(_msb_) - (uint64_t)1U) | PVA_BIT64(_msb_)) & \
	 ~(PVA_BIT64(_lsb_) - (uint64_t)1U))

/**
 * @brief Macro to extract bits from a 64 bit number.
 * The bits are extracted from the range provided and the extracted
 * number is finally cast to the type provided as argument.
 */
#define PVA_EXTRACT64(_x_, _msb_, _lsb_, _type_) \
	((_type_)(((_x_)&PVA_MASK64((_msb_), (_lsb_))) >> (_lsb_)))

/**
 * @brief Macro to insert a range of bits into a 64 bit number.
 * The bits are derived from the number passed as argument.
 */
#define PVA_INSERT64(_x_, _msb_, _lsb_) \
	((((uint64_t)(_x_)) << (_lsb_)) & PVA_MASK64((_msb_), (_lsb_)))

/**
 * @brief Macro to pack two 32 bit values into a 64 bit number.
 * The upper 32 bits of the result are taken from the lower 32 bits of the
 * second argument and the lower 32 bits are taken from the lower 32 bits
 * of the first argument.
 */
#define PVA_PACK64(_l_, _h_) \
	(PVA_INSERT64((_h_), 63U, 32U) | PVA_INSERT64((_l_), 31U, 0U))

/**
 * @brief Macro to extract the higher 32 bits from a 64 bit number.
 */
#define PVA_HI32(_x_) ((uint32_t)(((_x_) >> 32U) & 0xFFFFFFFFU))

/**
 * @brief Macro to extract the lower 32 bits from a 64 bit number.
 */
#define PVA_LOW32(_x_) ((uint32_t)((_x_)&0xFFFFFFFFU))
/** @} */
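A brief sketch (illustrative only, not part of the original file) showing that PVA_PACK64 and the PVA_HI32/PVA_LOW32 accessors round-trip a 64-bit value that has been split into two 32-bit halves; the value below is made up.

/* Illustrative round-trip; the value is arbitrary. */
#include <stdint.h>

static inline void example_pack64_roundtrip(void)
{
	uint64_t addr = 0x0000001234abcd00ULL;
	uint32_t lo = PVA_LOW32(addr);          /* 0x34abcd00 */
	uint32_t hi = PVA_HI32(addr);           /* 0x00000012 */
	uint64_t repacked = PVA_PACK64(lo, hi); /* equals addr again */
	(void)repacked;
}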

#endif
@@ -0,0 +1,316 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

/*
 * Unit: Utility Unit
 * SWUD Document:
 * p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
 */
#ifndef PVA_CHECKPOINT_H
#define PVA_CHECKPOINT_H

/**
 * @file pva-checkpoint.h
 * @brief Defines macros to create a checkpoint
 */

/**
 * @defgroup PVA_CHECKPOINT_MACROS Macros to define a checkpoint
 *
 * @brief Checkpoints are 32-bit status values that can be written to a status
 * register during R5 execution. The 32-bit value is divided into four 8-bit fields:
 * - major code: major aspect (usually a unit) of the uCode. Bit position: [31:24].
 *   Valid values are defined at @ref PVA_CHECKPOINT_MAJOR_CODES.
 * - minor code: minor aspect (usually a function) of the uCode. The interpretation of the
 *   minor value is determined by the major value. Bit position: [23:16].
 * - flags: flags indicating the type of the checkpoint, such as an error checkpoint,
 *   a performance checkpoint, a checkpoint indicating the start of an operation,
 *   or a checkpoint indicating the end of an operation. Bit position: [15:8].
 *   Valid values are defined at @ref PVA_CHECKPOINT_FLAGS.
 * - sequence: disambiguates multiple checkpoints within a minor code or conveys additional
 *   information. The interpretation of the sequence value is determined by both the
 *   major and minor values. Bit position: [7:0].
 *   Valid values are any values from 0 to UINT8_MAX.
 * @{
 */
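The field layout above maps directly onto the 32-bit helpers from pva-bit.h. Purely as a hedged illustration (the firmware's actual checkpoint-composition macro is not shown in this commit), a checkpoint word could be assembled like this:

/* Hypothetical sketch; not an actual FW helper. Assumes pva-bit.h is included. */
#include <stdint.h>

static inline uint32_t example_make_checkpoint(uint8_t major, uint8_t minor,
					       uint8_t flags, uint8_t sequence)
{
	return PVA_INSERT((uint32_t)major, 31U, 24U) |
	       PVA_INSERT((uint32_t)minor, 23U, 16U) |
	       PVA_INSERT((uint32_t)flags, 15U, 8U) |
	       PVA_INSERT((uint32_t)sequence, 7U, 0U);
}
/* e.g. example_make_checkpoint(PVA_CHK_MAIN, 0x01U, 0x00U, 0x00U) */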
/**
 * @defgroup PVA_CHECKPOINT_MAJOR_CODES
 * @brief Macros to define the major code field of the checkpoint @ingroup PVA_CHECKPOINT_MACROS
 * @{
 */

/*
 * Operational major codes
 */

/**
 * @brief Major code for PVA during Boot.
 */
#define PVA_CHK_MAIN (0x01U)

//! @endcond

/**
 * @brief Error related major codes
 */
#define PVA_CHK_ABORT (0xFFU)

/** @} */

/**
 * @defgroup PVA_CHECKPOINT_HW_STATE_MINOR_CODES
 * @brief Macros to define the minor code field of the checkpoints with major code PVA_CHK_HW_STATE
 * @ingroup PVA_CHECKPOINT_MACROS
 *
 * @{
 */
/**
 * @brief Minor code while doing a MMIO HW state check.
 */
#define PVA_CHK_HW_STATE_MMIO (0x01U)

/**
 * @brief Minor code while doing a VIC HW state check.
 */
#define PVA_CHK_HW_STATE_VIC (0x02U)

/**
 * @brief Minor code while doing an ARM register HW state check.
 */
#define PVA_CHK_HW_STATE_ARM (0x03U)

/**
 * @brief Minor code while doing a MPU HW state check.
 */
#define PVA_CHK_HW_STATE_MPU (0x04U)

/**
 * @brief Minor code while doing a DMA HW state check.
 */
#define PVA_CHK_HW_STATE_DMA (0x05U)

/**
 * @brief Minor code while doing a golden register HW state check.
 */
#define PVA_CHK_HW_STATE_GOLDEN (0x06U)
/** @} */

/** @} */

/**
 * @defgroup PVA_ABORT_REASONS
 *
 * @brief Macros to define the abort reasons
 * @{
 */
/**
 * @brief Minor code for abort due to assert.
 */
#define PVA_ABORT_ASSERT (0x01U)

/**
 * @brief Minor code for abort in case the pva main call fails.
 */
#define PVA_ABORT_FALLTHRU (0x02U)

/**
 * @brief Minor code for abort in case of fatal IRQ.
 */
#define PVA_ABORT_IRQ (0x05U)

/**
 * @brief Minor code for abort in case of MPU failure.
 */
#define PVA_ABORT_MPU (0x06U)

/**
 * @brief Minor code for abort in case of ARM exception.
 */
#define PVA_ABORT_EXCEPTION (0x07U)

/**
 * @brief Minor code for abort in case of un-supported SID read.
 */
#define PVA_ABORT_UNSUPPORTED (0x09U)

/**
 * @brief Minor code for abort in case of DMA failures.
 */
#define PVA_ABORT_DMA_TASK (0x0cU)

/**
 * @brief Minor code for abort in case of WDT failures.
 * Note: This code is not reported to HSM.
 */
#define PVA_ABORT_WATCHDOG (0x0eU)

//! @endcond

/**
 * @brief Minor code for abort in case of VPU init failures.
 */
#define PVA_ABORT_VPU (0x0fU)

/**
 * @brief Minor code for abort in case of DMA MISR setup failures.
 */
#define PVA_ABORT_DMA (0x10U)

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Minor code for abort in case of Mbox errors.
 * Note: This is used only in T19x
 */
#define PVA_ABORT_MBOX_WAR (0x12U)
//! @endcond

/**
 * @brief Minor code for abort in case of AISR errors.
 */
#define PVA_ABORT_AISR_QUEUE (0x14U)

/**
 * @brief Minor code for abort in case of bad task.
 */
#define PVA_ABORT_BAD_TASK (0x15U)

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Minor code for abort in case of PPE init failures.
 * Note: This is only used in T26x
 */
#define PVA_ABORT_PPE (0x16U)
//! @endcond

/**
 * @brief Minor code for abort in case of RAMIC failures.
 */
#define PVA_ABORT_RAMIC (0x20U)

/**
 * @brief Minor Code for SEC for safety errors.
 * Note: This code is not reported to HSM.
 */
#define PVA_ABORT_SEC_SERR (0x21U)

/**
 * @brief Minor Code for SEC for functional errors.
 * Note: This code is not reported to HSM.
 */
#define PVA_ABORT_SEC_FERR (0x22U)

/**
 * @brief Minor code for abort in case of firewall decode error.
 */
#define PVA_ABORT_L2SRAM_FWDEC (0x23U)

/**
 * @brief Minor code for abort in case of kernel panic.
 */
#define PVA_ABORT_KERNEL_PANIC (0x30U)

/**
 * @brief Minor code for abort in case of Batch Timeout.
 */
#define PVA_ABORT_BATCH_TIMEOUT (0x40U)

/**
 * @brief Minor code for abort in case of DMA transfer timeout
 * (while in the launch phase for the VPU).
 */
#define PVA_ABORT_DMA_SETUP_TIMEOUT (0x41U)

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Minor code used when NOC BIST is run.
 * Note: This is only used in T19x
 */
#define PVA_ABORT_NOC_BIST (0xfcU)
//! @endcond

/** @} */

/**
 * @defgroup PVA_ABORT_ARGUMENTS Macros to define the argument for the pva_abort operation
 *
 * @brief The argument of the pva_abort operation is updated in the status register.
 *
 */

/**
 * @defgroup PVA_ABORT_ARGUMENTS_MPU
 * @brief Argument to pva_abort() from MPU operations
 * @ingroup PVA_ABORT_ARGUMENTS
 * @{
 */
/**
 * @brief Minor code when there is an error while configuring MPU.
 */
#define PVA_ABORT_MPU_CONFIG (0xE001U)

/**
 * @brief Minor code when there is an error while initializing MPU.
 */
#define PVA_ABORT_MPU_INIT (0xE002U)
/** @} */

/**
 * @defgroup PVA_ABORT_ARGUMENTS_VPU
 * @brief Argument to pva_abort() from VPU operations
 * @ingroup PVA_ABORT_ARGUMENTS
 * @{
 */
/**
 * @brief Minor code when VPU is in debug state.
 */
#define PVA_ABORT_VPU_DEBUG (0xE001U)
/** @} */

/**
 * @defgroup PVA_ABORT_ARGUMENTS_PPE
 * @brief Argument to pva_abort() from PPE operations
 * @ingroup PVA_ABORT_ARGUMENTS
 * @{
 */
/**
 * @brief Minor code when PPE is in debug state.
 */
#define PVA_ABORT_PPE_DEBUG (0xE002U)
/** @} */

/**
 * @brief Minor Code when DMA state is not idle to perform
 * DMA MISR setup.
 */
#define PVA_ABORT_DMA_MISR_BUSY (0xE001U)
/**
 * @brief Minor Code in DMA when MISR has timed out
 */
#define PVA_ABORT_DMA_MISR_TIMEOUT (0xE002U)

/**
 * @defgroup PVA_ABORT_ARGUMENTS_IRQ Argument to pva_abort() from IRQs
 * @ingroup PVA_ABORT_ARGUMENTS
 * @{
 */

/**
 * @brief Minor Code for Command FIFO used by Interrupt Handler.
 */
#define PVA_ABORT_IRQ_CMD_FIFO (0xE001U)

#if (0 == DOXYGEN_DOCUMENTATION)
#define PVA_ABORT_IRQ_TEST_HOST (0xE002U)
#endif
/** @} */
#endif
@@ -0,0 +1,231 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

/*
 * Unit: Utility Unit
 * SWUD Document:
 * p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
 */
#ifndef PVA_CONFIG_H
#define PVA_CONFIG_H

#include <pva-types.h>
#include "pva_fw_constants.h"

/**
 * @defgroup PVA_CONFIG_PARAMS
 *
 * @brief PVA Configuration parameters.
 * @{
 */
/**
 * @brief Queue id for queue0.
 */
#define PVA_FW_QUEUE_0 (0U)

/**
 * @brief Total number of queues that are present
 * for communication between KMD and FW.
 */
#define PVA_NUM_QUEUES (8U)

/**
 * @brief Maximum queue id value in the PVA System.
 */
#define PVA_MAX_QUEUE_ID (PVA_NUM_QUEUES - 1U)

/**
 * @brief Maximum number of tasks supported by a queue.
 */
#define MAX_QUEUE_DEPTH (256U)

/**
 * @brief Number of Hardware Semaphore registers in the PVA System.
 */
#define PVA_NUM_SEMA_REGS (4U)

/**
 * @brief Number of Hardware Mailbox registers in the PVA System.
 */
#define PVA_NUM_MBOX_REGS (8U)

/**
 * @brief Maximum number of Pre-Actions for a task.
 */
#define PVA_MAX_PREACTIONS (26U)

/**
 * @brief Maximum number of Post-Actions for a task.
 */
#define PVA_MAX_POSTACTIONS (28U)

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Maximum number of DMA channels for T26x.
 */
#define PVA_NUM_DMA_CHANNELS_T26X (8U)

/**
 * @brief Total number of AXI data buffers for T26x.
 */
#define PVA_NUM_DMA_ADB_BUFFS_T26X (304U)

/**
 * @brief Number of reserved AXI data buffers for T26x.
 */
#define PVA_NUM_RESERVED_ADB_BUFFERS_T26X (16U)

/**
 * @brief Number of dynamic AXI data buffers for T26x.
 * These exclude the reserved AXI data buffers from the total available ones.
 */
#define PVA_NUM_DYNAMIC_ADB_BUFFS_T26X \
	(PVA_NUM_DMA_ADB_BUFFS_T26X - PVA_NUM_RESERVED_ADB_BUFFERS_T26X)

/**
 * @brief Maximum number of DMA channels for T23x.
 */
#define PVA_NUM_DMA_CHANNELS_T23X (16U)
//! @endcond

/**
 * @brief Number of DMA descriptors for T19x.
 */
#define PVA_NUM_DMA_DESCS_T19X (64U)
/**
 * @brief Number of DMA descriptors for T23x.
 */
#define PVA_NUM_DMA_DESCS_T23X (64U)
/**
 * @brief Number of DMA descriptors for T26x.
 */
#define PVA_NUM_DMA_DESCS_T26X (96U)

/**
 * @brief Number of reserved DMA channels. These channels
 * are reserved per DMA for R5 transfers and are used by
 * R5 to transfer the data it needs.
 */
#define PVA_NUM_RESERVED_CHANNELS (1U)

/**
 * @brief Number of dynamic DMA descriptors for T19x. These descriptors can be
 * used by the VPU application to transfer data. These exclude
 * the reserved descriptors from the total available ones.
 */
#define PVA_NUM_DYNAMIC_DESCS_T19X \
	(PVA_NUM_DMA_DESCS_T19X - PVA_NUM_RESERVED_DESCRIPTORS)
/**
 * @brief Number of dynamic DMA descriptors for T23x. These descriptors can be
 * used by the VPU application to transfer data. These exclude
 * the reserved descriptors from the total available ones.
 */
#define PVA_NUM_DYNAMIC_DESCS_T23X \
	(PVA_NUM_DMA_DESCS_T23X - PVA_NUM_RESERVED_DESCRIPTORS)
/**
 * @brief Number of dynamic DMA descriptors for T26x. These descriptors can be
 * used by the VPU application to transfer data. These exclude
 * the reserved descriptors from the total available ones.
 */
#define PVA_NUM_DYNAMIC_DESCS_T26X \
	(PVA_NUM_DMA_DESCS_T26X - PVA_NUM_RESERVED_DESCRIPTORS)
/**
 * Note: T26x will be brought up first on Linux, and then on QNX. To support this,
 * the following macro is needed so that the QNX driver can build without requiring
 * any changes.
 */
#define PVA_NUM_DYNAMIC_DESCS (PVA_NUM_DYNAMIC_DESCS_T23X)

/**
 * @brief Number of reserved AXI data buffers for T23x.
 */
#define PVA_NUM_RESERVED_ADB_BUFFERS_T23X (16U)

/**
 * @brief Number of reserved VMEM data buffers.
 */
#define PVA_NUM_RESERVED_VDB_BUFFERS (0U)

/**
 * @brief Total number of VMEM data buffers.
 */
#define PVA_NUM_DMA_VDB_BUFFS (128U)

/**
 * @brief Total number of AXI data buffers for T23x.
 */
#define PVA_NUM_DMA_ADB_BUFFS_T23X (272U)

/**
 * @brief Number of dynamic AXI data buffers for T23x.
 * These exclude the reserved AXI data buffers from the total available ones.
 */
#define PVA_NUM_DYNAMIC_ADB_BUFFS_T23X \
	(PVA_NUM_DMA_ADB_BUFFS_T23X - PVA_NUM_RESERVED_ADB_BUFFERS_T23X)

/**
 * @brief Number of dynamic VMEM data buffers for T23x.
 * These exclude the reserved VMEM data buffers from the total available ones.
 */
#define PVA_NUM_DYNAMIC_VDB_BUFFS \
	(PVA_NUM_DMA_VDB_BUFFS - PVA_NUM_RESERVED_VDB_BUFFERS)

/**
 * @brief The first reserved DMA descriptor. This is used as a
 * starting point to iterate over reserved DMA descriptors.
 */
#define PVA_RESERVED_DESC_START (60U)

/**
 * @brief The first reserved AXI data buffer. This is used as a
 * starting point to iterate over reserved AXI data buffers.
 */
#define PVA_RESERVED_ADB_BUFF_START PVA_NUM_DYNAMIC_ADB_BUFFS

/**
 * @brief This macro has the value to be set by KMD in the shared semaphores
 * @ref PVA_PREFENCE_SYNCPT_REGION_IOVA_SEM or @ref PVA_POSTFENCE_SYNCPT_REGION_IOVA_SEM
 * if the syncpoint reserved region must not be configured as uncached
 * in the R5 MPU.
 */
#define PVA_R5_SYNCPT_REGION_IOVA_OFFSET_NOT_SET (0xFFFFFFFFU)
/** @} */
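Most of these parameters only ever appear in simple arithmetic or bounds checks. As a hedged illustration (not part of the original header), the sketch below shows the typical pattern for validating a queue id against the limits defined above:

/* Illustrative helper only; the KMD's real validation code is not shown here. */
#include <stdbool.h>
#include <stdint.h>

static inline bool example_queue_id_is_valid(uint32_t queue_id)
{
	/* Valid ids run from PVA_FW_QUEUE_0 up to PVA_MAX_QUEUE_ID inclusive. */
	return queue_id <= PVA_MAX_QUEUE_ID;
}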
/**
 * @defgroup PVA_CONFIG_PARAMS_T19X
 *
 * @brief PVA Configuration parameters exclusively for T19X.
 * @{
 */
/**
 * @brief Number of DMA channels for T19x or Xavier.
 */
#define PVA_NUM_DMA_CHANNELS_T19X (14U)

/**
 * @brief Number of reserved AXI data buffers for T19x.
 */
#define PVA_NUM_RESERVED_ADB_BUFFERS_T19X (8U)

/**
 * @brief Total number of AXI data buffers for T19x.
 */
#define PVA_NUM_DMA_ADB_BUFFS_T19X (256U)

/**
 * @brief Number of dynamic AXI data buffers for T19x.
 * These exclude the reserved AXI data buffers from the total available ones.
 */
#define PVA_NUM_DYNAMIC_ADB_BUFFS_T19X \
	(PVA_NUM_DMA_ADB_BUFFS_T19X - PVA_NUM_RESERVED_ADB_BUFFERS_T19X)

/** @} */
#endif
@@ -0,0 +1,428 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

/*
 * Unit: Utility Unit
 * SWUD Document:
 * p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
 */
#ifndef PVA_ERRORS_H
#define PVA_ERRORS_H

#include <stdint.h>
#include <pva-packed.h>

/**
 * @brief PVA Error codes
 */
typedef uint16_t pva_errors_t;

/**
 * @defgroup PVA_ERRORS
 *
 * @brief General and interface errors of PVA.
 * @{
 */
/**
 * @brief In case of no error.
 */
#define PVA_ERR_NO_ERROR (0x0U)

/**
 * @brief Error in case of an illegal command.
 * PVA FW executes commands that are found
 * in the command look-up table. If a command
 * is not part of the supported commands, this
 * error will be returned. Valid commands can be
 * referred to at @ref pva_cmd_lookup_t.
 */
#define PVA_ERR_BAD_CMD (0x1U)

/**
 * @brief Error in case of a bad queue id, i.e.
 * the requested queue id is not available.
 */
#define PVA_ERR_BAD_QUEUE_ID (0x3U)

/**
 * @brief Error in case of an invalid pve-id. This
 * error is generated if the PVE id is greater
 * than @ref PVA_NUM_PVE.
 */
#define PVA_ERR_BAD_PVE_ID (0x4U)

/**
 * @brief Error in case the number of pre-actions
 * is more than what can be accommodated.
 */
#define PVA_ERR_BUFF_TOO_SMALL (0x5U)

/**
 * @brief Error in case the requested feature cannot be satisfied.
 * This error arises in scenarios where certain actions are
 * not supported during execution of pre-actions or post-actions.
 * For instance, @ref TASK_ACT_WRITE_STATUS is not supported
 * while executing the pre-actions of a task.
 */
#define PVA_ERR_FEATURE_NOT_SUPPORTED (0x6U)

/**
 * @brief Error in case the address generated or translated does not
 * meet constraints such as alignment or being non-null.
 */
#define PVA_ERR_BAD_ADDRESS (0x9U)

/**
 * @brief Error in case a timestamp is requested on an un-supported action.
 */
#define PVA_ERR_BAD_TIME_VALUE (0xdU)
#if PVA_SAFETY == 0
/**
 * @brief Error in case the register provided to update
 * the status is invalid.
 */
#define PVA_ERR_BAD_STATUS_REG (0x10U)
#endif
//! @endcond
/**
 * @brief Error in case of a bad task.
 * This error is returned in scenarios where the task does not meet
 * the necessary criteria, such as being non-zero or 64 byte aligned.
 */
#define PVA_ERR_BAD_TASK (0x15U)

/**
 * @brief Error in case of an invalid task action list. An invalid
 * action list arises in scenarios such as the number of
 * pre- and post-actions not being zero while the actual
 * pre- or post-action to be performed is NULL.
 */
#define PVA_ERR_BAD_TASK_ACTION_LIST (0x16U)

/**
 * @brief Error when the internal state of a task is not as expected.
 * A task transitions through various states while
 * executing. When a state is not coherent with the
 * action being performed, this error is returned.
 * For example, a task cannot be in a running state
 * while tear-down is being performed.
 */
#define PVA_ERR_BAD_TASK_STATE (0x17U)

/**
 * @brief Error when there is a mismatch between the input status and the actual status.
 * This error occurs when there is a mismatch between the status from @ref pva_gen_task_status_t
 * and the actual status that is populated by FW during task execution.
 */
#define PVA_ERR_TASK_INPUT_STATUS_MISMATCH (0x18U)

/**
 * @brief Error in case of invalid parameters. These errors occur when
 * the parameters passed are invalid; this applies to task parameters
 * and DMA parameters.
 */
#define PVA_ERR_BAD_PARAMETERS (0x1aU)

/**
 * @brief Error in case a timeout occurred for a batch of tasks.
 */
#define PVA_ERR_PVE_TIMEOUT (0x23U)

/**
 * @brief Error when VPU has halted or turned off.
 */
#define PVA_ERR_VPU_ERROR_HALT (0x25U)

/**
 * @brief Error after FW sends an abort signal to KMD. KMD will write into status buffers for
 * pending tasks after FW sends an abort signal to KMD.
 */
#define PVA_ERR_VPU_BAD_STATE (0x28U)

/**
 * @brief Error in case of exiting VPU.
 */
#define PVA_ERR_VPU_EXIT_ERROR (0x2aU)
//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Error in case of exiting PPE.
 */
#define PVA_ERR_PPE_EXIT_ERROR (0x2bU)
//! @endcond
/**
 * @brief Error when a task running on PVE caused an abort on PVE.
 */
#define PVA_ERR_PVE_ABORT (0x2dU)
/**
 * @brief Error in case of Floating point NAN.
 */

//! @cond DISABLE_DOCUMENTATION
/**
 * @brief Error in case of PPE divide by zero.
 */
#define PVA_ERR_PPE_DIVIDE_BY_0 (0x34U)
/**
 * @brief Error in case of an illegal PPE debug operation.
 */
#define PVA_ERR_PPE_ILLEGAL_DEBUG (0x36U)

#define PVA_ERR_PPE_ILLEGAL_INSTR_ALIGN (0x37U)

/**
 * @brief Error in case of a bad cached DRAM segment.
 */
#define PVA_ERR_BAD_CACHED_DRAM_SEG (0x3aU)

/**
 * @brief Error in case of a bad DRAM IOVA.
 */
#define PVA_ERR_BAD_DRAM_IOVA (0x3cU)
//! @endcond

/**
 * @brief Error in case of a register mismatch.
 */
#define PVA_ERR_REG_MISMATCH (0x3dU)

/**
 * @brief Error in case the AISR input queue is empty.
 */
#define PVA_ERR_AISR_INPUT_QUEUE_EMPTY (0x3fU)

/**
 * @brief Error in case the AISR output queue is full.
 */
#define PVA_ERR_AISR_OUTPUT_QUEUE_FULL (0x40U)
#if (PVA_HAS_L2SRAM == 1)
/**
 * @brief Error in case an L2SRAM allocation failed due to invalid parameters.
 */
#define PVA_ERR_BAD_L2SRAM_PARAMS (0x41U)
#endif
/**
 * @brief Error in case of bad or invalid task parameters.
 */
#define PVA_ERR_BAD_TASK_PARAMS (0x42U)
/**
 * @brief Error in case of an invalid VPU system call.
 */
#define PVA_ERR_VPU_SYS_ERROR (0x43U)
/**
 * @brief Error in case of HW Watchdog timer timeout.
 */
#define PVA_ERR_WDT_TIMEOUT_ERROR (0x44U)
/**
 * @brief Error in case of a Golden register check value mismatch.
 */
#define PVA_ERR_GR_REG_MISMATCH (0x45U)
/**
 * @brief Error in case of a Critical register check value mismatch.
 */
#define PVA_ERR_CRIT_REG_MISMATCH (0x46U)
/** @} */
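Most of these codes surface in the KMD as the per-task status written back after execution. As a hedged sketch (the helper below is invented for illustration and is not an actual KMD or FW function), a caller might classify them like this:

/* Hypothetical classification helper; not part of the KMD/FW API. */
#include <stdbool.h>

static inline bool example_status_is_failure(pva_errors_t status)
{
	switch (status) {
	case PVA_ERR_NO_ERROR:
		return false;            /* task completed normally */
	case PVA_ERR_VPU_ERROR_HALT:     /* VPU halted or turned off */
	case PVA_ERR_VPU_BAD_STATE:      /* FW already signalled an abort */
	default:
		return true;             /* treat everything else as a failure */
	}
}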
/**
 * @defgroup PVA_DMA_ERRORS
 *
 * @brief DMA error codes used across PVA.
 * @{
 */
/**
 * @brief Error when the DMA transfer mode in a DMA descriptor is invalid.
 */
#define PVA_ERR_DMA_TRANSFER_TYPE_INVALID (0x204U)

/**
 * @brief Error when a DMA transfer was not successful.
 */
#define PVA_ERR_DMA_CHANNEL_TRANSFER (0x207U)

/**
 * @brief Error in case of a bad DMA descriptor.
 */
#define PVA_ERR_BAD_DMA_DESC_ID (0x208U)

/**
 * @brief Error in case of a bad DMA channel ID.
 */
#define PVA_ERR_BAD_DMA_CHANNEL_ID (0x209U)

/**
 * @brief Error in case of DMA timeout.
 */
#define PVA_ERR_DMA_TIMEOUT (0x20bU)

/**
 * @brief Error when a program tries to use a channel that is already active.
 */
#define PVA_ERR_DMA_INVALID_CONFIG (0x220U)

/**
 * @brief Error in case a DMA transfer was not successful.
 */
#define PVA_ERR_DMA_ERROR (0x221U)

/**
 * @brief Error when the number of bytes of a HW Seq data copy is
 * not a multiple of 4.
 */
#define PVA_ERR_DMA_HWSEQ_BAD_PROGRAM (0x216U)

/**
 * @brief Error when the number of bytes of a HW Seq data copy is
 * more than the HW Seq RAM size.
 */
#define PVA_ERR_DMA_HWSEQ_PROGRAM_TOO_LONG (0x217U)
/** @} */

/**
 * @defgroup PVA_MISR_ERRORS
 *
 * @brief MISR error codes used across PVA.
 * @{
 */
/**
 * @brief Error status when the DMA MISR test is not run.
 */
#define PVA_ERR_MISR_NOT_RUN (0x280U)
/**
 * @brief Error status when the DMA MISR test did not complete.
 */
#define PVA_ERR_MISR_NOT_DONE (0x281U)
/**
 * @brief Error status when the DMA MISR test timed out.
 */
#define PVA_ERR_MISR_TIMEOUT (0x282U)
/**
 * @brief Error status in case of a DMA MISR test address failure.
 */
#define PVA_ERR_MISR_ADDR (0x283U)
/**
 * @brief Error status in case of a DMA MISR test data failure.
 */
#define PVA_ERR_MISR_DATA (0x284U)
/**
 * @brief Error status in case of a DMA MISR test data and address failure.
 */
#define PVA_ERR_MISR_ADDR_DATA (0x285U)
/** @} */

/**
 * @defgroup PVA_VPU_ISR_ERRORS
 *
 * @brief VPU ISR error codes used across PVA.
 * @{
 */
/**
 * @defgroup PVA_FAST_RESET_ERRORS
 *
 * @brief Fast reset error codes used across PVA.
 * @{
 */
/**
 * @brief Error when VPU is not in an idle state for a reset to be done.
 */
#define PVA_ERR_FAST_RESET_TIMEOUT_VPU (0x401U)
/**
 * @brief Error if the VPU I-Cache is busy before checking the DMA engine for an idle state.
 */
#define PVA_ERR_FAST_RESET_TIMEOUT_ICACHE1 (0x402U)
/**
 * @brief Error if the DMA channel is busy for a reset to be done.
 */
#define PVA_ERR_FAST_RESET_TIMEOUT_CH0 (0x403U)
/**
 * @brief Error if the VPU I-Cache is busy after checking the DMA engine for an idle state.
 */
#define PVA_ERR_FAST_RESET_TIMEOUT_ICACHE2 (0x419U)

#if (PVA_CHIP_ID == CHIP_ID_T26X)
/**
 * @brief Error when PPE is not in an idle state for a reset to be done.
 */
#define PVA_ERR_FAST_RESET_TIMEOUT_PPE (0x420U)
#endif
/** @} */

/**
 * @defgroup PVA_L2SRAM_ERRORS
 *
 * @brief L2SRAM memory error codes used across PVA.
 * @{
 */
/**
 * @brief Error if an L2SRAM memory allocation failed because of insufficient L2SRAM memory or
 * because 2 chunks of memory are already allocated.
 */
#define PVA_ERR_ALLOC_FAILED (0x812U)
/**
 * @brief Error if the L2SRAM address given for clearing/freeing is not a valid L2SRAM address.
 */
#define PVA_ERR_FREE_FAILED (0x813U)
/** @} */

/**
 * @defgroup PVA_INFO_ERRORS
 *
 * @brief Informational error codes.
 * @{
 */
/**
 * @brief Error when there is no task.
 */
#define PVA_ERR_NO_TASK (0x997U)
/**
 * @brief Error when enabling the CCQ IRQ line on VIC fails.
 */
#define PVA_ERR_CCQ_IRQ_ENABLE_FAILED (0x998U)
/**
 * @brief Error when enabling the Mailbox IRQ line on VIC fails.
 */
#define PVA_ERR_MBOX_IRQ_ENABLE_FAILED (0x999U)
/**
 * @brief Error when enabling the L2SRAM IRQ line on VIC fails.
 */
#define PVA_ERR_L2SRAM_IRQ_ENABLE_FAILED (0x99AU)
/**
 * @brief Error when enabling the DMA0 IRQ line on VIC fails.
 */
#define PVA_ERR_DMA0_IRQ_ENABLE_FAILED (0x99BU)
/**
 * @brief Error when enabling the DMA1 IRQ line on VIC fails.
 */
#define PVA_ERR_DMA1_IRQ_ENABLE_FAILED (0x99CU)
/**
 * @brief Error when enabling the VPU IRQ line on VIC fails.
 */
#define PVA_ERR_VPU_IRQ_ENABLE_FAILED (0x99DU)
/**
 * @brief Error when enabling the SEC IRQ line on VIC fails.
 */
#define PVA_ERR_SEC_IRQ_ENABLE_FAILED (0x99EU)
/**
 * @brief Error when enabling the RAMIC IRQ line on VIC fails.
 */
#define PVA_ERR_RAMIC_IRQ_ENABLE_FAILED (0x99FU)

/**
 * @brief Error indicating the operation should be tried again.
 * @note This error is internal to FW only.
 */
#define PVA_ERR_TRY_AGAIN (0x9A0U)
/** @} */

/* Never used */
#define PVA_ERR_MAX_ERR (0xFFFFU)

#endif /* PVA_ERRORS_H */
@@ -0,0 +1,189 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2016-2022 NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

/*
 * Unit: Host Interface Unit
 * SWUD Document:
 * p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
 */
#ifndef PVA_FW_VERSION_H
#define PVA_FW_VERSION_H

#include <pva-bit.h>

/*
 * Note: Below are doxygen comments with the @def command.
 * This allows the comment to be physically distant from the define
 * being documented, and allows for a single general comment
 * regardless of the value being assigned to the macro.
 */

/**
 * @defgroup PVA_VERSION_TYPE_FLAGS VERSION_TYPE Bit Flags
 *
 * @brief The bit flags that indicate the qualities of the built Firmware,
 * e.g. Debug, Safety, Test Features, etc.
 *
 * @see VERSION_TYPE
 * @{
 */

/**
 * @def VERSION_CODE_DEBUG
 * @brief Set or clear the 'debug' bit for the FW version type value. For a safety
 * build the value of this define will be zero.
 *
 * @details This bit is set if the macro @r PVA_DEBUG is defined.
 * @see PVA_DEBUG
 */
#if PVA_DEBUG == 1
#define VERSION_CODE_DEBUG PVA_BIT(0)
#else
#define VERSION_CODE_DEBUG (0U)
#endif

/**
 * @def VERSION_CODE_SAFETY
 * @brief Set or clear the 'safety' bit for the FW version type value. For a safety
 * build the value of this define will be non-zero.
 *
 * @details This bit is set if the macro @r PVA_SAFETY is defined.
 * Building for Safety disables certain functions that are used for debug or testing,
 * or would otherwise pose a risk to a system conforming to safety protocols such as ISO-26262 or
 * ASPICE.
 *
 * @see PVA_SAFETY
 */
#if PVA_SAFETY == 1
#define VERSION_CODE_SAFETY PVA_BIT(1)
#else
#define VERSION_CODE_SAFETY (0U)
#endif

/**
 * @def VERSION_CODE_PVA_TEST_SUPPORT
 * @brief Set or clear the 'test support' bit for the FW version type value.
 *
 * @details This bit is set if the macro @r TEST_TASK is defined.
 * This bit is expected to be unset during a safety build.
 *
 * Building with test support enabled may add additional commands that
 * can be processed by the FW to aid in testing of the system code. Often code of this
 * nature can change the processing, memory, or timing characteristics of the system,
 * and should only be enabled when explicitly needed.
 *
 * @see TEST_TASK
 */
#if TEST_TASK == 1
#define VERSION_CODE_PVA_TEST_SUPPORT PVA_BIT(2)
#else
#define VERSION_CODE_PVA_TEST_SUPPORT (0U)
#endif

/**
 * @def VERSION_CODE_STANDALONE_TESTS
 * @brief Set or clear the 'standalone tests' bit for the FW version type value.
 *
 * @details This bit is set if the macro @r TEST_TASK is defined.
 * This bit is expected to be unset during a safety build.
 *
 * @see TEST_TASK
 */
#if TEST_TASK == 1
#define VERSION_CODE_STANDALONE_TESTS PVA_BIT(3)
#else
#define VERSION_CODE_STANDALONE_TESTS (0U)
#endif
/** @} */

/**
 * @defgroup PVA_VERSION_MACROS PVA version macros used to calculate the PVA
 * FW binary version.
 * @{
 */

/**
 * @brief An 8-bit bit field that describes which conditionally compiled facets of the Firmware
 * have been enabled.
 *
 * @details The value of this macro is used when constructing a 32-bit Firmware Version identifier.
 *
 @verbatim
 | Bit | Structure Field Name           | Condition for Enabling |
 |:---:|:------------------------------:|:----------------------:|
 | 0   | VERSION_CODE_DEBUG             | This bit is set when the Firmware is built with @ref PVA_DEBUG defined as equalling 1. |
 | 1   | VERSION_CODE_SAFETY            | This bit is set when the Firmware is built with @ref PVA_SAFETY defined as equalling 1. |
 | 2   | VERSION_CODE_PVA_TEST_SUPPORT  | This bit is set when the Firmware is built with @ref TEST_TASK defined as equalling 1. |
 | 3   | VERSION_CODE_STANDALONE_TESTS  | This bit is set when the Firmware is built with @ref TEST_TASK defined as equalling 1. |
 | 4-7 | Reserved                       | The remaining bits of the bit field are undefined. |
 @endverbatim
 * @see PVA_VERSION_TYPE_FLAGS
 */
#define VERSION_TYPE \
	(uint32_t) VERSION_CODE_DEBUG | (uint32_t)VERSION_CODE_SAFETY | \
		(uint32_t)VERSION_CODE_PVA_TEST_SUPPORT | \
		(uint32_t)VERSION_CODE_STANDALONE_TESTS
/** @} */
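PVA_MAKE_VERSION itself is referenced below but not shown in this hunk. Purely as a hedged illustration of how an 8-bit type field and the 8-bit major/minor/subminor values could be packed into one 32-bit identifier (the real macro may differ), a sketch might look like:

/* Hypothetical packing sketch; the actual PVA_MAKE_VERSION macro is not
 * visible in this diff. Uses the PVA_INSERT helper from pva-bit.h. */
#define EXAMPLE_MAKE_VERSION(_type_, _major_, _minor_, _subminor_)         \
	(PVA_INSERT((_type_), 31U, 24U) | PVA_INSERT((_major_), 23U, 16U) | \
	 PVA_INSERT((_minor_), 15U, 8U) | PVA_INSERT((_subminor_), 7U, 0U))

/* e.g. EXAMPLE_MAKE_VERSION(VERSION_TYPE, PVA_VERSION_MAJOR,
 *                           PVA_VERSION_MINOR, PVA_VERSION_SUBMINOR) */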
/**
 * @defgroup PVA_VERSION_VALUES PVA Major, Minor, and Subminor Version Values
 *
 * @brief The values listed below are applied to the corresponding fields when
 * the PVA_VERSION macro is used.
 *
 * @see PVA_VERSION, PVA_MAKE_VERSION
 * @{
 */

/**
 * @brief The Major version of the Firmware
 */
#define PVA_VERSION_MAJOR 0x08

/**
 * @brief The Minor version of the Firmware
 */
#define PVA_VERSION_MINOR 0x02

/**
 * @brief The sub-minor version of the Firmware.
 */
#define PVA_VERSION_SUBMINOR 0x03
/** @} */

/**
 * @def PVA_VERSION_GCID_REVISION
 * @brief The GCID Revision of the Firmware.
 *
 * @details If this value is not otherwise defined at build time, this fallback value is used.
 */
#ifndef PVA_VERSION_GCID_REVISION
/**
 * @brief GCID revision of the PVA FW binary.
 */
#define PVA_VERSION_GCID_REVISION 0x00000000
#endif

/**
 * @def PVA_VERSION_BUILT_ON
 * @brief The date and time the version of software was built, expressed as the number
 * of seconds since the Epoch (00:00:00 UTC, January 1, 1970).
 *
 * @details If this value is not otherwise defined at build time, this fallback value is used.
 */
#ifndef PVA_VERSION_BUILT_ON
#define PVA_VERSION_BUILT_ON 0x00000000
#endif
/** @} */

#endif
@@ -0,0 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

/*
 * Unit: Utility Unit
 * SWUD Document:
 * p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
 */
#ifndef PVA_PACKED_H
#define PVA_PACKED_H
/**
 * @brief Packed attribute that prevents the compiler from adding any padding.
 * The compiler implicitly adds padding between structure members
 * to keep them aligned; the packed attribute is used to avoid this.
 * Packed is used for structures shared between KMD and FW.
 * If packed is not used, we depend on whatever padding the compiler adds.
 * Since KMD and FW are compiled by two different compilers, we need to
 * ensure that the offset of each structure member is the same in
 * both KMD and FW. To ensure this, we pack the structure.
 */
#define PVA_PACKED __attribute__((packed))
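A minimal sketch (not from the original commit) of the kind of KMD/FW-shared layout this attribute is meant for, with a compile-time size check; the structure itself is invented for illustration.

/* Hypothetical example structure; real shared structures live in other headers. */
#include <stdint.h>

struct example_shared_record {
	uint32_t iova_lo;  /* lower 32 bits of an address */
	uint16_t flags;    /* arbitrary example field */
	uint8_t  id;
	uint8_t  pad;      /* explicit padding keeps the layout obvious */
} PVA_PACKED;

/* With PVA_PACKED both compilers agree on an 8-byte layout. */
_Static_assert(sizeof(struct example_shared_record) == 8, "layout mismatch");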
#endif // PVA_PACKED_H
@@ -0,0 +1,486 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2020-2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property
|
||||
* and proprietary rights in and to this software, related documentation
|
||||
* and any modifications thereto. Any use, reproduction, disclosure or
|
||||
* distribution of this software and related documentation without an express
|
||||
* license agreement from NVIDIA Corporation is strictly prohibited.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unit: Direct Memory Access Driver Unit
|
||||
* SWUD Document:
|
||||
* p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
|
||||
*/
|
||||
/**
|
||||
* @file pva-sys-dma.h
|
||||
*
|
||||
* @brief Types and constants related to PVA DMA setup and DMA
|
||||
* descriptors.
|
||||
*/
|
||||
|
||||
#ifndef PVA_SYS_DMA_H
|
||||
#define PVA_SYS_DMA_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <pva-bit.h>
|
||||
#include <pva-packed.h>
|
||||
|
||||
#include "pva_fw_dma_hw_interface.h"
|
||||
|
||||
/**
|
||||
* @brief The version number of the current DMA info structure.
|
||||
* This is used for detecting the DMA info updates for future
|
||||
* HW releases.
|
||||
*/
|
||||
#define PVA_DMA_INFO_VERSION_ID (1U)
|
||||
|
||||
/**
|
||||
* @brief Number of DMA done masks in DMA info structure,
|
||||
* corresponding to the number of DMA_COMMON_DMA_OUTPUT_ENABLEx
|
||||
* registers in the HW.
|
||||
*/
|
||||
#define PVA_SYS_DMA_NUM_TRIGGERS (9U)
|
||||
|
||||
/* NOTE : This must be kept as 15 for build to be
|
||||
* successful, because in pva_fw_test we configure
|
||||
* 15 channel, but internally we check if the
|
||||
* number of channels requested is less than the
|
||||
* maximum number of available channels */
|
||||
/**
|
||||
* @brief Maximum Number of DMA channel configurations
|
||||
* in DMA info structure.
|
||||
*/
|
||||
#define PVA_SYS_DMA_NUM_CHANNELS (15U)
|
||||
|
||||
/**
|
||||
* @brief Maximum number of DMA descriptors allowed
|
||||
* for use for VPU for T23x
|
||||
*/
|
||||
#define PVA_SYS_DMA_MAX_DESCRIPTORS_T23X (60U)
|
||||
/**
|
||||
* @brief Maximum number of DMA descriptors allowed
|
||||
* for use for VPU for T26x
|
||||
*/
|
||||
#define PVA_SYS_DMA_MAX_DESCRIPTORS_T26X (92U)
|
||||
|
||||
/**
|
||||
* @brief DMA registers for VPU0 and VPU1 which are primarily
|
||||
* used by DMA config and R5 initialization.
|
||||
*
|
||||
* For more information refer to section 3.4 in PVA Cluster IAS
|
||||
* document (Document 11 in Supporting Documentation and References)
|
||||
*/
|
||||
/**
|
||||
* @brief DMA channel base register for VPU0.
|
||||
*/
|
||||
#define PVA_DMA0_REG_CH_0 PVA_OFFSET(NV_ADDRESS_MAP_PVA0_DMA0_REG_CH_0_BASE)
|
||||
/**
|
||||
* @brief DMA common base register for VPU0.
|
||||
*/
|
||||
#define PVA_DMA0_COMMON PVA_OFFSET(NV_ADDRESS_MAP_PVA0_DMA0_COMMON_BASE)
|
||||
/**
|
||||
* @brief DMA DESCRAM base register for VPU0.
|
||||
*/
|
||||
#define PVA_DMA0_DESCRAM PVA_OFFSET(NV_ADDRESS_MAP_PVA0_DMA0_DESCRAM_BASE)
|
||||
/**
|
||||
* @brief DMA channel base register for VPU1.
|
||||
*/
|
||||
#define PVA_DMA1_REG_CH_0 PVA_OFFSET(NV_ADDRESS_MAP_PVA0_DMA1_REG_CH_0_BASE)
|
||||
/**
|
||||
* @brief DMA common base register for VPU1.
|
||||
*/
|
||||
#define PVA_DMA1_COMMON PVA_OFFSET(NV_ADDRESS_MAP_PVA0_DMA1_COMMON_BASE)
|
||||
/**
|
||||
* @brief DMA DESCRAM base register for VPU1.
|
||||
*/
|
||||
#define PVA_DMA1_DESCRAM PVA_OFFSET(NV_ADDRESS_MAP_PVA0_DMA1_DESCRAM_BASE)
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
*
|
||||
* @brief DMA channel configuration for a user task.
|
||||
*
|
||||
* The DMA channel structure contains the set-up of a
|
||||
* PVA DMA channel used for the VPU app.
|
||||
*
|
||||
* This VPU app should configure the channel information
|
||||
* in this format
|
||||
*
|
||||
* @note : For more information on channel configuration, refer section 4.1.2 and 6.4 in
|
||||
* the DMA IAS document (Document 6 in Supporting Documentation and References)
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/**
|
||||
* @brief HW DMA channel number from 1 to @ref PVA_NUM_DMA_CHANNELS.
|
||||
*/
|
||||
uint8_t ch_number;
|
||||
/**
|
||||
* @brief Padding bytes of 3 added to align the next
|
||||
* field of 4 bytes
|
||||
*/
|
||||
uint8_t pad_dma_channel1[3];
|
||||
/**
|
||||
* @brief The value to be written to DMA channel
|
||||
* control 0 register
|
||||
*/
|
||||
uint32_t cntl0;
|
||||
/**
|
||||
* @brief The value to be written to DMA channel
|
||||
* control 1 register
|
||||
*/
|
||||
uint32_t cntl1;
|
||||
/**
|
||||
* @brief The value to be written to DMA channel
|
||||
* boundary pad register
|
||||
*/
|
||||
uint32_t boundary_pad;
|
||||
/**
|
||||
* @brief This value to be written to DMA HW sequence
|
||||
* control register.
|
||||
*/
|
||||
uint32_t hwseqcntl;
|
||||
/**
|
||||
* @brief This field is unused in t19x and T23x.
|
||||
* It contains the value to be written to DMA
|
||||
* channel HWSEQFSCNTL register.
|
||||
*/
|
||||
uint32_t hwseqfscntl;
|
||||
/**
|
||||
* @brief Output enable mask
|
||||
*/
|
||||
uint32_t outputEnableMask;
|
||||
/**
|
||||
* @brief Padding of 4 bytes to align the whole structure
* to a 32-byte boundary
|
||||
*/
|
||||
uint32_t pad_dma_channel0[1];
|
||||
} pva_dma_ch_config_t;
|
||||
|
||||
/**
|
||||
*
|
||||
* @brief DMA info for an application. The app may be a VPU app which
* runs an algorithm on the VPU, or a DMA app which only has a DMA configuration
* to move certain data. In both cases the application should
* configure the DMA information in this structure format.
|
||||
*
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/**
|
||||
* @brief The size of the dma_info structure.
|
||||
* Should be populated with the value sizeof(pva_dma_info_t).
* This is used to validate that the DRAM location populated
* by KMD is valid.
|
||||
*/
|
||||
uint16_t dma_info_size;
|
||||
/**
|
||||
* @brief This field holds the DMA info version.
* If a new DMA info structure layout is introduced in the future,
* the FW can use this field to distinguish between layouts.
* Currently it should be populated with the value
* @ref PVA_DMA_INFO_VERSION_ID
|
||||
*/
|
||||
uint16_t dma_info_version;
|
||||
|
||||
/**
|
||||
* @brief The number of used channels. This field can
|
||||
* be populated with values from 0 to
|
||||
* @ref PVA_NUM_DMA_CHANNELS both inclusive.
|
||||
*/
|
||||
uint8_t num_channels;
|
||||
/**
|
||||
* @brief Number of used descriptors.
|
||||
*
|
||||
* Note: In generations of PVA where the reserved descriptor range lies
|
||||
* in the middle of the entire descriptor range, when the range of
|
||||
* descriptors requested by the user crosses over the reserved descriptor
|
||||
* range, 'num_descriptors' will include the number of the reserved
|
||||
* descriptors as well.
|
||||
* E.g., if reserved descriptors are at indices 60-63 and user application
|
||||
* needs 70 descriptors, 'num_descriptors' will equal 74. However,
|
||||
* if user application needs 30 descriptors, 'num_descriptors' will be 30.
|
||||
*
|
||||
* On T19x and T23x, the field can be populated
|
||||
* with values from 0 inclusive to less than
|
||||
* @ref PVA_SYS_DMA_MAX_DESCRIPTORS
|
||||
*
|
||||
* On T26x, the field can be populated with values from 0 inclusive to
|
||||
* @ref PVA_SYS_DMA_MAX_DESCRIPTORS + @ref PVA_NUM_RESERVED_DESCRIPTORS
|
||||
*/
|
||||
uint8_t num_descriptors;
|
||||
/**
|
||||
* @brief The number of bytes used in HW sequencer
|
||||
*/
|
||||
uint16_t num_hwseq;
|
||||
|
||||
/**
|
||||
* @brief The First HW descriptor ID used.
|
||||
*
|
||||
* On T19x and T23x, the field can be populated
|
||||
* with values from 0 inclusive to less than
|
||||
* @ref PVA_SYS_DMA_MAX_DESCRIPTORS
|
||||
*
|
||||
* On T26x, the field can be populated with values from 0 inclusive to
|
||||
* @ref PVA_SYS_DMA_MAX_DESCRIPTORS + @ref PVA_NUM_RESERVED_DESCRIPTORS
|
||||
*/
|
||||
uint8_t descriptor_id;
|
||||
/**
|
||||
* @brief Padding for alignment of next element
|
||||
*/
|
||||
uint8_t pva_dma_info_pad_0[3];
|
||||
|
||||
/**
|
||||
* @brief DMA done triggers used by the VPU app.
|
||||
* Correspond to COMMON_DMA_OUTPUT_ENABLE registers.
|
||||
*/
|
||||
uint32_t dma_triggers[PVA_SYS_DMA_NUM_TRIGGERS];
|
||||
/**
|
||||
* @brief DMA channel config used by the VPU app.
|
||||
* One app can have up to @ref PVA_NUM_DMA_CHANNELS
* DMA channel configurations. The size of the array
* is @ref PVA_SYS_DMA_NUM_CHANNELS to leave room for additional
* configurations required by future products.
|
||||
*/
|
||||
pva_dma_ch_config_t dma_channels[PVA_SYS_DMA_NUM_CHANNELS];
|
||||
/**
|
||||
* @brief Value to be set in DMA common configuration register.
|
||||
*/
|
||||
uint32_t dma_common_config;
|
||||
/**
|
||||
* @brief IOVA to an array of @ref pva_dtd_t, aligned at 64 bytes
|
||||
* which holds the DMA descriptors used by the VPU app
|
||||
*/
|
||||
pva_iova dma_descriptor_base;
|
||||
/**
|
||||
* @brief HW sequencer configuration base address.
|
||||
*/
|
||||
pva_iova dma_hwseq_base;
|
||||
/**
|
||||
* @brief IOVA to a structure of @ref pva_dma_misr_config_t,
|
||||
* location where DMA MISR configuration information is stored.
|
||||
*/
|
||||
pva_iova dma_misr_base;
|
||||
} pva_dma_info_t;
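/*
 * Illustrative sketch (not part of the original header): minimal
 * initialization of the size/version fields that FW uses to validate the
 * structure. All other fields are application-specific and left untouched
 * here; the helper name is hypothetical.
 */
static inline void pva_example_init_dma_info(pva_dma_info_t *info)
{
	info->dma_info_size = (uint16_t)sizeof(pva_dma_info_t);
	info->dma_info_version = PVA_DMA_INFO_VERSION_ID;
}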
|
||||
|
||||
/**
|
||||
* @brief DMA descriptor.
|
||||
*
|
||||
* PVA DMA Descriptor in packed HW format.
|
||||
* The individual fields can be found from
|
||||
* the DMA IAS document (Document 6 in Supporting Documentation and References)
|
||||
* section 4.1.3.2
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/** @brief TRANSFER_CONTROL0 byte has DSTM in lower 2 bits, SRC_TF in 3rd bit,
|
||||
* DDTM in 4th to 6th bit,DST_TF in 7th bit */
|
||||
uint8_t transfer_control0;
|
||||
/** @brief Next descriptor ID to be executed*/
|
||||
uint8_t link_did;
|
||||
/** @brief Highest 8 bits of the 40 bit source address*/
|
||||
uint8_t src_adr1;
|
||||
/** @brief Highest 8 bits of the 40 bit destination address*/
|
||||
uint8_t dst_adr1;
|
||||
/** @brief Lower 32 bits of the 40 bit source address*/
|
||||
uint32_t src_adr0;
|
||||
/** @brief Lower 32 bits of the 40 bit destination address*/
|
||||
uint32_t dst_adr0;
|
||||
/** @brief Length of tile line*/
|
||||
uint16_t tx;
|
||||
/** @brief Number of tile lines*/
|
||||
uint16_t ty;
|
||||
/** @brief Source Line pitch to advance to every line of 2D tile.*/
|
||||
uint16_t slp_adv;
|
||||
/** @brief Destination Line Pitch to advance to every line of 2D tile.*/
|
||||
uint16_t dlp_adv;
|
||||
/** @brief SRC PT1 CNTL has st1_adv in low 24 bits and ns1_adv in high 8 bits. */
|
||||
uint32_t srcpt1_cntl;
|
||||
/** @brief DST PT1 CNTL has dt1_adv in low 24 bits and nd1_adv in high 8 bits. */
|
||||
uint32_t dstpt1_cntl;
|
||||
/** @brief SRC PT2 CNTL has st2_adv in low 24 bits and ns2_adv in high 8 bits. */
|
||||
uint32_t srcpt2_cntl;
|
||||
/** @brief DST PT2 CNTL has dt2_adv in low 24 bits and nd2_adv in high 8 bits. */
|
||||
uint32_t dstpt2_cntl;
|
||||
/** @brief SRC PT3 CNTL has st3_adv in low 24 bits and ns3_adv in high 8 bits. */
|
||||
uint32_t srcpt3_cntl;
|
||||
/** @brief DST PT3 CNTL has dt3_adv in low 24 bits and nd3_adv in high 8 bits. */
|
||||
uint32_t dstpt3_cntl;
|
||||
/** @brief Source circular buffer Start address offset */
|
||||
uint16_t sb_start;
|
||||
/** @brief Destination circular buffer Start address offset*/
|
||||
uint16_t db_start;
|
||||
/** @brief Source buffer size in bytes for circular buffer mode from Source address.*/
|
||||
uint16_t sb_size;
|
||||
/** @brief Destination buffer size in bytes for circular buffer mode from destination address.*/
|
||||
uint16_t db_size;
|
||||
/** @brief currently reserved*/
|
||||
uint16_t trig_ch_events;
|
||||
/** @brief SW or HW events used for triggering the channel*/
|
||||
uint16_t hw_sw_trig_events;
|
||||
/** @brief Tile x coordinates, for boundary padding in pixels*/
|
||||
uint8_t px;
|
||||
/** @brief Tile y coordinates, for boundary padding in pixels*/
|
||||
uint8_t py;
|
||||
/** @brief Transfer control 1 byte has the lower 2 bits as BPP data, bit 2 as PXDIR, bit 3 as PYDIR,
* bit 4 as BPE, bit 5 as TTS, bit 6 as RSVD, bit 7 as ITC.
*/
uint8_t transfer_control1;
/** @brief Transfer control 2 has bit 0 as PREFEN, bit 1 as DCBM, bit 2 as SCBM, bit 3 as SBADR.*/
|
||||
uint8_t transfer_control2;
|
||||
/** @brief Circular buffer upper bits for start address and size*/
|
||||
uint8_t cb_ext;
|
||||
/** @brief Reserved*/
|
||||
uint8_t rsvd;
|
||||
/** @brief Full replicated destination base address in VMEM aligned to 64 byte atom*/
|
||||
uint16_t frda;
|
||||
} pva_dtd_t;
|
||||
|
||||
/**
|
||||
*
|
||||
* @brief DMA MISR configuration information. This information is used by R5
|
||||
* to program MISR registers if a task requests MISR computation on its
|
||||
* output DMA channels.
|
||||
*
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/** @brief Reference value for CRC computed on write addresses, i.e., MISR 1 */
|
||||
uint32_t ref_addr;
|
||||
/** @brief Seed value for address CRC*/
|
||||
uint32_t seed_crc0;
|
||||
/** @brief Reference value for CRC computed on first 256-bits of AXI write data */
|
||||
uint32_t ref_data_1;
|
||||
/** @brief Seed value for write data CRC*/
|
||||
uint32_t seed_crc1;
|
||||
/** @brief Reference value for CRC computed on second 256-bits of AXI write data */
|
||||
uint32_t ref_data_2;
|
||||
/**
|
||||
* @brief MISR timeout value configured in DMA common register
|
||||
* @ref PVA_DMA_COMMON_MISR_ENABLE. Timeout is calculated as
|
||||
* number of AXI clock cycles.
|
||||
*/
|
||||
uint32_t misr_timeout;
|
||||
} pva_dma_misr_config_t;
|
||||
|
||||
/**
|
||||
* @defgroup PVA_DMA_TC0_BITS
|
||||
*
|
||||
* @brief PVA Transfer Control 0 Bitfields
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief The shift value for extracting DSTM field
|
||||
*/
|
||||
#define PVA_DMA_TC0_DSTM_SHIFT (0U)
|
||||
/**
|
||||
* @brief The mask to be used to extract DSTM field
|
||||
*/
|
||||
#define PVA_DMA_TC0_DSTM_MASK (7U)
|
||||
|
||||
/**
|
||||
* @brief The shift value for extracting DDTM field
|
||||
*/
|
||||
#define PVA_DMA_TC0_DDTM_SHIFT (4U)
|
||||
/**
|
||||
* @brief The mask to be used to extract DDTM field
|
||||
*/
|
||||
#define PVA_DMA_TC0_DDTM_MASK (7U)
|
||||
/** @} */
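/*
 * Illustrative sketch (not part of the original header): extracting the DSTM
 * and DDTM fields from a descriptor's transfer_control0 byte with the
 * shift/mask macros above. The helper names are hypothetical.
 */
static inline uint8_t pva_example_get_dstm(const pva_dtd_t *desc)
{
	return (uint8_t)((desc->transfer_control0 >> PVA_DMA_TC0_DSTM_SHIFT) &
			 PVA_DMA_TC0_DSTM_MASK);
}

static inline uint8_t pva_example_get_ddtm(const pva_dtd_t *desc)
{
	return (uint8_t)((desc->transfer_control0 >> PVA_DMA_TC0_DDTM_SHIFT) &
			 PVA_DMA_TC0_DDTM_MASK);
}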
|
||||
|
||||
/**
|
||||
* @defgroup PVA_DMA_TM
|
||||
*
|
||||
* @brief DMA Transfer Modes. These can be used for both
|
||||
* Source (DSTM) and Destination (DDTM) transfer modes
|
||||
*
|
||||
* @note : For more information on transfer modes, refer section 4.1.3.1 in
|
||||
* the DMA IAS document (Document 6 in Supporting Documentation and References)
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief To indicate invalid transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_INVALID (0U)
|
||||
/**
|
||||
* @brief To indicate MC transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_MC (1U)
|
||||
/**
|
||||
* @brief To indicate VMEM transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_VMEM (2U)
|
||||
#if ENABLE_UNUSED == 1U
|
||||
#define PVA_DMA_TM_CVNAS (3U)
|
||||
#endif
|
||||
/**
|
||||
* @brief To indicate L2SRAM transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_L2RAM (3U)
|
||||
/**
|
||||
* @brief To indicate TCM transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_TCM (4U)
|
||||
/**
|
||||
* @brief To indicate MMIO transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_MMIO (5U)
|
||||
/**
|
||||
* @brief To indicate Reserved transfer mode
|
||||
*/
|
||||
#define PVA_DMA_TM_RSVD (6U)
|
||||
/**
|
||||
* @brief To indicate VPU configuration transfer mode.
|
||||
* This is only available in the source transfer mode
* (DSTM). In the destination transfer mode, this value is
* reserved.
|
||||
*/
|
||||
#define PVA_DMA_TM_VPU (7U)
|
||||
/** @} */
|
||||
|
||||
#if (ENABLE_UNUSED == 1U)
|
||||
/**
|
||||
* @brief The macro defines the number of
|
||||
* bits to shift right to get the PXDIR field
|
||||
* in Transfer Control 1 register in DMA
|
||||
* Descriptor
|
||||
*/
|
||||
#define PVA_DMA_TC1_PXDIR_SHIFT (2U)
|
||||
|
||||
/**
|
||||
* @brief The macro defines the number of
|
||||
* bits to shift right to get the PYDIR field
|
||||
* in Transfer Control 1 register in DMA
|
||||
* Descriptor
|
||||
*/
|
||||
#define PVA_DMA_TC1_PYDIR_SHIFT (3U)
|
||||
#endif
|
||||
/**
|
||||
* @defgroup PVA_DMA_BPP
|
||||
*
|
||||
* @brief PVA DMA Bits per Pixel
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief To indicate that the size of pixel data
|
||||
* is 1 byte
|
||||
*/
|
||||
#define PVA_DMA_BPP_INT8 (0U)
|
||||
#if ENABLE_UNUSED == 1U
|
||||
#define PVA_DMA_BPP_INT16 (1U)
|
||||
#endif
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @brief PVA DMA Pad X direction set to right
|
||||
*/
|
||||
#define PVA_DMA_PXDIR_RIGHT (1U)
|
||||
|
||||
/**
|
||||
* @brief PVA DMA Pad Y direction set to bottom
|
||||
*/
|
||||
#define PVA_DMA_PYDIR_BOT (1U)
|
||||
|
||||
#endif /* PVA_SYS_DMA_H */
|
||||
@@ -0,0 +1,150 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2020-2023 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property
|
||||
* and proprietary rights in and to this software, related documentation
|
||||
* and any modifications thereto. Any use, reproduction, disclosure or
|
||||
* distribution of this software and related documentation without an express
|
||||
* license agreement from NVIDIA Corporation is strictly prohibited.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unit: Task Unit
|
||||
* SWUD Document:
|
||||
* p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
|
||||
*/
|
||||
/**
|
||||
* @file pva-sys-params.h
|
||||
*
|
||||
* @brief Types and constants related to VPU application parameters.
|
||||
*/
|
||||
|
||||
#ifndef PVA_SYS_PARAMS_H
|
||||
#define PVA_SYS_PARAMS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <pva-packed.h>
|
||||
#include <pva-types.h>
|
||||
|
||||
/** @brief VPU app parameters provided by the kernel-user, which are to be copied to
* VMEM during runtime.
*
* The VPU app parameters contain kernel-user-provided data to be
* copied into VMEM before executing the VPU app. The parameter
* data is located at the IOVA address held in the param_base
* member of this structure.
|
||||
*
|
||||
* The FW can also initialize complex datatypes, which are marked by
|
||||
* special param_base outside the normal IOVA space. See the structure
|
||||
* pva_vpu_instance_data_t for an example.
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/** @brief IOVA address of the parameter data */
|
||||
pva_iova param_base;
|
||||
/** @brief VMEM offset where parameter data is to be copied */
|
||||
uint32_t addr;
|
||||
/** @brief Size of the parameter data in bytes */
|
||||
uint32_t size;
|
||||
} pva_vpu_parameter_list_t;
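/*
 * Illustrative sketch (not part of the original header): describing one
 * parameter copy of 'size' bytes from the IOVA 'src' to the VMEM offset
 * 'vmem_addr'. The helper name and argument names are placeholders.
 */
static inline pva_vpu_parameter_list_t
pva_example_param_entry(pva_iova src, uint32_t vmem_addr, uint32_t size)
{
	pva_vpu_parameter_list_t entry = {
		.param_base = src,
		.addr = vmem_addr,
		.size = size,
	};

	return entry;
}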
|
||||
|
||||
/**
|
||||
* @brief This structure holds information about the various
* VMEM parameters that are submitted with the task.
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/**
|
||||
* @brief The IOVA address of the parameter data.
|
||||
* This should point to an array of type @ref pva_vpu_parameter_list_t .
|
||||
* If no parameters are present this should be set to 0
|
||||
*/
|
||||
pva_iova parameter_data_iova;
|
||||
|
||||
/**
|
||||
* @brief The starting IOVA address of the parameter data whose size
|
||||
* is lower than @ref PVA_DMA_VMEM_COPY_THRESHOLD . This data is copied
|
||||
* from DRAM to TCM using DMA, and then memcopied to VMEM.
|
||||
* If no small parameters are present this should be set to 0.
|
||||
*/
|
||||
pva_iova small_vpu_param_data_iova;
|
||||
|
||||
/**
|
||||
* @brief The number of bytes of small VPU parameter data, i.e., the
|
||||
* data whose size is lower than @ref PVA_DMA_VMEM_COPY_THRESHOLD . If no small
|
||||
* parameters are present, this should be set to 0
|
||||
*/
|
||||
uint32_t small_vpu_parameter_data_size;
|
||||
|
||||
/**
|
||||
* @brief The index in the array of type @ref pva_vpu_parameter_list_t at which
* the large VPU parameters start, i.e., the VPU parameters whose size is greater
* than @ref PVA_DMA_VMEM_COPY_THRESHOLD . This value will always point to the index
* immediately after the small parameters. If no large parameter is present, then
* this field will have the same value as the
* @ref pva_vpu_parameter_info_t.vpu_instance_parameter_list_start_index field
|
||||
*/
|
||||
uint32_t large_vpu_parameter_list_start_index;
|
||||
|
||||
/**
|
||||
* @brief The index in the array of type @ref pva_vpu_parameter_list_t at which
* the VPU instance parameters start. This value will always point to the index
|
||||
* immediately after the large parameters if large parameters are present, else it
|
||||
* will be the same value as @ref pva_vpu_parameter_info_t.large_vpu_parameter_list_start_index
|
||||
* field.
|
||||
*/
|
||||
uint32_t vpu_instance_parameter_list_start_index;
|
||||
} pva_vpu_parameter_info_t;
|
||||
|
||||
/** @brief Special marker for the IOVA address of a task's parameter data, used to decide
* whether the parameter data specified in the task should be used or whether FW should create a supported
* parameter data instance. If the IOVA address of the parameter data is less than this
* special marker, the parameter data specified in the task is used; otherwise FW
* creates the parameter data.
|
||||
*/
|
||||
#define PVA_COMPLEX_IOVA (0xDA7AULL << 48ULL)
|
||||
|
||||
/** @brief Macro used to create new parameter base markers
|
||||
* from the special marker address @ref PVA_COMPLEX_IOVA
|
||||
*/
|
||||
#define PVA_COMPLEX_IOVA_V(v) (PVA_COMPLEX_IOVA | ((uint64_t)(v) << 32ULL))
|
||||
|
||||
/** @brief Special Marker for @ref pva_vpu_instance_data_t */
|
||||
#define PVA_SYS_INSTANCE_DATA_V1_IOVA (PVA_COMPLEX_IOVA_V(1) | 0x00000001ULL)
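/*
 * Illustrative sketch (not part of the original header): distinguishing a
 * normal parameter IOVA from a complex-parameter marker, following the rule
 * described above for @ref PVA_COMPLEX_IOVA. The helper name is hypothetical.
 */
static inline int pva_example_is_complex_param(pva_iova param_base)
{
	return (param_base >= PVA_COMPLEX_IOVA) ? 1 : 0;
}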
|
||||
|
||||
/**
|
||||
* @brief The minimum size of a VPU parameter for it to be considered
* a large parameter.
|
||||
*/
|
||||
#define PVA_DMA_VMEM_COPY_THRESHOLD (uint32_t)(256U)
|
||||
|
||||
/**
|
||||
* @brief The maximum combined size of all VMEM parameters
|
||||
* that will be supported by PVA
|
||||
*/
|
||||
#define VMEM_PARAMETER_BUFFER_MAX_SIZE (uint32_t)(8192U)
|
||||
|
||||
/**
|
||||
* @brief The maximum number of symbols that will be supported
|
||||
* for one task
|
||||
*/
|
||||
#define TASK_VMEM_PARAMETER_MAX_SYMBOLS (uint32_t)(128U)
|
||||
|
||||
/**
|
||||
* @brief Information of the VPU instance data passed to VPU kernel.
|
||||
*/
|
||||
typedef struct PVA_PACKED {
|
||||
/** @brief ID of the VPU assigned to the task */
|
||||
uint16_t pve_id;
|
||||
/** @brief Flag indicating whether the PPE task was launched */
|
||||
uint16_t ppe_task_launched;
|
||||
/** @brief Base of the VMEM memory */
|
||||
uint32_t vmem_base;
|
||||
/** @brief Base of the DMA descriptor SRAM memory */
|
||||
uint32_t dma_descriptor_base;
|
||||
/** @brief Base of L2SRAM allocated for the task executed */
|
||||
uint32_t l2ram_base;
|
||||
/** @brief Size of L2SRAM allocated for the task executed */
|
||||
uint32_t l2ram_size;
|
||||
} pva_vpu_instance_data_t;
|
||||
|
||||
#endif /* PVA_SYS_PARAMS_H */
|
||||
@@ -0,0 +1,44 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property
|
||||
* and proprietary rights in and to this software, related documentation
|
||||
* and any modifications thereto. Any use, reproduction, disclosure or
|
||||
* distribution of this software and related documentation without an express
|
||||
* license agreement from NVIDIA Corporation is strictly prohibited.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unit: Utility Unit
|
||||
* SWUD Document:
|
||||
* p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
|
||||
*/
|
||||
#ifndef PVA_TYPES_H
|
||||
#define PVA_TYPES_H
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @brief Used to represent address (IOVA) in PVA system.
|
||||
*/
|
||||
typedef uint64_t pva_iova;
|
||||
|
||||
/**
|
||||
* @brief Used to store queue IDs, which represent the
* actual hardware queue ID shared between FW and KMD.
|
||||
*/
|
||||
typedef uint8_t pva_queue_id_t;
|
||||
|
||||
/**
|
||||
* @brief Used to store the PVE ID, which indicates which
* PVE is being referred to.
|
||||
*/
|
||||
typedef uint8_t pva_pve_id_t;
|
||||
|
||||
/**
|
||||
* @brief Used to store the status interface ID, which identifies
* the interface through which status needs to be written.
|
||||
*/
|
||||
typedef uint8_t pva_status_interface_id_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,69 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2016-2021 NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
* and proprietary rights in and to this software, related documentation
|
||||
* and any modifications thereto. Any use, reproduction, disclosure or
|
||||
* distribution of this software and related documentation without an express
|
||||
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unit: Host Interface Unit
|
||||
* SWUD Document:
|
||||
* p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
|
||||
*/
|
||||
#ifndef PVA_VERSION_H
|
||||
#define PVA_VERSION_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <pva-bit.h>
|
||||
#include <pva-fw-version.h>
|
||||
|
||||
/**
|
||||
* @brief Calculate a 32-bit build version with @ref PVA_VERSION_SUBMINOR,
|
||||
* @ref PVA_VERSION_MINOR, @ref PVA_VERSION_MAJOR and @ref VERSION_TYPE macros.
|
||||
*
|
||||
* @param [in] \_type\_ an 8-bit bitfield containing flags indicating which compilation
|
||||
* features were enabled when the firmware was compiled.
|
||||
*
|
||||
* @param [in] \_major\_ an unsigned, 8-bit value containing the major version of the
|
||||
* compiled firmware.
|
||||
*
|
||||
* @param [in] \_minor\_ an unsigned, 8-bit value containing the minor version of the
|
||||
* compiled firmware.
|
||||
*
|
||||
* @param [in] \_subminor\_ an unsigned, 8-bit value containing the sub-minor version
|
||||
* of the compiled firmware.
|
||||
@verbatim
|
||||
| ------------- | ---------------------|
|
||||
| Bit Ranges | Function |
|
||||
| ------------- | ---------------------|
|
||||
| 7-0 | subminor version |
|
||||
| 15-8 | minor version |
|
||||
| 23-16 | major version |
|
||||
| 31-24 | version type |
|
||||
----------------------------------------
|
||||
@endverbatim
|
||||
*/
|
||||
#define PVA_MAKE_VERSION(_type_, _major_, _minor_, _subminor_) \
|
||||
(PVA_INSERT(_type_, 31, 24) | PVA_INSERT(_major_, 23, 16) | \
|
||||
PVA_INSERT(_minor_, 15, 8) | PVA_INSERT(_subminor_, 7, 0))
|
||||
|
||||
/**
|
||||
* @brief Calculate PVA R5 FW binary version by calling @ref PVA_MAKE_VERSION macro.
|
||||
*
|
||||
* @param [in] \_type\_ an 8-bit bitfield containing flags indicating which compilation
|
||||
* features were enabled when the firmware was compiled.
|
||||
*
|
||||
* @see VERSION_TYPE For details on how to construct the @p \_type\_ field.
|
||||
*
|
||||
* @see PVA_VERSION_MAJOR, PVA_VERSION_MINOR, PVA_VERSION_SUBMINOR for details
|
||||
* on the values used at the time this documentation was produced.
|
||||
*/
|
||||
#define PVA_VERSION(_type_) \
|
||||
PVA_MAKE_VERSION(_type_, PVA_VERSION_MAJOR, PVA_VERSION_MINOR, \
|
||||
PVA_VERSION_SUBMINOR)
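/*
 * Illustrative sketch (not part of the original header): recovering the major
 * version field from a packed version word, following the bit layout in the
 * table above. The helper name is hypothetical.
 */
static inline uint8_t pva_example_version_major(uint32_t version)
{
	return (uint8_t)((version >> 16) & 0xFFU);
}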
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,309 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2022-2023 NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
* and proprietary rights in and to this software, related documentation
|
||||
* and any modifications thereto. Any use, reproduction, disclosure or
|
||||
* distribution of this software and related documentation without an express
|
||||
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unit: VPU Unit
|
||||
* SWUD Document:
|
||||
* p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
|
||||
*/
|
||||
/**
|
||||
* @file pva-vpu-syscall-interface.h
|
||||
*
|
||||
* @brief Syscall command specification
|
||||
*
|
||||
* VPU uses syscall commands to request services from R5. A syscall command is a
|
||||
* 32-bit value that consists of an 8-bit syscall ID and a 24-bit parameter. If more
|
||||
* information needs to be passed to R5, the parameter field will be a pointer
|
||||
* to a VMEM location.
|
||||
*/
|
||||
|
||||
#ifndef PVA_VPU_SYSCALL_INTERFACE_H
|
||||
#define PVA_VPU_SYSCALL_INTERFACE_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @defgroup PVA_VPU_SYSCALL
|
||||
*
|
||||
* @brief PVA VPU SYS call IDs for each type of
|
||||
* SYS call.
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! @cond DISABLE_DOCUMENTATION
|
||||
|
||||
/**
|
||||
* @brief VPU Syscall id for vpu printf write.
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_WRITE (1U)
|
||||
//! @endcond
|
||||
/**
|
||||
* @brief VPU Syscall id for Icache prefetch.
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_ICACHE_PREFETCH (2U)
|
||||
|
||||
/**
|
||||
* @brief VPU Syscall id for masking exceptions.
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_MASK_EXCEPTION (3U)
|
||||
|
||||
/**
|
||||
* @brief VPU Syscall id for unmasking exceptions.
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_UNMASK_EXCEPTION (4U)
|
||||
//! @cond DISABLE_DOCUMENTATION
|
||||
/**
|
||||
* @brief VPU Syscall id for sampling VPU performance counters
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_PERFMON_SAMPLE (5U)
|
||||
//! @endcond
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @defgroup PVA_VPU_SYSCALL_WRITE_PARAM_GROUP
|
||||
*
|
||||
* @brief Parameter specification for syscall write
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup PVA_VPU_SYSCALL_COMMAND_FIELDS_GROUP
|
||||
*
|
||||
* @brief The command format to be used while issuing vpu syscall command from VPU kernel to R5.
|
||||
* The fields mentioned in this group are used for submitting the command
|
||||
* through the Signal_R5 interface from VPU kernel.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief The most significant bit of the vpu syscall ID field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_MSB (31U)
|
||||
|
||||
/**
|
||||
* @brief The least significant bit of the vpu syscall ID field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_ID_LSB (24U)
|
||||
|
||||
/**
|
||||
* @brief The most significant bit of the vpu syscall parameter field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_PARAM_MSB (23U)
|
||||
|
||||
/**
|
||||
* @brief The least significant bit of the vpu syscall parameter field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_PARAM_LSB (0U)
|
||||
/** @} */
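/*
 * Illustrative sketch (not part of the original header): packing an 8-bit
 * syscall ID and a 24-bit parameter into the 32-bit command word described
 * above. The helper name is hypothetical.
 */
static inline uint32_t pva_example_make_syscall_cmd(uint32_t id, uint32_t param)
{
	return ((id & 0xFFU) << PVA_FW_PE_SYSCALL_ID_LSB) |
	       (param & 0x00FFFFFFU);
}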
|
||||
|
||||
/**
|
||||
* @defgroup PVA_VPU_SYSCALL_ICACHE_PREFETCH_PARAM_FIELDS_GROUP
|
||||
*
|
||||
* @brief The parameter format to be used while issuing vpu syscall command from VPU kernel to R5 for syscall icache prefetch.
|
||||
* The fields mentioned in this group are used for submitting the icache prefetch command
|
||||
* through the Signal_R5 interface from VPU kernel.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief The most significant bit of the prefetch cache line count field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_PREFETCH_CACHE_LINE_COUNT_MSB (23U)
|
||||
|
||||
/**
|
||||
* @brief The least significant bit of the prefetch cache line count field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_PREFETCH_CACHE_LINE_COUNT_LSB (16U)
|
||||
|
||||
/**
|
||||
* @brief The most significant bit of the prefetch address field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_PREFETCH_ADDR_MSB (15U)
|
||||
|
||||
/**
|
||||
* @brief The least significant bit of the prefetch address field in
|
||||
* the vpu syscall command interface
|
||||
*/
|
||||
#define PVA_FW_PE_SYSCALL_PREFETCH_ADDR_LSB (0U)
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @defgroup PVA_VPU_SYSCALL_MASK_UNMASK_PARAM_FIELDS_GROUP
|
||||
*
|
||||
* @brief The parameter format to be used while issuing vpu syscall command from VPU kernel
|
||||
* to R5 for masking or unmasking FP NaN Exception.
|
||||
* The fields mentioned in this group are used for submitting the mask and unmask FP NaN exception command
|
||||
* through the Signal_R5 interface from VPU kernel.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Parameter specification for syscall mask/unmask exceptions
|
||||
*/
|
||||
#define PVA_FW_PE_MASK_FP_INV_NAN (1U << 2U)
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @brief Write syscall parameter will be a pointer to this struct
|
||||
* @{
|
||||
*/
|
||||
typedef union {
|
||||
struct {
|
||||
uint32_t addr;
|
||||
uint32_t size;
|
||||
} in;
|
||||
struct {
|
||||
uint32_t written_size;
|
||||
} out;
|
||||
} pva_fw_pe_syscall_write;
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @defgroup PVA_VPU_SYSCALL_PERFMON_SAMPLE_PARAM_GROUP
|
||||
*
|
||||
* @brief Parameter specification for syscall perfmon_sample
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Perfmon sample syscall parameter will be a pointer to this struct
|
||||
*/
|
||||
typedef struct {
|
||||
/** counter_mask[0] is for ID: 0-31; counter_mask[1] is for ID: 32-63 */
|
||||
uint32_t counter_mask[2];
|
||||
uint32_t output_addr;
|
||||
} pva_fw_pe_syscall_perfmon_sample;
|
||||
|
||||
/**
|
||||
* @brief Index for t26x performance counters for VPU
|
||||
*/
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T26X (1U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T26X (2U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T26X (3U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T26X (4U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T26X (5U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T26X (6U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T26X (7U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T26X (8U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T26X (9U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T26X (10U)
|
||||
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T26X (11U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T26X (12U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T26X (13U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T26X (14U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T26X (15U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T26X (16U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T26X (17U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T26X (18U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T26X (19U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ICACHE_FETCH_REQ_T26X (20U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_T26X (21U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREEMPT_T26X (22U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_LINES_T26X (23U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ICACHE_MISS_DUR_T26X (24U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ICACHE_PREFETCH_DUR_T26X (25U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_BUSY_T26X (26U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T26X (27U)
|
||||
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T26X (28U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T26X (29U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T26X (30U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T26X (31U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T26X (32U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T26X (33U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T26X (34U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T26X (35U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T26X (36U)
|
||||
|
||||
/**
|
||||
* @brief Index for t23x performance counters
|
||||
*/
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_ID_NO_VAL_INSTR_T23X (0U)
|
||||
#define PERFMON_COUNTER_ID_VPS_ID_VALID_T23X (1U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_ID_REG_DEPEND_T23X (2U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_ID_ONLY_T23X (3U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX1_ONLY_T23X (4U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RSC_HZRD_T23X (5U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_DATA_HZRD_T23X (6U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX4_RAMIC_HI_PRI_T23X (7U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX5_APB_T23X (8U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RSC_HZRD_T23X (9U)
|
||||
#define PERFMON_COUNTER_ID_VPS_STALL_EX8_RAMIC_HI_PRI_T23X (10U)
|
||||
#define PERFMON_COUNTER_ID_VPS_WFE_GPI_EX_STATE_T23X (11U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L01_T23X (12U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L01_T23X (13U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_L23_T23X (14U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_RD_REQ_ACT_L23_T23X (15U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L01_T23X (16U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L01_T23X (17U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_L23_T23X (18U)
|
||||
#define PERFMON_COUNTER_ID_VMEMIF_RAMIC_WR_REQ_ACT_L23_T23X (19U)
|
||||
#define PERFMON_COUNTER_ID_ICACHE_FETCH_REQ_T23X (20U)
|
||||
#define PERFMON_COUNTER_ID_ICACHE_MISS_T23X (21U)
|
||||
#define PERFMON_COUNTER_ID_ICACHE_PREEMP_T23X (22U)
|
||||
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_LINES_T23X (23U)
|
||||
#define PERFMON_COUNTER_ID_ICACHE_MISS_DUR_T23X (24U)
|
||||
#define PERFMON_COUNTER_ID_ICACHE_PREFETCH_DUR_T23X (25U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_BUSY_T23X (26U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_VPU_BOTH_BUSY_T23X (27U)
|
||||
#define PERFMON_COUNTER_ID_VPU_WAIT_FOR_DLUT_T23X (28U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_WAIT_FOR_VPU_T23X (29U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_IDX_TRANS_T23X (30U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_LUT_TRANS_T23X (31U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_OUT_TRANS_T23X (32U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_IDX_REQ_ACT_T23X (33U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_LUT_REQ_ACT_T23X (34U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_OUT_REQ_ACT_T23X (35U)
|
||||
#define PERFMON_COUNTER_ID_DLUT_NULL_GROUPS_T23X (36U)
|
||||
|
||||
/**
|
||||
* @brief Index for t26x performance counters for PPE
|
||||
*/
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_ID_NO_VAL_INSTR_T26X (0U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ID_VALID_T26X (1U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_ID_REG_DEPEND_T26X (2U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_ID_ONLY_T26X (3U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX1_ONLY_T26X (4U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_LD_DEPENDENCY_T26X (5U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_ST_DEPENDENCY_T26X (6U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_IORF_DEPENDENCY_T26X (7U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STRM_STORE_FLUSH_T26X (8U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_STORE_FLUSH_T26X (9U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STORE_FLUSH_T26X (10U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_LD_T26X (11U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_ST_T26X (12U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STREAM_START_T26X (13U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LD_T26X (14U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_ST_T26X (15U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_SCALAR_LDST_T26X (16U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_PUSHBACK_T26X (17U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_STQ_PUSHBACK_T26X (18U)
|
||||
#define PERFMON_COUNTER_ID_PPS_STALL_EX3_LDQ_FLUSH_T26X (19U)
|
||||
#define PERFMON_COUNTER_ID_PPS_WFE_GPI_EX_STATE_T26X (20U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ICACHE_FETCH_REQ_T26X (21U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_T26X (22U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREEMPT_T26X (23U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_LINES_T26X (24U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ICACHE_MISS_DUR_T26X (25U)
|
||||
#define PERFMON_COUNTER_ID_PPS_ICACHE_PREFETCH_DUR_T26X (26U)
|
||||
/** @} */
|
||||
|
||||
#endif /*PVA_VPU_SYSCALL_INTERFACE_H*/
|
||||
295
drivers/video/tegra/host/pva/src/fw/include/pva_fw.h
Normal file
295
drivers/video/tegra/host/pva/src/fw/include/pva_fw.h
Normal file
@@ -0,0 +1,295 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_FW_H
|
||||
#define PVA_FW_H
|
||||
#include "pva_api.h"
|
||||
#include "pva_bit.h"
|
||||
#include "pva_constants.h"
|
||||
#include "pva_fw_address_map.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
/* The sizes of these structs must be explicitly padded to align to 4 bytes */
|
||||
|
||||
struct pva_fw_prefence {
|
||||
uint8_t offset_hi;
|
||||
uint8_t pad0[3];
|
||||
uint32_t offset_lo;
|
||||
uint32_t resource_id;
|
||||
uint32_t value;
|
||||
};
|
||||
|
||||
struct pva_fw_postfence {
|
||||
uint8_t offset_hi;
|
||||
uint8_t ts_offset_hi;
|
||||
/** A privileged user queue may need to trigger a fence that exists in the user's own
* resource table. Set this flag to tell FW to use the user's resource table when
* writing this post fence. This also applies to the timestamp resource ID. */
|
||||
#define PVA_FW_POSTFENCE_FLAGS_USER_FENCE (1 << 0)
|
||||
uint8_t flags;
|
||||
uint8_t pad0;
|
||||
uint32_t offset_lo;
|
||||
uint32_t resource_id;
|
||||
uint32_t value;
|
||||
|
||||
/* Timestamp part */
|
||||
uint32_t ts_resource_id;
|
||||
uint32_t ts_offset_lo;
|
||||
};
|
||||
|
||||
struct pva_fw_memory_addr {
|
||||
uint8_t offset_hi;
|
||||
uint8_t pad0[3];
|
||||
uint32_t resource_id;
|
||||
uint32_t offset_lo;
|
||||
};
|
||||
|
||||
struct pva_fw_cmdbuf_submit_info {
|
||||
uint8_t num_prefence;
|
||||
uint8_t num_postfence;
|
||||
uint8_t num_input_status;
|
||||
uint8_t num_output_status;
|
||||
#define PVA_CMDBUF_FLAGS_ENGINE_AFFINITY_MSB (1)
|
||||
#define PVA_CMDBUF_FLAGS_ENGINE_AFFINITY_LSB (0)
|
||||
uint8_t flags;
|
||||
uint8_t first_chunk_offset_hi;
|
||||
/** First chunk size*/
|
||||
uint16_t first_chunk_size;
|
||||
struct pva_fw_prefence prefences[PVA_MAX_NUM_PREFENCES];
|
||||
struct pva_fw_memory_addr input_statuses[PVA_MAX_NUM_INPUT_STATUS];
|
||||
/** Resource ID of the first chunk */
|
||||
uint32_t first_chunk_resource_id;
|
||||
/** First chunk offset within the resource*/
|
||||
uint32_t first_chunk_offset_lo;
|
||||
/** Execution Timeout */
|
||||
uint32_t execution_timeout_ms;
|
||||
struct pva_fw_memory_addr output_statuses[PVA_MAX_NUM_OUTPUT_STATUS];
|
||||
struct pva_fw_postfence postfences[PVA_MAX_NUM_POSTFENCES];
|
||||
};
|
||||
|
||||
/* This is the header of the circular buffer */
|
||||
struct pva_fw_submit_queue_header {
|
||||
/**
|
||||
* Head index of the circular buffer. Updated by R5, read by CCPLEX
|
||||
* (UMD/KMD).
|
||||
*/
|
||||
volatile uint32_t cb_head;
|
||||
/**
|
||||
* Tail index of the circular buffer. Updated by CCPLEX.
|
||||
*
|
||||
* CCPLEX informs R5 of the tail index through CCQ. In case KMD needs to
* flush the queue, it may need to read the tail from here.
|
||||
*/
|
||||
volatile uint32_t cb_tail;
|
||||
/* Immediately followed by an array of struct pva_cmdbuf_submit_info */
|
||||
};
|
||||
|
||||
static inline uint32_t pva_fw_queue_count(uint32_t head, uint32_t tail,
|
||||
uint32_t size)
|
||||
{
|
||||
if (tail >= head) {
|
||||
return safe_subu32(tail, head);
|
||||
} else {
|
||||
return safe_addu32(safe_subu32(size, head), tail);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint32_t pva_fw_queue_space(uint32_t head, uint32_t tail,
|
||||
uint32_t size)
|
||||
{
|
||||
return safe_subu32(
|
||||
safe_subu32(size, pva_fw_queue_count(head, tail, size)), 1u);
|
||||
}
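/*
 * Illustrative sketch (not part of the original header): checking for free
 * space before CCPLEX enqueues one more command buffer entry. 'queue_size' is
 * the number of entries the circular buffer was created with; the helper name
 * is hypothetical.
 */
static inline int pva_example_queue_can_submit(
	const struct pva_fw_submit_queue_header *hdr, uint32_t queue_size)
{
	return pva_fw_queue_space(hdr->cb_head, hdr->cb_tail, queue_size) > 0u;
}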
|
||||
|
||||
/* CCQ commands: KMD -> R5, through CCQ FIFO */
|
||||
|
||||
/*
|
||||
* Most CCQ commands are meant to be used at init time.
|
||||
* During runtime, only use PVA_FW_CCQ_OP_UPDATE_TAIL
|
||||
*/
|
||||
#define PVA_FW_CCQ_OPCODE_MSB 63
|
||||
#define PVA_FW_CCQ_OPCODE_LSB 60
|
||||
|
||||
/*
|
||||
* tail value bit field: 31 - 0
|
||||
* queue id bit field: 40 - 32
|
||||
*/
|
||||
#define PVA_FW_CCQ_OP_UPDATE_TAIL 0
|
||||
#define PVA_FW_CCQ_TAIL_MSB 31
|
||||
#define PVA_FW_CCQ_TAIL_LSB 0
|
||||
#define PVA_FW_CCQ_QUEUE_ID_MSB 40
|
||||
#define PVA_FW_CCQ_QUEUE_ID_LSB 32
|
||||
|
||||
/*
|
||||
* resource table IOVA addr bit field: 39 - 0
|
||||
* resource table number of entries bit field: 59 - 40
|
||||
*/
|
||||
#define PVA_FW_CCQ_OP_SET_RESOURCE_TABLE 1
|
||||
#define PVA_FW_CCQ_RESOURCE_TABLE_ADDR_MSB 39
|
||||
#define PVA_FW_CCQ_RESOURCE_TABLE_ADDR_LSB 0
|
||||
#define PVA_FW_CCQ_RESOURCE_TABLE_N_ENTRIES_MSB 59
|
||||
#define PVA_FW_CCQ_RESOURCE_TABLE_N_ENTRIES_LSB 40
|
||||
|
||||
/*
|
||||
* submission queue IOVA addr bit field: 39 - 0
|
||||
* submission queue number of entries bit field: 59 - 40
|
||||
*/
|
||||
#define PVA_FW_CCQ_OP_SET_SUBMISSION_QUEUE 2
|
||||
#define PVA_FW_CCQ_QUEUE_ADDR_MSB 39
|
||||
#define PVA_FW_CCQ_QUEUE_ADDR_LSB 0
|
||||
#define PVA_FW_CCQ_QUEUE_N_ENTRIES_MSB 59
|
||||
#define PVA_FW_CCQ_QUEUE_N_ENTRIES_LSB 40
|
||||
|
||||
/* KMD and FW communicate using messages.
|
||||
*
|
||||
* A message can contain up to 6 uint32_t values.
|
||||
*
|
||||
* The first uint32_t is the header that contains message type and length.
|
||||
*/
|
||||
#define PVA_FW_MSG_MAX_LEN 6
|
||||
|
||||
/* KMD sends messages to R5 using the CCQ FIFO. The message length is always 64 bits. */
|
||||
|
||||
/* When R5 sends messages to KMD using CCQ statuses, we use statuses 3 - 8
|
||||
*
|
||||
* msg[0] = STATUS8 -> generate interrupt to KMD
|
||||
* msg[1] = STATUS3
|
||||
* msg[2] = STATUS4
|
||||
* msg[3] = STATUS5
|
||||
* msg[4] = STATUS6
|
||||
* msg[5] = STATUS7
|
||||
*/
|
||||
#define PVA_FW_MSG_STATUS_BASE 3
|
||||
#define PVA_FW_MSG_STATUS_LAST 8
|
||||
|
||||
#define PVA_FW_MSG_TYPE_MSB 30
|
||||
#define PVA_FW_MSG_TYPE_LSB 25
|
||||
#define PVA_FW_MSG_LEN_MSB 24
|
||||
#define PVA_FW_MSG_LEN_LSB 22
|
||||
/* The remaining bits (0 - 21) of msg[0] can be used for message specific
|
||||
* payload */
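/*
 * Illustrative sketch (not part of the original header): composing msg[0]
 * from a message type and length, following the bit ranges above. The helper
 * name is hypothetical.
 */
static inline uint32_t pva_example_msg_header(uint32_t type, uint32_t len)
{
	return ((type & 0x3FU) << PVA_FW_MSG_TYPE_LSB) |
	       ((len & 0x7U) << PVA_FW_MSG_LEN_LSB);
}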
|
||||
|
||||
/* Message types: R5 -> CCPLEX */
|
||||
#define PVA_FW_MSG_TYPE_ABORT 1
|
||||
#define PVA_FW_MSG_TYPE_BOOT_DONE 2
|
||||
#define PVA_FW_MSG_TYPE_FLUSH_PRINT 3
|
||||
#define PVA_FW_MSG_TYPE_RESOURCE_UNREGISTER 3
|
||||
|
||||
/* Message types: CCPLEX -> R5 */
|
||||
#define PVA_FW_MSG_TYPE_UPDATE_TAIL 32
|
||||
|
||||
/* Parameters for message ABORT
|
||||
* ABORT message contains a short string (up to 22 chars).
|
||||
* The first two characters are in the message header (bits 15 - 0).
|
||||
*/
|
||||
#define PVA_FW_MSG_ABORT_STR_MAX_LEN 22
|
||||
|
||||
/* Parameters for message BOOT_DONE */
|
||||
#define PVA_FW_MSG_R5_START_TIME_LO_IDX 1
|
||||
#define PVA_FW_MSG_R5_START_TIME_HI_IDX 2
|
||||
#define PVA_FW_MSG_R5_READY_TIME_LO_IDX 3
|
||||
#define PVA_FW_MSG_R5_READY_TIME_HI_IDX 4
|
||||
|
||||
/* Parameters for message FLUSH PRINT */
|
||||
struct pva_fw_print_buffer_header {
|
||||
#define PVA_FW_PRINT_BUFFER_OVERFLOWED (1 << 0)
|
||||
#define PVA_FW_PRINT_FAILURE (1 << 1)
|
||||
uint32_t flags;
|
||||
uint32_t tail;
|
||||
/* Followed by print content */
|
||||
};
|
||||
|
||||
/* Parameters for message resource unregister */
|
||||
/* Table ID is stored in msg[0], bit: 0 - 7 */
|
||||
#define PVA_FW_MSG_RESOURCE_TABLE_ID_MSB 7
|
||||
#define PVA_FW_MSG_RESOURCE_TABLE_ID_LSB 0
|
||||
/* Followed by up to 5 resource IDs. The actual number of resource ID is
|
||||
* indicated by the message length. */
|
||||
|
||||
/** @brief Circular buffer based data channel to share data between R5 and CCPLEX */
|
||||
struct pva_data_channel {
|
||||
uint32_t size;
|
||||
#define PVA_DATA_CHANNEL_OVERFLOW (1U << 0U)
|
||||
uint32_t flags;
|
||||
uint32_t head;
|
||||
/**
|
||||
* Offset in the circular buffer at which VPU printf data will be written by FW
|
||||
*/
|
||||
uint32_t tail;
|
||||
/* Immediately followed by circular buffer data */
|
||||
};
|
||||
|
||||
/* PVA FW Event profiling definitions */
|
||||
|
||||
// Event identifiers
|
||||
#define PVA_FW_EVENT_DO_CMD PVA_BIT8(1)
|
||||
#define PVA_FW_EVENT_SCAN_QUEUES PVA_BIT8(2)
|
||||
#define PVA_FW_EVENT_SCAN_SLOTS PVA_BIT8(3)
|
||||
#define PVA_FW_EVENT_RUN_VPU PVA_BIT8(4)
|
||||
|
||||
// Event message format
|
||||
struct pva_fw_event_message {
|
||||
uint32_t event : 5;
|
||||
uint32_t type : 3;
|
||||
uint32_t arg1 : 8;
|
||||
uint32_t arg2 : 8;
|
||||
uint32_t arg3 : 8;
|
||||
};
|
||||
|
||||
// Each event is one of the following types. This should fit within 3 bits
|
||||
enum pva_fw_events_type {
|
||||
EVENT_TRY = 0U,
|
||||
EVENT_START,
|
||||
EVENT_YIELD,
|
||||
EVENT_DONE,
|
||||
EVENT_ERROR,
|
||||
EVENT_TYPE_MAX = 7U
|
||||
};
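/*
 * Illustrative sketch (not part of the original header): filling one event
 * record for a "start running VPU" profiling event. The argument values are
 * event-specific placeholders and the helper name is hypothetical.
 */
static inline struct pva_fw_event_message
pva_example_run_vpu_start_event(uint8_t arg1, uint8_t arg2, uint8_t arg3)
{
	struct pva_fw_event_message msg = {
		.event = PVA_FW_EVENT_RUN_VPU,
		.type = EVENT_START,
		.arg1 = arg1,
		.arg2 = arg2,
		.arg3 = arg3,
	};

	return msg;
}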
|
||||
|
||||
static inline const char *event_type_to_string(enum pva_fw_events_type status)
|
||||
{
|
||||
switch (status) {
|
||||
case EVENT_TRY:
|
||||
return "TRY";
|
||||
case EVENT_START:
|
||||
return "START";
|
||||
case EVENT_YIELD:
|
||||
return "YIELD";
|
||||
case EVENT_DONE:
|
||||
return "DONE";
|
||||
case EVENT_ERROR:
|
||||
return "ERROR";
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
enum pva_fw_timestamp_t {
|
||||
TIMESTAMP_TYPE_TSE = 0,
|
||||
TIMESTAMP_TYPE_CYCLE_COUNT = 1
|
||||
};
|
||||
|
||||
struct pva_fw_profiling_buffer_header {
|
||||
#define PVA_FW_PROFILING_BUFFER_OVERFLOWED (1 << 0)
|
||||
#define PVA_FW_PROFILING_FAILURE (1 << 1)
|
||||
uint32_t flags;
|
||||
uint32_t tail;
|
||||
/* Followed by print content */
|
||||
};
|
||||
/* End of PVA FW Event profiling definitions */
|
||||
|
||||
struct pva_kmd_fw_tegrastats {
|
||||
uint64_t window_start_time;
|
||||
uint64_t window_end_time;
|
||||
uint64_t total_utilization[PVA_NUM_PVE];
|
||||
};
|
||||
|
||||
#endif // PVA_FW_H
|
||||
178
drivers/video/tegra/host/pva/src/fw/include/pva_fw_address_map.h
Normal file
178
drivers/video/tegra/host/pva/src/fw/include/pva_fw_address_map.h
Normal file
@@ -0,0 +1,178 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property
|
||||
* and proprietary rights in and to this software, related documentation
|
||||
* and any modifications thereto. Any use, reproduction, disclosure or
|
||||
* distribution of this software and related documentation without an express
|
||||
* license agreement from NVIDIA Corporation is strictly prohibited.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unit: Boot Unit
|
||||
* SWUD Document:
|
||||
* p4sw-swarm.nvidia.com/view/sw/embedded/docs/projects/active/DRIVE_6.0/QNX/PLC_Work_Products/Element_WPs/Autonomous_Middleware/PVA/04_Unit_Design/PVA_FW/SWE-PVAFW-006-SWUD.pdf
|
||||
*/
|
||||
#ifndef PVA_FW_ADDRESS_MAP_H
|
||||
#define PVA_FW_ADDRESS_MAP_H
|
||||
|
||||
/**
|
||||
* @brief Starting R5 address where FW code and data is placed.
|
||||
* This address is expected to be programmed in PVA_CFG_AR1PRIV_START by KMD.
|
||||
* This address is also expected to be used as offset where PVA_CFG_R5PRIV_LSEGREG1
|
||||
* and PVA_CFG_R5PRIV_USEGREG1 registers would point.
|
||||
*/
|
||||
#define FW_CODE_DATA_START_ADDR 0x60000000
|
||||
|
||||
/**
|
||||
* @brief R5 address where FW code and data is expected to end.
|
||||
* This address is expected to be programmed in PVA_CFG_AR1PRIV_END by KMD.
|
||||
*/
|
||||
#if PVA_DEV_MAIN_COMPATIBLE == 1
|
||||
#define FW_CODE_DATA_END_ADDR 0x60220000
|
||||
#else
|
||||
#define FW_CODE_DATA_END_ADDR 0x62000000
|
||||
#endif
|
||||
/**
|
||||
* @defgroup PVA_EXCEPTION_VECTORS
|
||||
*
|
||||
* @brief Following macros define R5 addresses that are expected to be
|
||||
* programmed by KMD in EVP registers as is.
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief R5 address of reset exception vector
|
||||
*/
|
||||
#define EVP_RESET_VECTOR 0x60040C00
|
||||
/**
|
||||
* @brief R5 address of undefined instruction exception vector
|
||||
*/
|
||||
#define EVP_UNDEFINED_INSTRUCTION_VECTOR (EVP_RESET_VECTOR + 0x400 * 1)
|
||||
/**
|
||||
* @brief R5 address of svc exception vector
|
||||
*/
|
||||
#define EVP_SVC_VECTOR (EVP_RESET_VECTOR + 0x400 * 2)
|
||||
/**
|
||||
* @brief R5 address of prefetch abort exception vector
|
||||
*/
|
||||
#define EVP_PREFETCH_ABORT_VECTOR (EVP_RESET_VECTOR + 0x400 * 3)
|
||||
/**
|
||||
* @brief R5 address of data abort exception vector
|
||||
*/
|
||||
#define EVP_DATA_ABORT_VECTOR (EVP_RESET_VECTOR + 0x400 * 4)
|
||||
/**
|
||||
* @brief R5 address of reserved exception vector.
|
||||
* It points to a dummy handler.
|
||||
*/
|
||||
#define EVP_RESERVED_VECTOR (EVP_RESET_VECTOR + 0x400 * 5)
|
||||
/**
|
||||
* @brief R5 address of IRQ exception vector
|
||||
*/
|
||||
#define EVP_IRQ_VECTOR (EVP_RESET_VECTOR + 0x400 * 6)
|
||||
/**
|
||||
* @brief R5 address of FIQ exception vector
|
||||
*/
|
||||
#define EVP_FIQ_VECTOR (EVP_RESET_VECTOR + 0x400 * 7)
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @defgroup PVA_DEBUG_BUFFERS
|
||||
*
|
||||
* @brief These buffers are arranged in the following order:
|
||||
* TRACE_BUFFER followed by CODE_COVERAGE_BUFFER followed by DEBUG_LOG_BUFFER.
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief Maximum size of trace buffer in bytes.
|
||||
*/
|
||||
#define FW_TRACE_BUFFER_SIZE 0x40000
|
||||
/**
|
||||
* @brief Maximum size of code coverage buffer in bytes.
|
||||
*/
|
||||
#define FW_CODE_COVERAGE_BUFFER_SIZE 0x80000
|
||||
/**
|
||||
* @brief Maximum size of debug log buffer in bytes.
|
||||
*/
|
||||
#if PVA_DEV_MAIN_COMPATIBLE == 1
|
||||
#define FW_DEBUG_LOG_BUFFER_SIZE 0x40000
|
||||
#else
|
||||
#define FW_DEBUG_LOG_BUFFER_SIZE 0x400000
|
||||
#endif
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @brief Total size of buffers used for FW debug in bytes.
|
||||
* TBD: Update this address based on build configuration once KMD changes are merged.
|
||||
*/
|
||||
#define FW_DEBUG_DATA_TOTAL_SIZE \
|
||||
(FW_TRACE_BUFFER_SIZE + FW_DEBUG_LOG_BUFFER_SIZE + \
|
||||
FW_CODE_COVERAGE_BUFFER_SIZE)
|
||||
|
||||
/**
|
||||
* @brief Starting R5 address where FW debug related data is placed.
|
||||
* This address is expected to be programmed in PVA_CFG_AR2PRIV_START by KMD.
|
||||
* This address is also expected to be used as offset where PVA_CFG_R5PRIV_LSEGREG2
|
||||
* and PVA_CFG_R5PRIV_USEGREG2 registers would point.
|
||||
*/
|
||||
#define FW_DEBUG_DATA_START_ADDR (0x70000000) //1879048192 0x70000000
|
||||
|
||||
/**
|
||||
* @brief R5 address where FW debug related data is expected to end.
|
||||
* This address is expected to be programmed in PVA_CFG_AR2PRIV_END by KMD.
|
||||
*/
|
||||
#define FW_DEBUG_DATA_END_ADDR \
|
||||
(FW_DEBUG_DATA_START_ADDR + FW_DEBUG_DATA_TOTAL_SIZE)
|
||||
|
||||
/**
|
||||
* @brief Starting R5 address where FW expects shared buffers between KMD and FW to be placed.
|
||||
* This is to be used as offset when programming PVA_CFG_R5USER_LSEGREG and PVA_CFG_R5USER_USEGREG.
|
||||
*/
|
||||
#define FW_SHARED_MEMORY_START (0x80000000U) //2147483648 0x80000000
|
||||
|
||||
/**
|
||||
* @defgroup PVA_HYP_SCR_VALUES
|
||||
*
|
||||
* @brief Following macros specify SCR firewall values that are expected to be
|
||||
* programmed by Hypervisor.
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief EVP SCR firewall to enable only CCPLEX read/write access.
|
||||
*/
|
||||
#define PVA_EVP_SCR_VAL 0x19000202
|
||||
|
||||
/**
|
||||
* @brief PRIV SCR firewall to enable only CCPLEX and R5 read/write access.
|
||||
*/
|
||||
#define PVA_PRIV_SCR_VAL 0x1F008282
|
||||
|
||||
/**
|
||||
* @brief CCQ SCR firewall to enable only CCPLEX write access and R5 read access.
|
||||
*/
|
||||
#define PVA_CCQ_SCR_VAL 0x19000280
|
||||
|
||||
/**
|
||||
* @brief Status Ctl SCR firewall to enable only CCPLEX read access and R5 read/write access.
|
||||
*/
|
||||
#define PVA_STATUS_CTL_SCR_VAL 0x1f008082
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @defgroup PVA_KMD_SCR_VALUES
|
||||
*
|
||||
* @brief Following macros specify SCR firewall values that are expected to be
|
||||
* programmed by KMD.
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief SECEXT_INTR SCR firewall to enable only CCPLEX and R5 read/write access.
|
||||
*/
|
||||
#define PVA_SEC_SCR_SECEXT_INTR_EVENT_VAL 0x39008282U
|
||||
/**
|
||||
* @brief PROC SCR firewall to enable only CCPLEX read/write access and R5 read only access.
|
||||
*/
|
||||
#define PVA_PROC_SCR_PROC_VAL 0x39000282U
|
||||
/** @} */
|
||||
|
||||
#endif
|
||||
120
drivers/video/tegra/host/pva/src/fw/include/pva_fw_hyp.h
Normal file
120
drivers/video/tegra/host/pva/src/fw/include/pva_fw_hyp.h
Normal file
@@ -0,0 +1,120 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_FW_HYP_H
|
||||
#define PVA_FW_HYP_H
|
||||
|
||||
/**
|
||||
* @defgroup PVA_BOOT_TIME_MBOX
|
||||
*
|
||||
* @brief This group defines the mailboxes used by KMD to pass start iovas required for
|
||||
* user segment and priv2 segment configuration during boot.
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief Used to pass bits 31-0 of start iova of user segment.
|
||||
*/
|
||||
#define PVA_MBOXID_USERSEG_L (1U)
|
||||
/**
|
||||
* @brief Used to pass bits 39-32 of start iova of user segment.
|
||||
*/
|
||||
#define PVA_MBOXID_USERSEG_H (2U)
|
||||
/**
|
||||
* @brief Used to pass bits 31-0 of start iova of priv2 segment.
|
||||
*/
|
||||
#define PVA_MBOXID_PRIV2SEG_L (3U)
|
||||
/**
|
||||
* @brief Used to pass bits 39-32 of start iova of priv2 segment.
|
||||
*/
|
||||
#define PVA_MBOXID_PRIV2SEG_H (4U)
|
||||
/** @} */
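/*
 * Illustrative sketch (not part of the original header): splitting a 40-bit
 * user-segment start IOVA into the low and high mailbox payloads described
 * above. The helper name is hypothetical; stdint.h is assumed to be available.
 */
static inline void pva_example_split_userseg_iova(uint64_t iova, uint32_t *lo,
						   uint32_t *hi)
{
	*lo = (uint32_t)(iova & 0xFFFFFFFFULL);
	*hi = (uint32_t)((iova >> 32) & 0xFFULL);
}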
|
||||
|
||||
/**
|
||||
* @defgroup PVA_SHARED_SEMAPHORE_STATUS_GROUP
|
||||
*
|
||||
* @brief The status bits for the shared semaphore which are mentioned in
* this group are used to communicate various pieces of information between KMD and
|
||||
* PVA R5 FW. The highest 16 bits are used to send information from KMD to
|
||||
* R5 FW and the lower 16 bits are used to send information from R5 FW to KMD by
|
||||
* writing to the @ref PVA_BOOT_SEMA semaphore
|
||||
*
|
||||
* The bit-mapping of the semaphore is described below. The table below shows the mapping which
|
||||
* is sent by KMD to FW.
|
||||
*
|
||||
* | Bit Position | Bit Field Name | Description |
|
||||
* |:------------:|:---------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------:|
|
||||
* | 31 | BOOT INT | To indicate that KMD is expecting an interrupt from R5 once boot is complete |
|
||||
* | 30 | Reserved | Reserved for future use |
|
||||
* | 27-25 | Reserved | Reserved for future use |
|
||||
* | 23-21 | Reserved | Reserved for future use |
|
||||
* | 20 | CG DISABLE | To indicate the PVA R5 FW should disable the clock gating feature |
|
||||
* | 19 | VMEM RD WAR DISABLE | To disable the VMEM Read fail workaround feature |
|
||||
* | 18-16 | Reserved | Reserved for future use |
|
||||
*
|
||||
* The table below shows the mapping which is sent by FW to KMD
|
||||
*
|
||||
* | Bit Position | Bit Field Name | Description |
|
||||
* |:------------:|:---------------------:|:-----------------------------------------------------------------------------------------------------------:|
|
||||
* | 15-11 | Reserved | Reserved for future use |
|
||||
* | 07-03 | Reserved | Reserved for future use |
|
||||
* | 02 | HALTED | To indicate to KMD that the PVA R5 FW has halted execution |
|
||||
* | 01 | BOOT DONE | To indicate to KMD that the PVA R5 FW booting is complete |
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! @endcond
|
||||
|
||||
/**
|
||||
* @brief This field is used to indicate that the R5 FW should
|
||||
* disable the clock gating feature
|
||||
*/
|
||||
#define PVA_BOOT_SEMA_CG_DISABLE PVA_BIT(20U)
|
||||
//! @cond DISABLE_DOCUMENTATION
|
||||
|
||||
/** Tell firmware that block linear surfaces are in XBAR_RAW format instead of
|
||||
* TEGRA_RAW format */
|
||||
#define PVA_BOOT_SEMA_USE_XBAR_RAW PVA_BIT(17U)
|
||||
|
||||
#define PVA_BOOT_SEMA 0U
|
||||
|
||||
/**
|
||||
* @brief This macro has the value to be set by KMD in the shared semaphores
|
||||
* @ref PVA_PREFENCE_SYNCPT_REGION_IOVA_SEM or @ref PVA_POSTFENCE_SYNCPT_REGION_IOVA_SEM
|
||||
* if the syncpoint reserved region must not be configured as uncached
|
||||
* in R5 MPU.
|
||||
*/
|
||||
#define PVA_R5_SYNCPT_REGION_IOVA_OFFSET_NOT_SET (0xFFFFFFFFU)
|
||||
/** @} */
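/* Illustrative sketch (not part of the original header): composing the
 * KMD-to-FW half of the boot semaphore from the bit table documented above.
 * Only PVA_BOOT_SEMA_CG_DISABLE is defined in this file; the BOOT INT (31)
 * and VMEM RD WAR DISABLE (19) positions are taken from the table and used
 * here purely for illustration. Assumes PVA_BIT() and bool are available.
 */
static inline uint32_t pva_example_boot_sema_value(bool want_boot_irq,
						   bool disable_cg,
						   bool disable_vmem_rd_war)
{
	uint32_t val = 0U;

	if (want_boot_irq)
		val |= PVA_BIT(31U);			/* BOOT INT */
	if (disable_cg)
		val |= PVA_BOOT_SEMA_CG_DISABLE;	/* CG DISABLE */
	if (disable_vmem_rd_war)
		val |= PVA_BIT(19U);			/* VMEM RD WAR DISABLE */

	return val;
}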
|
||||
|
||||
/* Runtime mailbox messages between firmware and hypervisor */
|
||||
|
||||
/* When the hypervisor sends messages to R5 through mailboxes, we use mailboxes 0 - 1
|
||||
* msg[0] = mailbox 1 -> generate interrupt to R5
|
||||
* msg[1] = mailbox 0
|
||||
*/
|
||||
#define PVA_FW_MBOX_TO_R5_BASE 0
|
||||
#define PVA_FW_MBOX_TO_R5_LAST 1
|
||||
|
||||
/* When R5 sends messages to the hypervisor through mailboxes, we use mailboxes 2 - 7
|
||||
* msg[0] = mailbox 7 -> generate interrupt to hypervisor
|
||||
* msg[1] = mailbox 2
|
||||
* msg[2] = mailbox 3
|
||||
* msg[3] = mailbox 4
|
||||
* msg[4] = mailbox 5
|
||||
* msg[5] = mailbox 6
|
||||
*/
|
||||
#define PVA_FW_MBOX_TO_HYP_BASE 2
|
||||
#define PVA_FW_MBOX_TO_HYP_LAST 7
|
||||
|
||||
#define PVA_FW_MBOX_FULL_BIT PVA_BIT(31)
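/* Illustrative sketch (not part of the original header): sending a two-word
 * message from the hypervisor to R5 using the mailbox layout described above.
 * pva_mbox_write() is a hypothetical register accessor shown only for
 * illustration; the real accessor is platform specific.
 */
void pva_mbox_write(uint32_t mbox_index, uint32_t value); /* hypothetical */

static inline void pva_example_hyp_send_to_r5(uint32_t msg0, uint32_t msg1)
{
	/* msg[1] goes to mailbox 0 first... */
	pva_mbox_write(PVA_FW_MBOX_TO_R5_BASE, msg1);
	/* ...then msg[0] goes to mailbox 1, which raises the interrupt to R5. */
	pva_mbox_write(PVA_FW_MBOX_TO_R5_LAST, msg0);
}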
|
||||
|
||||
#endif // PVA_FW_HYP_H
|
||||
340
drivers/video/tegra/host/pva/src/fw/include/pva_resource.h
Normal file
@@ -0,0 +1,340 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_RESOURCE_H
|
||||
#define PVA_RESOURCE_H
|
||||
#include "pva_api.h"
|
||||
#include "pva_api_dma.h"
|
||||
#include "pva_bit.h"
|
||||
#include "pva_constants.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
/* The sizes of these structs must be explicitly padded to align to 4 bytes */
|
||||
|
||||
struct pva_fw_dma_descriptor {
|
||||
uint8_t transfer_control0;
|
||||
uint8_t link_did;
|
||||
uint8_t src_adr1;
|
||||
uint8_t dst_adr1;
|
||||
uint32_t src_adr0;
|
||||
uint32_t dst_adr0;
|
||||
uint16_t tx;
|
||||
uint16_t ty;
|
||||
uint16_t slp_adv;
|
||||
uint16_t dlp_adv;
|
||||
uint32_t srcpt1_cntl;
|
||||
uint32_t dstpt1_cntl;
|
||||
uint32_t srcpt2_cntl;
|
||||
uint32_t dstpt2_cntl;
|
||||
uint32_t srcpt3_cntl;
|
||||
uint32_t dstpt3_cntl;
|
||||
uint16_t sb_start;
|
||||
uint16_t db_start;
|
||||
uint16_t sb_size;
|
||||
uint16_t db_size;
|
||||
uint16_t trig_ch_events;
|
||||
uint16_t hw_sw_trig_events;
|
||||
uint8_t px;
|
||||
uint8_t py;
|
||||
uint8_t transfer_control1;
|
||||
uint8_t transfer_control2;
|
||||
uint8_t cb_ext;
|
||||
uint8_t rsvd;
|
||||
uint16_t frda;
|
||||
};
|
||||
|
||||
/** Each slot is mapped to <reloc_count> number of pva_fw_dma_reloc. When
|
||||
* bind_dram/vmem_slot command is executed, the slot_id will be an index into
|
||||
* the slot array. The slot contains starting index and count of reloc structs.
|
||||
* All descriptor fields identified by the reloc structs will be patched.
|
||||
*/
|
||||
struct pva_fw_dma_slot {
|
||||
/** This slot can be bound to a DRAM buffer */
|
||||
#define PVA_FW_DMA_SLOT_FLAG_DRAM (1u << 0u)
|
||||
/** This slot can be bound to a L2SRAM buffer */
|
||||
#define PVA_FW_DMA_SLOT_FLAG_L2SRAM (1u << 1u)
|
||||
/** This slot can be bound to a VMEM DATA buffer */
|
||||
#define PVA_FW_DMA_SLOT_FLAG_VMEM_DATA (1u << 2u)
|
||||
/** This slot can be bound to a VMEM VPU config table buffer */
|
||||
#define PVA_FW_DMA_SLOT_FLAG_VMEM_VPUC_TABLE (1u << 3u)
|
||||
/** This slot has enabled circular buffer. Slot with this flags cannot be bound
|
||||
* to block linear surface. */
|
||||
#define PVA_FW_DMA_SLOT_FLAG_CB (1u << 4u)
|
||||
#define PVA_FW_DMA_SLOT_FLAG_BOUND (1u << 5u)
|
||||
uint8_t flags;
|
||||
uint8_t pad;
|
||||
/** Bitmask of channels that use this slot */
|
||||
uint16_t ch_use_mask;
|
||||
|
||||
/** The number of descriptor fields that share this slot. Each field
|
||||
* will have a pva_fw_dma_reloc struct
|
||||
*/
|
||||
uint16_t reloc_count;
|
||||
/** Starting index in the pva_fw_dma_reloc array */
|
||||
uint16_t reloc_start_idx;
|
||||
|
||||
int64_t start_addr;
|
||||
int64_t end_addr;
|
||||
};
|
||||
|
||||
static inline uint32_t get_slot_size(struct pva_fw_dma_slot const *slot)
|
||||
{
|
||||
uint32_t size = UINT32_MAX;
|
||||
int64_t tmp_size = 0;
|
||||
if (slot->end_addr < slot->start_addr) {
|
||||
return size;
|
||||
}
|
||||
tmp_size = slot->end_addr - slot->start_addr;
|
||||
if (tmp_size > (int64_t)UINT32_MAX) {
|
||||
return size;
|
||||
}
|
||||
size = (uint32_t)tmp_size;
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* A relocate struct identifies an address field (src, dst or dst2) in
|
||||
* the descriptor. The identified address field contains an offset instead of
|
||||
* absolute address. The base address will be added to the offset during
|
||||
* binding.
|
||||
*
|
||||
* This struct only has 2 bytes, so an array of this struct must have an even
|
||||
* number of elements to satisfy alignment requirement.
|
||||
*/
|
||||
struct pva_fw_dma_reloc {
|
||||
uint8_t desc_index;
|
||||
/** This relocation is for source field */
|
||||
#define PVA_FW_DMA_RELOC_FIELD_SRC 1u
|
||||
/** This relocation is for destination field */
|
||||
#define PVA_FW_DMA_RELOC_FIELD_DST 2u
|
||||
/** This relocation is for destination 2 field */
|
||||
#define PVA_FW_DMA_RELOC_FIELD_DST2 3u
|
||||
uint8_t field;
|
||||
};
|
||||
|
||||
struct pva_fw_dma_channel {
|
||||
uint32_t cntl0;
|
||||
uint32_t cntl1;
|
||||
uint32_t boundary_pad;
|
||||
uint32_t hwseqcntl;
|
||||
uint32_t hwseqfscntl;
|
||||
};
|
||||
|
||||
struct pva_fw_data_section_info {
|
||||
	uint32_t data_buf_off; /**< Offset into the data-section byte array */
|
||||
uint32_t vmem_addr;
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
struct pva_dma_resource_map {
|
||||
// TODO: These macros should be derived using the maximum limits across platforms
|
||||
// Today, they are being hardcoded. Make it automatic
|
||||
#define PVA_DMA_NUM_CHANNEL_PARTITIONS \
|
||||
((PVA_MAX_NUM_DMA_CHANNELS) / (PVA_DMA_CHANNEL_ALIGNMENT))
|
||||
#define PVA_DMA_NUM_DESCRIPTOR_PARTITIONS \
|
||||
((PVA_MAX_NUM_DMA_DESC) / (PVA_DMA_DESCRIPTOR_ALIGNMENT))
|
||||
#define PVA_DMA_NUM_ADB_PARTITIONS \
|
||||
((PVA_MAX_NUM_ADB_BUFFS) / (PVA_DMA_ADB_ALIGNMENT))
|
||||
#define PVA_DMA_NUM_HWSEQ_WORD_PARTITIONS \
|
||||
((PVA_MAX_NUM_HWSEQ_WORDS) / (PVA_DMA_HWSEQ_WORD_ALIGNMENT))
|
||||
|
||||
uint64_t channels : PVA_DMA_NUM_CHANNEL_PARTITIONS;
|
||||
uint64_t descriptors : PVA_DMA_NUM_DESCRIPTOR_PARTITIONS;
|
||||
uint64_t adbs : PVA_DMA_NUM_ADB_PARTITIONS;
|
||||
uint64_t hwseq_words : PVA_DMA_NUM_HWSEQ_WORD_PARTITIONS;
|
||||
uint64_t triggers : 1;
|
||||
};
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_reset(struct pva_dma_resource_map *resource_map)
|
||||
{
|
||||
resource_map->channels = 0u;
|
||||
resource_map->descriptors = 0u;
|
||||
resource_map->adbs = 0u;
|
||||
resource_map->hwseq_words = 0u;
|
||||
resource_map->triggers = 0u;
|
||||
}
|
||||
|
||||
// Note: the following pva_dma_resource_map_* APIs assume an alignment requirement
|
||||
// on the 'start' index. We do not enforce it here though. If this requirement
|
||||
// is not met, the FW may falsely predict resource conflicts between commands.
|
||||
// However, this will not impact functionality or correctness.
|
||||
static inline void
|
||||
pva_dma_resource_map_add_channels(struct pva_dma_resource_map *map,
|
||||
uint16_t start, uint16_t count)
|
||||
{
|
||||
map->channels |= pva_mask64(start, count, PVA_DMA_CHANNEL_ALIGNMENT);
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_add_descriptors(struct pva_dma_resource_map *map,
|
||||
uint16_t start, uint16_t count)
|
||||
{
|
||||
map->descriptors |=
|
||||
pva_mask64(start, count, PVA_DMA_DESCRIPTOR_ALIGNMENT);
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_add_adbs(struct pva_dma_resource_map *map, uint16_t start,
|
||||
uint16_t count)
|
||||
{
|
||||
map->adbs |= pva_mask64(start, count, PVA_DMA_ADB_ALIGNMENT);
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_add_hwseq_words(struct pva_dma_resource_map *map,
|
||||
uint16_t start, uint16_t count)
|
||||
{
|
||||
map->hwseq_words |=
|
||||
pva_mask64(start, count, PVA_DMA_HWSEQ_WORD_ALIGNMENT);
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_add_triggers(struct pva_dma_resource_map *map)
|
||||
{
|
||||
// If an application is running on VPU, it has access to all the triggers
|
||||
// Only FW and DMA-only workloads can initiate transfers in parallel to
|
||||
// a running VPU application, but they do not require triggers.
|
||||
map->triggers |= 1;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_copy_channels(struct pva_dma_resource_map *dst_map,
|
||||
struct pva_dma_resource_map *src_map)
|
||||
{
|
||||
dst_map->channels |= src_map->channels;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_copy_descriptors(struct pva_dma_resource_map *dst_map,
|
||||
struct pva_dma_resource_map *src_map)
|
||||
{
|
||||
dst_map->descriptors |= src_map->descriptors;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_copy_adbs(struct pva_dma_resource_map *dst_map,
|
||||
struct pva_dma_resource_map *src_map)
|
||||
{
|
||||
dst_map->adbs |= src_map->adbs;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_copy_triggers(struct pva_dma_resource_map *dst_map,
|
||||
struct pva_dma_resource_map *src_map)
|
||||
{
|
||||
dst_map->triggers |= src_map->triggers;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_dma_resource_map_copy_hwseq_words(struct pva_dma_resource_map *dst_map,
|
||||
struct pva_dma_resource_map *src_map)
|
||||
{
|
||||
dst_map->hwseq_words |= src_map->hwseq_words;
|
||||
}
|
||||
|
||||
struct pva_dma_config_resource {
|
||||
uint8_t base_descriptor;
|
||||
uint8_t base_channel;
|
||||
uint8_t num_descriptors;
|
||||
uint8_t num_channels;
|
||||
|
||||
uint16_t num_dynamic_slots;
|
||||
/** Must be an even number to satisfy padding requirement. */
|
||||
uint16_t num_relocs;
|
||||
/** Indices of channels. Once the corresponding bit is set, the block height of
|
||||
* this channel should not be changed. */
|
||||
uint16_t ch_block_height_fixed_mask;
|
||||
|
||||
uint16_t base_hwseq_word;
|
||||
uint16_t num_hwseq_words;
|
||||
uint16_t pad;
|
||||
|
||||
uint32_t vpu_exec_resource_id;
|
||||
uint32_t common_config;
|
||||
uint32_t output_enable[PVA_NUM_DMA_TRIGGERS];
|
||||
|
||||
struct pva_dma_resource_map dma_resource_map;
|
||||
/* Followed by <num_dynamic_slots> of pva_fw_dma_slot */
|
||||
/* Followed by <num_reloc_infos> of pva_fw_dma_reloc */
|
||||
/* Followed by an array of pva_fw_dma_channel */
|
||||
/* Followed by an array of pva_fw_dma_descriptor */
|
||||
|
||||
/* =====================================================================
|
||||
* The following fields do not need to be fetched into TCM. The DMA config
|
||||
* resource size (as noted in the resource table) does not include these
|
||||
* fields */
|
||||
|
||||
/* Followed by an array of hwseq words */
|
||||
};
|
||||
|
||||
struct pva_fw_vmem_buffer {
|
||||
#define PVA_FW_SYM_TYPE_MSB 31
|
||||
#define PVA_FW_SYM_TYPE_LSB 29
|
||||
#define PVA_FW_VMEM_ADDR_MSB 28
|
||||
#define PVA_FW_VMEM_ADDR_LSB 0
|
||||
uint32_t addr;
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
struct pva_exec_bin_resource {
|
||||
uint8_t code_addr_hi;
|
||||
uint8_t data_section_addr_hi;
|
||||
uint8_t num_data_sections;
|
||||
uint8_t pad;
|
||||
|
||||
uint32_t code_addr_lo;
|
||||
uint32_t data_section_addr_lo;
|
||||
uint32_t code_size;
|
||||
uint32_t num_vmem_buffers;
|
||||
|
||||
/* Followed by <num_data_sections> number of pva_fw_data_section_info */
|
||||
/* Followed by <num_vmem_buffers> number of pva_fw_vmem_buffer */
|
||||
};
|
||||
|
||||
static inline struct pva_fw_dma_slot *
|
||||
pva_dma_config_get_slots(struct pva_dma_config_resource *dma_config)
|
||||
{
|
||||
return (struct pva_fw_dma_slot
|
||||
*)((uint8_t *)dma_config +
|
||||
sizeof(struct pva_dma_config_resource));
|
||||
}
|
||||
|
||||
static inline struct pva_fw_dma_reloc *
|
||||
pva_dma_config_get_relocs(struct pva_dma_config_resource *dma_config)
|
||||
{
|
||||
return (struct pva_fw_dma_reloc
|
||||
*)((uint8_t *)pva_dma_config_get_slots(dma_config) +
|
||||
sizeof(struct pva_fw_dma_slot) *
|
||||
dma_config->num_dynamic_slots);
|
||||
}
|
||||
|
||||
static inline struct pva_fw_dma_channel *
|
||||
pva_dma_config_get_channels(struct pva_dma_config_resource *dma_config)
|
||||
{
|
||||
return (struct pva_fw_dma_channel *)((uint8_t *)
|
||||
pva_dma_config_get_relocs(
|
||||
dma_config) +
|
||||
sizeof(struct pva_fw_dma_reloc) *
|
||||
dma_config->num_relocs);
|
||||
}
|
||||
|
||||
static inline struct pva_fw_dma_descriptor *
|
||||
pva_dma_config_get_descriptors(struct pva_dma_config_resource *dma_config)
|
||||
{
|
||||
return (struct pva_fw_dma_descriptor
|
||||
*)((uint8_t *)pva_dma_config_get_channels(dma_config) +
|
||||
sizeof(struct pva_fw_dma_channel) *
|
||||
dma_config->num_channels);
|
||||
}
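/* Illustrative sketch (not part of the original header): walking the dynamic
 * slot array that immediately follows a pva_dma_config_resource, using the
 * accessors above. Counting bound slots is an arbitrary example.
 */
static inline uint32_t
pva_example_count_bound_slots(struct pva_dma_config_resource *dma_config)
{
	struct pva_fw_dma_slot *slots = pva_dma_config_get_slots(dma_config);
	uint32_t bound = 0U;
	uint16_t i;

	for (i = 0U; i < dma_config->num_dynamic_slots; i++) {
		if ((slots[i].flags & PVA_FW_DMA_SLOT_FLAG_BOUND) != 0U)
			bound++;
	}
	return bound;
}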
|
||||
|
||||
#endif // PVA_RESOURCE_H
|
||||
349
drivers/video/tegra/host/pva/src/include/pva_api.h
Normal file
@@ -0,0 +1,349 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_API_H
|
||||
#define PVA_API_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "pva_api_types.h"
|
||||
#include "pva_api_dma.h"
|
||||
#include "pva_api_vpu.h"
|
||||
#include "pva_api_cmdbuf.h"
|
||||
|
||||
/* Core APIs */
|
||||
|
||||
/**
|
||||
* @brief Create a PVA context.
|
||||
*
|
||||
* @param[in] pva_index Select which PVA instance to use if there are multiple PVAs
|
||||
* in the SOC.
|
||||
* @param[in] max_resource_count Maximum number of resources this context can have.
|
||||
* @param[out] ctx Pointer to the created context.
|
||||
*/
|
||||
enum pva_error pva_context_create(uint32_t pva_index,
|
||||
uint32_t max_resource_count,
|
||||
struct pva_context **ctx);
|
||||
|
||||
/**
|
||||
* @brief Destroy a PVA context.
|
||||
*
|
||||
* A context can only be destroyed after all queues are destroyed.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context to destroy.
|
||||
*/
|
||||
void pva_context_destroy(struct pva_context *ctx);
|
||||
|
||||
/**
|
||||
* @brief Create a PVA queue.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
* @param[in] max_submission_count Max number of submissions that can be queued.
|
||||
* @param[out] queue Pointer to the created queue.
|
||||
*/
|
||||
enum pva_error pva_queue_create(struct pva_context *ctx,
|
||||
uint32_t max_submission_count,
|
||||
struct pva_queue **queue);
|
||||
|
||||
/**
|
||||
* @brief Destroy a PVA queue.
|
||||
*
|
||||
* @param[in] queue Pointer to the queue to destroy.
|
||||
*/
|
||||
void pva_queue_destroy(struct pva_queue *queue);
|
||||
|
||||
/**
|
||||
 * @brief Allocate DRAM memory that can be mapped to PVA's device space.
|
||||
*
|
||||
* @param[in] size Size of the memory to allocate.
|
||||
* @param[out] out_mem Pointer to the allocated memory.
|
||||
*/
|
||||
enum pva_error pva_memory_alloc(uint64_t size, struct pva_memory **out_mem);
|
||||
|
||||
/**
|
||||
* @brief Map the memory to CPU's virtual space.
|
||||
*
|
||||
* @param[in] mem Pointer to the memory to map.
|
||||
* @param[in] access_mode Access mode for the memory. PVA_ACCESS_RD or
|
||||
* PVA_ACCESS_RW.
|
||||
* @param[out] out_va Pointer to the virtual address of the mapped memory.
|
||||
*/
|
||||
enum pva_error pva_memory_cpu_map(struct pva_memory *mem, uint32_t access_mode,
|
||||
void **out_va);
|
||||
|
||||
/**
|
||||
* @brief Unmap the memory from CPU's virtual space.
|
||||
*
|
||||
* @param[in] mem Pointer to the memory to unmap.
|
||||
* @param[in] va Previously mapped virtual address.
|
||||
*/
|
||||
enum pva_error pva_memory_cpu_unmap(struct pva_memory *mem, void *va);
|
||||
|
||||
/**
|
||||
* @brief Free the memory.
|
||||
*
|
||||
* Freeing a registered memory is okay since KMD holds a reference to the memory.
|
||||
*
|
||||
* @param mem Pointer to the memory to free.
|
||||
*/
|
||||
void pva_memory_free(struct pva_memory *mem);
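/* Illustrative usage sketch (not part of the original header): a minimal
 * allocate/map/use/unmap/free sequence for the memory API above. Assumes
 * PVA_SUCCESS and PVA_ACCESS_RW from pva_api_types.h; error handling is
 * reduced to early returns for brevity.
 */
static inline enum pva_error pva_example_memory_roundtrip(void)
{
	struct pva_memory *mem = NULL;
	void *cpu_va = NULL;
	enum pva_error err;

	err = pva_memory_alloc(4096U, &mem);
	if (err != PVA_SUCCESS)
		return err;

	err = pva_memory_cpu_map(mem, PVA_ACCESS_RW, &cpu_va);
	if (err == PVA_SUCCESS) {
		/* ... fill or read the buffer through cpu_va ... */
		(void)pva_memory_cpu_unmap(mem, cpu_va);
	}

	pva_memory_free(mem);
	return err;
}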
|
||||
|
||||
/**
|
||||
* @brief Wait for a syncpoint to reach a value.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
 * @param[in] syncpoint_id Syncpoint ID to wait on.
|
||||
* @param[in] value Value to wait for.
|
||||
* @param[in] timeout_us Timeout in microseconds. PVA_TIMEOUT_INF for infinite.
|
||||
*/
|
||||
enum pva_error pva_syncpoint_wait(struct pva_context *ctx,
|
||||
				  uint32_t syncpoint_id, uint32_t value,
|
||||
uint64_t timeout_us);
|
||||
|
||||
/**
|
||||
* @brief Submit a batch of command buffers.
|
||||
*
|
||||
* @param[in] queue Pointer to the queue.
|
||||
* @param[in] submit_infos Array of submit info structures.
|
||||
* @param[in] count Number of submit info structures.
|
||||
* @param[in] timeout_us Timeout in microseconds. PVA_TIMEOUT_INF for infinite.
|
||||
*
|
||||
* @note Concurrent submission to the same queue needs to be serialized by the
|
||||
* caller.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_cmdbuf_batch_submit(struct pva_queue *queue,
|
||||
struct pva_cmdbuf_submit_info *submit_infos,
|
||||
uint32_t count, uint64_t timeout_us);
|
||||
|
||||
/**
|
||||
* @brief Get the symbol table for a registered executable.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
* @param[in] exe_resource_id Resource ID of the executable.
|
||||
* @param[out] out_info Pointer to the symbol info array.
|
||||
* @param[in] max_num_symbols Maximum number of symbols to return.
|
||||
*/
|
||||
enum pva_error pva_executable_get_symbols(struct pva_context *ctx,
|
||||
uint32_t exe_resource_id,
|
||||
struct pva_symbol_info *out_info,
|
||||
uint32_t max_num_symbols);
|
||||
|
||||
/**
|
||||
* @brief Submit a list of asynchronous registration operations to KMD.
|
||||
*
|
||||
* The operations can be:
|
||||
* - Memory registration
|
||||
* - Executable registration
|
||||
* - DMA config registration
|
||||
*
|
||||
* The response buffer will contain the resource IDs of the registered
|
||||
* resources. Any command buffers that use these resources should wait on the
|
||||
* returned post fence.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
* @param[in] fence Pointer to the post fence to wait on. If NULL, it means the
|
||||
* caller is not interested in waiting. This usually only applies to unregister
|
||||
* operations.
|
||||
 * @param[in] input_buffer Input buffer containing the list of operations.
|
||||
 * @param[out] output_buffer Output buffer to store the response.
|
||||
*
|
||||
* @note Input and output buffer may be the same buffer.
|
||||
*/
|
||||
enum pva_error pva_ops_submit_async(struct pva_context *ctx,
|
||||
struct pva_fence *fence,
|
||||
struct pva_ops_buffer const *input_buffer,
|
||||
struct pva_ops_buffer *output_buffer);
|
||||
|
||||
/**
|
||||
* @brief Perform a list of registration operations synchronously.
|
||||
*
|
||||
* The operations can be:
|
||||
* - Memory registration
|
||||
* - Executable registration
|
||||
* - DMA config registration
|
||||
*
|
||||
* The response buffer will contain the resource IDs of the registered
|
||||
* resources.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
 * @param[in] input_buffer Input buffer containing the list of operations.
|
||||
 * @param[out] output_buffer Output buffer to store the response.
|
||||
*
|
||||
* @note Input and output buffer may be the same buffer.
|
||||
*
|
||||
*/
|
||||
enum pva_error pva_ops_submit(struct pva_context *ctx,
|
||||
struct pva_ops_buffer const *input_buffer,
|
||||
struct pva_ops_buffer *output_buffer);
|
||||
|
||||
/** Size of the ops buffer header. When user allocates memory for ops buffer,
|
||||
* this size needs to be added. */
|
||||
#define PVA_OPS_BUFFER_HEADER_SIZE 64
|
||||
/**
|
||||
* @brief Initialize pva_ops_buffer to keep track of the state of
|
||||
* operations buffer during preparation.
|
||||
*
|
||||
* @param[out] buf_handle Pointer to the pva_ops_buffer object to initialize.
|
||||
* @param[in] buf Pointer to the buffer that will store the operations.
|
||||
* @param[in] size Size of the buffer.
|
||||
*/
|
||||
enum pva_error pva_ops_buffer_init(struct pva_ops_buffer *buf_handle, void *buf,
|
||||
uint32_t size);
|
||||
|
||||
#define PVA_OPS_MEMORY_REG_SIZE 64
|
||||
/**
|
||||
* @brief Append a memory registration operation to the operations buffer.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
* @param[in] mem Pointer to the memory to register.
|
||||
* @param[in] segment Memory segment to register.
|
||||
* @param[in] access_flags Access flags for the memory.
|
||||
* @param[out] op_buf Pointer to the operations buffer.
|
||||
*/
|
||||
enum pva_error pva_ops_append_memory_register(struct pva_context *ctx,
|
||||
struct pva_memory *mem,
|
||||
enum pva_memory_segment segment,
|
||||
uint32_t access_flags,
|
||||
struct pva_ops_buffer *op_buf);
|
||||
#define PVA_OPS_EXEC_REG_HEADER_SIZE 16
|
||||
/**
|
||||
* @brief Append an executable registration operation to the operations.
|
||||
*
|
||||
* @param[in] ctx Pointer to the context.
|
||||
* @param[in] executable Pointer to the executable binary content.
|
||||
* @param[in] executable_size Size of the executable.
|
||||
* @param[out] op_buf Pointer to the operations buffer.
|
||||
*/
|
||||
enum pva_error pva_ops_append_executable_register(
|
||||
struct pva_context *ctx, void const *executable,
|
||||
uint32_t executable_size, struct pva_ops_buffer *op_buf);
|
||||
|
||||
#define PVA_OPS_DMA_CONFIG_REG_SIZE (24 * 1024)
|
||||
/**
|
||||
* @brief Append a DMA config registration operation to the operations.
|
||||
* @param[in] ctx Pointer to the context.
|
||||
* @param[in] dma_config Pointer to the DMA config.
|
||||
* @param[out] op_buf Pointer to the operations buffer.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_ops_append_dma_config_register(struct pva_context *ctx,
|
||||
struct pva_dma_config const *dma_config,
|
||||
struct pva_ops_buffer *op_buf);
|
||||
|
||||
#define PVA_OPS_UNREG_SIZE 16
|
||||
enum pva_error pva_ops_append_unregister(struct pva_context *ctx,
|
||||
uint32_t resource_id,
|
||||
struct pva_ops_buffer *op_buf);
|
||||
|
||||
/**
|
||||
* @brief Parse the response buffer to get the resource ID of the registered
|
||||
* memory or DMA configuration.
|
||||
*
|
||||
* @param[in] resp_buf Pointer to the response buffer.
|
||||
* @param[out] resource_id output resource ID.
|
||||
*/
|
||||
enum pva_error pva_ops_parse_register_resp(struct pva_ops_buffer *resp_buf,
|
||||
uint32_t *resource_id);
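/* Illustrative usage sketch (not part of the original header): registering a
 * DRAM buffer through the operations interface above. Assumes PVA_SUCCESS
 * from pva_api_types.h and that struct pva_ops_buffer is a complete type the
 * caller may place on the stack; any stricter alignment requirement on the
 * backing storage is ignored here.
 */
static inline enum pva_error
pva_example_register_memory(struct pva_context *ctx, struct pva_memory *mem,
			    enum pva_memory_segment segment,
			    uint32_t access_flags, uint32_t *resource_id)
{
	/* Backing storage: ops buffer header plus one memory-register op. */
	uint32_t storage[(PVA_OPS_BUFFER_HEADER_SIZE +
			  PVA_OPS_MEMORY_REG_SIZE) / sizeof(uint32_t)];
	struct pva_ops_buffer ops;
	enum pva_error err;

	err = pva_ops_buffer_init(&ops, storage, (uint32_t)sizeof(storage));
	if (err != PVA_SUCCESS)
		return err;

	err = pva_ops_append_memory_register(ctx, mem, segment, access_flags,
					      &ops);
	if (err != PVA_SUCCESS)
		return err;

	/* Input and output buffer may be the same buffer (see pva_ops_submit). */
	err = pva_ops_submit(ctx, &ops, &ops);
	if (err != PVA_SUCCESS)
		return err;

	return pva_ops_parse_register_resp(&ops, resource_id);
}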
|
||||
|
||||
/**
|
||||
* @brief Parse the response buffer to get the resource ID of the registered
|
||||
* executable.
|
||||
*
|
||||
 * @param[in] op_buf Pointer to the response buffer.
|
||||
* @param[out] num_symbols Number of symbols in the executable.
|
||||
* @param[out] resource_id output resource ID.
|
||||
*/
|
||||
enum pva_error pva_ops_parse_exec_register_resp(struct pva_ops_buffer *op_buf,
|
||||
uint32_t *num_symbols,
|
||||
uint32_t *resource_id);
|
||||
|
||||
#define PVA_DATA_CHANNEL_HEADER_SIZE 32
|
||||
/**
|
||||
* @brief Initialize VPU print buffer
|
||||
*
|
||||
* @param[in] data Pointer to VPU print buffer.
|
||||
 * @param[in] size Size of VPU print buffer.
 * @param[out] data_channel Pointer to the created data channel object.
|
||||
*/
|
||||
struct pva_data_channel;
|
||||
enum pva_error pva_init_data_channel(void *data, uint32_t size,
|
||||
struct pva_data_channel **data_channel);
|
||||
|
||||
/**
|
||||
* @brief Read VPU print buffer
|
||||
*
|
||||
 * @param[in] data_channel Pointer to the data channel object.
|
||||
* @param[out] read_buffer Pointer to output buffer in which data will be read.
|
||||
* @param[in] bufferSize Size of output buffer.
|
||||
* @param[out] read_size Size of actual data read in output buffer.
|
||||
*/
|
||||
enum pva_error pva_read_data_channel(struct pva_data_channel *data_channel,
|
||||
uint8_t *read_buffer, uint32_t bufferSize,
|
||||
uint32_t *read_size);
|
||||
|
||||
/**
|
||||
* @brief Duplicate PVA memory object.
|
||||
*
|
||||
* This function duplicates a PVA memory object. The new object will have shared
|
||||
* ownership of the memory.
|
||||
*
|
||||
* @param[in] src Pointer to the source memory object.
|
||||
* @param[in] access_mode Access mode for the new memory object. It should be
|
||||
* more restrictive than the source memory. Passing 0 will use the same access
|
||||
* mode as the source memory.
|
||||
* @param[out] dst Resulting duplicated memory object.
|
||||
*/
|
||||
enum pva_error pva_memory_duplicate(struct pva_memory *src,
|
||||
uint32_t access_mode,
|
||||
struct pva_memory **dst);
|
||||
|
||||
/**
|
||||
* @brief Get memory attributes.
|
||||
*
|
||||
* @param[in] mem Pointer to the memory.
|
||||
* @param[out] out_attrs Pointer to the memory attributes.
|
||||
*/
|
||||
void pva_memory_get_attrs(struct pva_memory const *mem,
|
||||
struct pva_memory_attrs *out_attrs);
|
||||
|
||||
/** \brief Specifies the PVA system software major version. */
|
||||
#define PVA_SYSSW_MAJOR_VERSION (2U)
|
||||
|
||||
/** \brief Specifies the PVA system software minor version. */
|
||||
#define PVA_SYSSW_MINOR_VERSION (7U)
|
||||
|
||||
/**
|
||||
* @brief Get PVA system software version.
|
||||
*
|
||||
* PVA system software version is defined as the latest version of cuPVA which is fully supported
|
||||
* by this version of the PVA system software.
|
||||
*
|
||||
* @param[out] version version of currently running system SW, computed as:
|
||||
(PVA_SYSSW_MAJOR_VERSION * 1000) + PVA_SYSSW_MINOR_VERSION
|
||||
* @return PVA_SUCCESS on success, else error code indicating the failure.
|
||||
*/
|
||||
enum pva_error pva_get_version(uint32_t *version);
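/* Illustrative sketch (not part of the original header): splitting the value
 * returned by pva_get_version() back into major/minor, following the
 * (major * 1000) + minor encoding documented above.
 */
static inline void pva_example_split_version(uint32_t version,
					     uint32_t *major, uint32_t *minor)
{
	*major = version / 1000U;	/* e.g. 2007 -> 2 */
	*minor = version % 1000U;	/* e.g. 2007 -> 7 */
}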
|
||||
|
||||
/**
|
||||
* @brief Get the hardware characteristics of the PVA.
|
||||
*
|
||||
* @param[out] pva_hw_char Pointer to the hardware characteristics.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_get_hw_characteristics(struct pva_characteristics *pva_hw_char);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // PVA_API_H
|
||||
627
drivers/video/tegra/host/pva/src/include/pva_api_cmdbuf.h
Normal file
@@ -0,0 +1,627 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_API_CMDBUF_H
|
||||
#define PVA_API_CMDBUF_H
|
||||
#include "pva_api_types.h"
|
||||
|
||||
//Maximum number of slots for maintaining Timestamps
|
||||
#define PVA_MAX_QUERY_SLOTS_COUNT 32U
|
||||
|
||||
/** The common header for all commands.
|
||||
*/
|
||||
struct pva_cmd_header {
|
||||
#define PVA_CMD_PRIV_OPCODE_FLAG (1U << 7U)
|
||||
/** Opcode for the command. MSB of opcode indicates whether this command is
|
||||
* privileged or not */
|
||||
uint8_t opcode;
|
||||
/** Command specific flags */
|
||||
uint8_t flags;
|
||||
/**
|
||||
* For pva_cmd_barrier: barrier_group specifies which group this barrier
|
||||
* waits for.
|
||||
* For pva_cmd_retire_barrier_group: barrier_group specifies which id will
|
||||
* be retired. Retired ids can be re-used by future commands and will refer
|
||||
* to a new logical group.
|
||||
* For all other commands: barrier_group specifies which barrier group this
|
||||
* command belongs to. Other commands are able to defer execution until all
|
||||
* commands in the barrier group have completed, or stall the cmd buffer
|
||||
* until such a time. Note that asynchronous commands may complete in an
|
||||
	 * order different from the order in which they appear in the command
|
||||
* buffer.
|
||||
*/
|
||||
uint8_t barrier_group;
|
||||
/** Length in 4-bytes, including this header. */
|
||||
uint8_t len;
|
||||
};
|
||||
|
||||
struct pva_user_dma_allowance {
|
||||
#define PVA_USER_DMA_ALLOWANCE_ADB_STEP_SIZE 8
|
||||
uint32_t channel_idx : 4;
|
||||
uint32_t desc_start_idx : 7;
|
||||
uint32_t desc_count : 7;
|
||||
uint32_t adb_start_idx : 6;
|
||||
uint32_t adb_count : 6;
|
||||
};
|
||||
|
||||
/* Basic Commands */
|
||||
|
||||
/** Does nothing. It can be used as a place holder in the command buffer. */
|
||||
struct pva_cmd_noop {
|
||||
#define PVA_CMD_OPCODE_NOOP 0U
|
||||
struct pva_cmd_header header;
|
||||
};
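/* Illustrative sketch (not part of the original header): filling the common
 * header for a no-op command. The len field counts 4-byte words including the
 * header itself, so a bare pva_cmd_noop has len == 1. The barrier group is
 * set to 0 here purely for illustration.
 */
static inline void pva_example_init_noop(struct pva_cmd_noop *cmd)
{
	cmd->header.opcode = PVA_CMD_OPCODE_NOOP;
	cmd->header.flags = 0U;
	cmd->header.barrier_group = 0U;
	cmd->header.len = (uint8_t)(sizeof(*cmd) / sizeof(uint32_t));
}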
|
||||
|
||||
/** Link next chunk. This command can be placed anywhere in the command buffer.
|
||||
* Firmware will start fetching the next chunk when this command is executed. */
|
||||
struct pva_cmd_link_chunk {
|
||||
#define PVA_CMD_OPCODE_LINK_CHUNK 1U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t next_chunk_offset_hi;
|
||||
uint8_t pad;
|
||||
uint16_t next_chunk_size; /**< Size of next chunk in bytes */
|
||||
uint32_t next_chunk_resource_id;
|
||||
uint32_t next_chunk_offset_lo;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
};
|
||||
|
||||
/** Barrier command. The user can assign a barrier group to any asynchronous
|
||||
* command. The barrier command blocks FW execution until the specified group of
|
||||
* asynchronous commands have completed. Up to 8 barrier groups are supported.
|
||||
*
|
||||
* @note A barrier command is not typically required since FW stalls
|
||||
* automatically in the event of hardware conflicts or when issuing a command is
|
||||
* deemed unsafe according to the state machines. However, if a stall is needed
|
||||
* for other reasons, the barrier command can be utilized.
|
||||
*/
|
||||
struct pva_cmd_barrier {
|
||||
#define PVA_CMD_OPCODE_BARRIER 2U
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
/** Acquire one or more PVE systems, each of which includes a VPS, DMA and PPE.
|
||||
* It blocks until specified number of engines are acquired.
|
||||
* By default, the lowest engine ID acquired is set as the current engine.
|
||||
* Acquired engines will be automatically released when this command buffer finishes.
|
||||
* They can also be released using release_engine command.
|
||||
*/
|
||||
struct pva_cmd_acquire_engine {
|
||||
#define PVA_CMD_OPCODE_ACQUIRE_ENGINE 3U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t engine_count;
|
||||
uint8_t pad[3];
|
||||
};
|
||||
|
||||
/** Release all acquired PVE systems. It is legal to release an engine while it
|
||||
* is still running. The released engine won’t be available to be acquired until
|
||||
* it finishes and becomes idle again. */
|
||||
struct pva_cmd_release_engine {
|
||||
#define PVA_CMD_OPCODE_RELEASE_ENGINE 4U
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
/** Set a PVE engine as current. Following commands will modify this engine. The
|
||||
* zero-based engine index must be less than the acquired engine number. */
|
||||
struct pva_cmd_set_current_engine {
|
||||
#define PVA_CMD_OPCODE_SET_CURRENT_ENGINE 5U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t engine_index;
|
||||
uint8_t pad[3];
|
||||
};
|
||||
|
||||
/** This command specifies the executable to use for the following VPU launches.
|
||||
* It doesn’t do anything other than setting the context for the following
|
||||
* commands.
|
||||
*
|
||||
* Note: This command cannot be initiated if any of the DMA sets (that access
|
||||
* VMEM) are in a running state, in order to prevent mismatches between DMA sets
|
||||
* and VPU executables. The command buffer will stall until these DMA sets are
|
||||
* finished. */
|
||||
struct pva_cmd_set_vpu_executable {
|
||||
#define PVA_CMD_OPCODE_SET_VPU_EXECUTABLE 6U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t vpu_exec_resource_id;
|
||||
};
|
||||
|
||||
/** This command clears the entire VMEM. User may choose to skip VMEM clear if
|
||||
* there are no bss sections in the VPU executable. Since VMEM can be accessed
|
||||
* by both VPU and PPE, this command drives both the VPU state machine and the
|
||||
* PPE state machine. As a result, it can only be started if both VPU state
|
||||
* machine and PPE state machine are in valid states (Idle or Binded). */
|
||||
struct pva_cmd_clear_vmem {
|
||||
#define PVA_CMD_OPCODE_CLEAR_VMEM 7U
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
/** This command configures VPU hardware. Specifically, it configures code
|
||||
* segment register and copies data sections. */
|
||||
struct pva_cmd_init_vpu_executable {
|
||||
#define PVA_CMD_OPCODE_INIT_VPU_EXECUTABLE 8U
|
||||
struct pva_cmd_header header;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
};
|
||||
|
||||
/** Start VPU instruction prefetch from specified entry point. Currently, the
|
||||
* entry point index must be 0. More entry points will be supported in the
|
||||
* future. Note that this command merely triggers the prefetch but does not wait
|
||||
* for the prefetch to complete. Therefore, this command is synchronous. */
|
||||
struct pva_cmd_prefetch_vpu_code {
|
||||
#define PVA_CMD_OPCODE_PREFETCH_VPU_CODE 9U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t entry_point_index;
|
||||
};
|
||||
|
||||
/** Run the VPU program from the specified entry point until it finishes. The
|
||||
* lifetime of this command covers the entire VPU program execution. Since this
|
||||
* command is asynchronous, it doesn’t block the following commands from
|
||||
* execution. */
|
||||
struct pva_cmd_run_vpu {
|
||||
#define PVA_CMD_OPCODE_RUN_VPU 10U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t entry_point_index;
|
||||
};
|
||||
|
||||
/** Copy data from opaque payload to a VPU variable. Firmware may choose to copy
|
||||
* with R5 or DMA. If using DMA, channel 0 will be used. */
|
||||
struct pva_cmd_set_vpu_parameter {
|
||||
#define PVA_CMD_OPCODE_SET_VPU_PARAMETER 11U
|
||||
struct pva_cmd_header header;
|
||||
uint16_t data_size;
|
||||
uint16_t pad;
|
||||
uint32_t symbol_id;
|
||||
uint32_t vmem_offset;
|
||||
/* Followed by <data_size> number of bytes, padded to 4 bytes */
|
||||
};
|
||||
|
||||
/** Copy data from a DRAM buffer to a VPU variable. DMA will be used to perform
|
||||
* the copy. The user can optionally provide a user channel, a descriptor and
|
||||
* ADBs to speed up the copy. */
|
||||
struct pva_cmd_set_vpu_parameter_with_buffer {
|
||||
#define PVA_CMD_OPCODE_SET_VPU_PARAMETER_WITH_BUFFER 12U
|
||||
struct pva_cmd_header header;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
uint8_t src_dram_offset_hi;
|
||||
uint8_t pad[3];
|
||||
uint32_t data_size;
|
||||
uint32_t dst_symbol_id;
|
||||
uint32_t dst_vmem_offset;
|
||||
uint32_t src_dram_resource_id;
|
||||
uint32_t src_dram_offset_lo;
|
||||
};
|
||||
|
||||
/** For set_vpu_parameter_with_address command, set this flag in header.flags to
|
||||
* indicate that the target symbol is the legacy pointer symbol type:
|
||||
* pva_fw_vpu_legacy_ptr_symbol, which only supports 32bit offset and 32bit
|
||||
* size. */
|
||||
#define PVA_CMD_FLAGS_USE_LEGACY_POINTER 0x1
|
||||
/** Copy the address of a DRAM buffer to a VPU variable. The variable must be
|
||||
* laid out exactly according to pva_fw_vpu_ptr_symbol
|
||||
*/
|
||||
struct pva_cmd_set_vpu_parameter_with_address {
|
||||
#define PVA_CMD_OPCODE_SET_VPU_PARAMETER_WITH_ADDRESS 13U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dram_offset_hi;
|
||||
uint8_t pad[3];
|
||||
uint32_t symbol_id;
|
||||
uint32_t dram_resource_id;
|
||||
uint32_t dram_offset_lo;
|
||||
};
|
||||
|
||||
#define PVA_MAX_DMA_SETS_PER_DMA_ENGINE 4
|
||||
#define PVA_DMA_CONFIG_FETCH_BUFFER_PER_DMA_ENGINE 1
|
||||
|
||||
/** This command first acquires the TCM scratch and then fetches DMA configuration
|
||||
* into the scratch. The command does not modify DMA
|
||||
* hardware, allowing FW to continue using user channels for data transfer after
|
||||
* its execution. This command only uses channel 0 to fetch the DMA
|
||||
* configuration. However, user can still help speed up the process by
|
||||
* providing additional ADBs. This command will block if there’s no TCM scratch
|
||||
 * available. If there are no pending commands AND there is no TCM scratch, then
|
||||
 * a deadlock has occurred and the command buffer will be aborted. */
|
||||
struct pva_cmd_fetch_dma_configuration {
|
||||
#define PVA_CMD_OPCODE_FETCH_DMA_CONFIGURATION 14U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dma_set_id;
|
||||
uint8_t pad[3];
|
||||
uint32_t resource_id;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
};
|
||||
|
||||
/** Setup DMA hardware registers using previously fetched DMA configuration. FW
|
||||
* uses channel 0 to copy DMA descriptors into descriptor RAM. The user can
|
||||
* provide additional ADBs to speed up the process. The command will block until
|
||||
* the needed channels, descriptors and hwseq words are acquired. The command must
|
||||
 * also validate that all source and destination fields of each DMA descriptor
|
||||
 * being programmed are bound to a resource.
|
||||
*/
|
||||
struct pva_cmd_setup_dma {
|
||||
#define PVA_CMD_OPCODE_SETUP_DMA 15U
|
||||
struct pva_cmd_header header;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
uint8_t dma_set_id;
|
||||
uint8_t pad[3];
|
||||
};
|
||||
|
||||
/** Run DMA channels according to the current DMA configuration until they are
|
||||
* finished. The lifetime of this command covers the entire DMA transfer. The
|
||||
* command shall block until the needed VDBs/ADBs and triggers (GPIOs) are
|
||||
* acquired.
|
||||
|
||||
* @note This command checks that the DMA set to be started is indeed paired
|
||||
* with the currently bound VPU executable. If not, this constitutes a
|
||||
* programming error, and the command buffer will be aborted. */
|
||||
struct pva_cmd_run_dma {
|
||||
#define PVA_CMD_OPCODE_RUN_DMA 16U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dma_set_id;
|
||||
uint8_t pad[3];
|
||||
};
|
||||
|
||||
/** This command specifies the executable to use for the following PPE launches.
|
||||
* It doesn’t do anything other than setting the context for the following
|
||||
* commands. */
|
||||
struct pva_cmd_set_ppe_executable {
|
||||
#define PVA_CMD_OPCODE_SET_PPE_EXECUTABLE 17U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t ppe_exec_resource_id;
|
||||
};
|
||||
|
||||
/** Start PPE instruction prefetch from specified entry point. Currently, the
|
||||
* entry point index must be 0. Note that this command merely triggers the
|
||||
* prefetch but does not wait for the prefetch to complete. Therefore, this
|
||||
* command is synchronous. */
|
||||
struct pva_cmd_prefetch_ppe_code {
|
||||
#define PVA_CMD_OPCODE_PREFETCH_PPE_CODE 18U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t entry_point_index;
|
||||
};
|
||||
|
||||
/** Setup PPE code segment and data sections. */
|
||||
struct pva_cmd_init_ppe_executable {
|
||||
#define PVA_CMD_OPCODE_INIT_PPE_EXECUTABLE 19U
|
||||
struct pva_cmd_header header;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
};
|
||||
|
||||
/** Run the PPE program until it finishes. The lifetime of this command covers the
|
||||
* entire PPE program execution. */
|
||||
struct pva_cmd_run_ppe {
|
||||
#define PVA_CMD_OPCODE_RUN_PPE 20U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t entry_point_index;
|
||||
};
|
||||
|
||||
#define PVA_BARRIER_GROUP_0 0U
|
||||
#define PVA_BARRIER_GROUP_1 1U
|
||||
#define PVA_BARRIER_GROUP_2 2U
|
||||
#define PVA_BARRIER_GROUP_3 3U
|
||||
#define PVA_BARRIER_GROUP_4 4U
|
||||
#define PVA_BARRIER_GROUP_5 5U
|
||||
#define PVA_BARRIER_GROUP_6 6U
|
||||
#define PVA_BARRIER_GROUP_7 7U
|
||||
|
||||
#define PVA_MAX_BARRIER_GROUPS 8U
|
||||
|
||||
#define PVA_BARRIER_GROUP_INVALID 0xFFU
|
||||
|
||||
/**
|
||||
* @brief Captures a timestamp to DRAM
|
||||
*
|
||||
 * This command allows you to capture a timestamp using one of four modes:
|
||||
*
|
||||
* - **IMMEDIATE_MODE**: Captures the timestamp immediately.
|
||||
* - **VPU START MODE**: Enqueue a timestamp to be captured the next time the
|
||||
* current VPU starts. Up to 8 VPU start timestamps may be active at a time
|
||||
* for a given engine.
|
||||
* - **VPU DONE MODE**: Enqueue a timestamp to be captured the next time the
|
||||
* current VPU enters done state. Up to 8 VPU done timestamps may be active at
|
||||
* a time for a given engine.
|
||||
* - **DEFER MODE**: Defers the timestamp capture by specifying a barrier group.
|
||||
* The timestamp will be captured once the commands in the specified barrier
|
||||
* group have completed. Each barrier group allows one timestamp to be active
|
||||
* at a time.
|
||||
*
|
||||
* The timestamp will be available in DRAM after waiting on any postfence.
|
||||
*
|
||||
* @note This command is asynchronous, ensuring it does not block the next command.
|
||||
*/
|
||||
struct pva_cmd_capture_timestamp {
|
||||
#define PVA_CMD_OPCODE_CAPTURE_TIMESTAMP 21U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t offset_hi;
|
||||
uint8_t defer_barrier_group;
|
||||
#define PVA_CMD_CAPTURE_MODE_IMMEDIATE 0U
|
||||
#define PVA_CMD_CAPTURE_MODE_VPU_START 1U
|
||||
#define PVA_CMD_CAPTURE_MODE_VPU_DONE 2U
|
||||
#define PVA_CMD_CAPTURE_MODE_DEFER 3U
|
||||
uint8_t capture_mode;
|
||||
uint8_t pad;
|
||||
uint32_t resource_id;
|
||||
uint32_t offset_lo;
|
||||
};
|
||||
|
||||
/** Set the address of the status buffer. FW will output detailed command buffer
|
||||
* status in case of command buffer abort. */
|
||||
struct pva_cmd_request_status {
|
||||
#define PVA_CMD_OPCODE_CAPTURE_STATUS 22U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t offset_hi;
|
||||
uint8_t pad[3];
|
||||
uint32_t resource_id;
|
||||
uint32_t offset_lo;
|
||||
};
|
||||
|
||||
/** Blocks until L2SRAM is available. To prevent deadlock with other command
|
||||
 * buffers, L2SRAM must be acquired prior to acquiring any engine. It will be
|
||||
* automatically freed when this command buffer finishes. If persistence is
|
||||
* required, it must be saved to DRAM. One command buffer may only hold one
|
||||
* L2SRAM allocation at a time. */
|
||||
struct pva_cmd_bind_l2sram {
|
||||
#define PVA_CMD_OPCODE_BIND_L2SRAM 23U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dram_offset_hi;
|
||||
#define FILL_ON_MISS (1U << 0U)
|
||||
#define FLUSH_ON_EVICTION (1U << 1U)
|
||||
uint8_t access_policy;
|
||||
uint8_t pad[2];
|
||||
uint32_t dram_resource_id;
|
||||
uint32_t dram_offset_lo;
|
||||
uint32_t l2sram_size;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
};
|
||||
|
||||
/** Free previously allocated L2SRAM. This command is asynchronous because it
|
||||
* needs to wait for all commands that are started before it to complete. */
|
||||
struct pva_cmd_release_l2sram {
|
||||
#define PVA_CMD_OPCODE_RELEASE_L2SRAM 24U
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
/*
|
||||
* This command writes data to a DRAM region. The DRAM region is described
|
||||
* by resource ID, offset and size fields. The data to be written is placed
|
||||
* right after the command struct. For this command to successfully execute,
|
||||
* the following conditions must be met:
|
||||
* 1. 'resource_id' should point to a valid resource in DRAM.
|
||||
 * 2. the offset and size fields should add up to be less than or equal to the size of the DRAM resource.
|
||||
*/
|
||||
struct pva_cmd_write_dram {
|
||||
#define PVA_CMD_OPCODE_WRITE_DRAM 25U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t offset_hi;
|
||||
uint8_t pad;
|
||||
uint16_t write_size;
|
||||
uint32_t resource_id;
|
||||
uint32_t offset_lo;
|
||||
/* Followed by write_size bytes, padded to 4 bytes boundary */
|
||||
};
|
||||
|
||||
/** Set this bit according to @ref pva_surface_format to indicate whether the surface format is
|
||||
* block linear or pitch linear.
|
||||
*
|
||||
* For block linear surfaces, the starting address for a descriptor is:
|
||||
* IOVA_OF(resource_id) + surface_base_offset + PL2BL(slot_offset + desc_offset).
|
||||
*
|
||||
* For pitch linear surfaces, the starting address for a descriptor is:
|
||||
* IOVA_OF(resource_id) + surface_base_offset + slot_offset + desc_offset
|
||||
*/
|
||||
#define PVA_CMD_FLAGS_SURFACE_FORMAT_MSB 0U
|
||||
#define PVA_CMD_FLAGS_SURFACE_FORMAT_LSB 0U
|
||||
/** MSB of log2 block height in flags field of the command header */
|
||||
#define PVA_CMD_FLAGS_LOG2_BLOCK_HEIGHT_MSB 3U
|
||||
/** LSB of log2 block height in flags field of the command header */
|
||||
#define PVA_CMD_FLAGS_LOG2_BLOCK_HEIGHT_LSB 1U
|
||||
/** Bind a DRAM surface to a slot. The surface can be block linear or pitch
|
||||
* linear. */
|
||||
struct pva_cmd_bind_dram_slot {
|
||||
#define PVA_CMD_OPCODE_BIND_DRAM_SLOT 26U
|
||||
/** flags field will contain block linear flag and block height */
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dma_set_id; /**< ID of the DMA set */
|
||||
uint8_t slot_offset_hi;
|
||||
uint8_t surface_base_offset_hi;
|
||||
uint8_t slot_id; /**< ID of slot to bind */
|
||||
uint32_t resource_id; /**< Resource ID of the DRAM allocation for the surface */
|
||||
uint32_t slot_offset_lo; /**< Per-slot offset in pitch linear domain, from slot base to surface base */
|
||||
uint32_t surface_base_offset_lo; /**< Surface base offset in bytes, from surface base to allocation base */
|
||||
};
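/* Illustrative sketch (not part of the original header): packing the
 * surface-format bit and log2 block height into the header flags field of a
 * bind_dram_slot command, using the MSB/LSB positions defined above. That the
 * surface-format bit set means block linear is an assumption made only for
 * this example; bool availability is also assumed.
 */
static inline uint8_t pva_example_bind_dram_flags(bool block_linear,
						  uint8_t log2_block_height)
{
	/* 3-bit field: bits LOG2_BLOCK_HEIGHT_MSB..LSB of the flags byte. */
	uint8_t height_mask =
		(uint8_t)((1U << (PVA_CMD_FLAGS_LOG2_BLOCK_HEIGHT_MSB -
				  PVA_CMD_FLAGS_LOG2_BLOCK_HEIGHT_LSB + 1U)) - 1U);
	uint8_t flags = 0U;

	if (block_linear)
		flags |= (uint8_t)(1U << PVA_CMD_FLAGS_SURFACE_FORMAT_LSB);

	flags |= (uint8_t)((log2_block_height & height_mask)
			   << PVA_CMD_FLAGS_LOG2_BLOCK_HEIGHT_LSB);

	return flags;
}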
|
||||
|
||||
struct pva_cmd_bind_vmem_slot {
|
||||
#define PVA_CMD_OPCODE_BIND_VMEM_SLOT 27U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dma_set_id;
|
||||
uint8_t slot_id;
|
||||
uint8_t pad[2];
|
||||
uint32_t symbol_id;
|
||||
uint32_t offset;
|
||||
};
|
||||
|
||||
/** @brief Unregisters a resource.
|
||||
*
|
||||
* This command immediately removes the specified resource from the resource
|
||||
* table upon execution. However, FW does not immediately notify KMD to
|
||||
* deallocate the resource as it may still be in use by other concurrently
|
||||
* running command buffers in the same context.
|
||||
*
|
||||
* The FW takes note of the currently running command buffers and notifies the
|
||||
* KMD to deallocate the resource once these command buffers have completed
|
||||
* their execution.
|
||||
*
|
||||
* @note If a command buffer in the same context either hangs or executes for an
|
||||
* extended period, no resources can be effectively freed, potentially leading
|
||||
* to resource exhaustion.
|
||||
*/
|
||||
struct pva_cmd_unregister_resource {
|
||||
#define PVA_CMD_OPCODE_UNREGISTER_RESOURCE 28U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t resource_id;
|
||||
};
|
||||
|
||||
/** Write instance parameter to a VMEM symbol. */
|
||||
struct pva_cmd_set_vpu_instance_parameter {
|
||||
#define PVA_CMD_OPCODE_SET_VPU_INSTANCE_PARAMETER 29U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t symbol_id;
|
||||
};
|
||||
|
||||
struct pva_cmd_run_unit_tests {
|
||||
#define PVA_CMD_OPCODE_RUN_UNIT_TESTS 30U
|
||||
struct pva_cmd_header header;
|
||||
#define PVA_FW_UTESTS_MAX_ARGC 16U
|
||||
uint8_t argc;
|
||||
uint8_t pad[3];
|
||||
uint32_t in_resource_id;
|
||||
uint32_t in_offset;
|
||||
uint32_t in_size;
|
||||
uint32_t out_resource_id;
|
||||
uint32_t out_offset;
|
||||
uint32_t out_size;
|
||||
};
|
||||
|
||||
struct pva_cmd_set_vpu_print_cb {
|
||||
#define PVA_CMD_OPCODE_SET_VPU_PRINT_CB 31U
|
||||
struct pva_cmd_header header;
|
||||
uint32_t cb_resource_id;
|
||||
uint32_t cb_offset;
|
||||
};
|
||||
|
||||
struct pva_cmd_invalidate_l2sram {
|
||||
#define PVA_CMD_OPCODE_INVALIDATE_L2SRAM 32U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dram_offset_hi;
|
||||
uint8_t pad[3];
|
||||
uint32_t dram_resource_id;
|
||||
uint32_t dram_offset_lo;
|
||||
uint32_t l2sram_size;
|
||||
};
|
||||
|
||||
struct pva_cmd_flush_l2sram {
|
||||
#define PVA_CMD_OPCODE_FLUSH_L2SRAM 33U
|
||||
struct pva_cmd_header header;
|
||||
struct pva_user_dma_allowance user_dma;
|
||||
};
|
||||
|
||||
struct pva_cmd_err_inject {
|
||||
#define PVA_CMD_OPCODE_ERR_INJECT 34U
|
||||
struct pva_cmd_header header;
|
||||
enum pva_error_inject_codes err_inject_code;
|
||||
};
|
||||
|
||||
struct pva_cmd_patch_l2sram_offset {
|
||||
#define PVA_CMD_OPCODE_PATCH_L2SRAM_OFFSET 35U
|
||||
struct pva_cmd_header header;
|
||||
uint8_t dma_set_id;
|
||||
uint8_t slot_id;
|
||||
uint8_t pad[2];
|
||||
uint32_t offset;
|
||||
};
|
||||
|
||||
/** After retiring a barrier group, all future commands which refer to that barrier group id will be
|
||||
* mapped to a new logical barrier group. This allows re-using barrier ids within a command buffer.
|
||||
*/
|
||||
struct pva_cmd_retire_barrier_group {
|
||||
#define PVA_CMD_OPCODE_RETIRE_BARRIER_GROUP 36U
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
#define PVA_CMD_OPCODE_COUNT 37U
|
||||
|
||||
struct pva_cmd_init_resource_table {
|
||||
#define PVA_CMD_OPCODE_INIT_RESOURCE_TABLE (0U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
/**< Resource table id is from 0 to 7, 0 is the device's resource table,
|
||||
* 1-7 are users'. */
|
||||
uint8_t resource_table_id;
|
||||
uint8_t resource_table_addr_hi;
|
||||
uint8_t pad[2];
|
||||
uint32_t resource_table_addr_lo;
|
||||
uint32_t max_n_entries;
|
||||
};
|
||||
|
||||
struct pva_cmd_deinit_resource_table {
|
||||
#define PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE (1U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
uint8_t resource_table_id;
|
||||
uint8_t pad[3];
|
||||
};
|
||||
|
||||
struct pva_cmd_update_resource_table {
|
||||
#define PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE (2U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
uint8_t resource_table_id;
|
||||
uint8_t pad[3];
|
||||
uint32_t resource_id;
|
||||
struct pva_resource_entry entry;
|
||||
};
|
||||
|
||||
struct pva_cmd_init_queue {
|
||||
#define PVA_CMD_OPCODE_INIT_QUEUE (3U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
uint8_t ccq_id;
|
||||
uint8_t queue_id;
|
||||
uint8_t queue_addr_hi;
|
||||
uint8_t pad;
|
||||
uint32_t queue_addr_lo;
|
||||
uint32_t max_n_submits;
|
||||
};
|
||||
|
||||
struct pva_cmd_deinit_queue {
|
||||
#define PVA_CMD_OPCODE_DEINIT_QUEUE (4U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
uint8_t ccq_id;
|
||||
uint8_t queue_id;
|
||||
uint8_t pad[2];
|
||||
};
|
||||
|
||||
struct pva_cmd_enable_fw_profiling {
|
||||
#define PVA_CMD_OPCODE_ENABLE_FW_PROFILING (5U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
uint8_t buffer_offset_hi;
|
||||
uint8_t timestamp_type;
|
||||
uint8_t pad[2];
|
||||
uint32_t buffer_resource_id;
|
||||
uint32_t buffer_size;
|
||||
uint32_t buffer_offset_lo;
|
||||
uint32_t filter;
|
||||
};
|
||||
|
||||
struct pva_cmd_disable_fw_profiling {
|
||||
#define PVA_CMD_OPCODE_DISABLE_FW_PROFILING (6U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
struct pva_cmd_get_tegra_stats {
|
||||
#define PVA_CMD_OPCODE_GET_TEGRA_STATS (7U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
uint8_t buffer_offset_hi;
|
||||
bool enabled;
|
||||
uint8_t pad[2];
|
||||
uint32_t buffer_resource_id;
|
||||
uint32_t buffer_size;
|
||||
uint32_t buffer_offset_lo;
|
||||
};
|
||||
|
||||
struct pva_cmd_suspend_fw {
|
||||
#define PVA_CMD_OPCODE_SUSPEND_FW (8U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
struct pva_cmd_resume_fw {
|
||||
#define PVA_CMD_OPCODE_RESUME_FW (9U | PVA_CMD_PRIV_OPCODE_FLAG)
|
||||
struct pva_cmd_header header;
|
||||
};
|
||||
|
||||
#define PVA_CMD_PRIV_OPCODE_COUNT 10U
|
||||
|
||||
#define PVA_MAX_CMDBUF_CHUNK_LEN 1024
|
||||
#define PVA_MAX_CMDBUF_CHUNK_SIZE (sizeof(uint32_t) * PVA_MAX_CMDBUF_CHUNK_LEN)
|
||||
|
||||
#endif // PVA_API_CMDBUF_H
|
||||
222
drivers/video/tegra/host/pva/src/include/pva_api_cuda.h
Normal file
@@ -0,0 +1,222 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_API_CUDA_H
|
||||
#define PVA_API_CUDA_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "cuda.h"
|
||||
#include "pva_api_types.h"
|
||||
|
||||
/**
|
||||
* @brief Structure for cuExtend queue data needed for command submission.
|
||||
*/
|
||||
struct pva_cuextend_queue_data {
|
||||
/*! Holds a pointer to pva queue object */
|
||||
struct pva_queue *queue;
|
||||
/*! Holds engine affinity for command submission*/
|
||||
uint32_t affinity;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend register memory callback
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[in] mem The pointer to a \ref pva_memory object. This register memory callback shall transfer the
|
||||
 * ownership of the memory to the client, and it is the client's responsibility to release the memory.
|
||||
* @param[in] cuda_ptr CUDA device pointer.
|
||||
* @param[in] cached_flags The cached flags for the memory.
|
||||
* @return \ref pva_error The completion status of register memory operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_memory_register)(void *callback_args,
|
||||
struct pva_memory *mem,
|
||||
void *cuda_ptr,
|
||||
uint32_t cached_flags);
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend unregister memory callback.
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[in] cuda_ptr CUDA device pointer.
|
||||
* @return \ref pva_error The completion status of unregister memory operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_memory_unregister)(void *callback_args,
|
||||
void *cuda_ptr);
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend register stream callback.
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[out] stream_payload Client data associated with a CUDA stream.
|
||||
* @param[in] flags Reserved for future use. Must be set to 0.
|
||||
* @return \ref pva_error The completion status of register stream operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_stream_register)(void *callback_args,
|
||||
void **stream_payload,
|
||||
uint64_t flags);
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend unregister stream callback.
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
|
||||
* @param[in] flags Reserved for future use. Must be set to 0.
|
||||
* @return \ref pva_error The completion status of unregister stream operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_stream_unregister)(void *callback_args,
|
||||
void *stream_payload,
|
||||
uint64_t flags);
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend acquire queue callback.
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
|
||||
* @param[out] queue_data Output pointer to a pva_cuextend_queue_data object.
|
||||
* @return \ref pva_error The completion status of acquire queue operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_queue_acquire)(
|
||||
void *callback_args, void *stream_payload,
|
||||
struct pva_cuextend_queue_data **queue_data);
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend release queue callback.
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[in] stream_payload Client data returned by \ref pva_cuextend_stream_register.
* @param[in] queue_data Pointer to the pva_cuextend_queue_data object returned by \ref pva_cuextend_queue_acquire.
* @return \ref pva_error The completion status of the release queue operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_queue_release)(void *callback_args,
|
||||
void *stream_payload,
|
||||
void *queue_data);
|
||||
|
||||
/**
|
||||
* @brief Function type for retrieving error code from cuExtend.
|
||||
*
|
||||
* @param[in] teardown_ctx Pointer to the cuExtend teardown context.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_get_error)(void *teardown_ctx);
|
||||
|
||||
/**
|
||||
* @brief Function type for cuExtend teardown callback.
|
||||
*
|
||||
* The client is expected to take the following actions in this callback:
* block waiting for all pending tasks on all queues, periodically checking for a CUDA error inside the
* wait loop by calling \ref pva_cuextend_get_error, and break out of the loop if an error is reported.
|
||||
*
|
||||
* @param[in] callback_args Pointer to the callback arguments provided by client during cuExtend initialization.
|
||||
* @param[in] teardown_ctx Pointer to a teardown context passed by cuExtend teardown callback.
|
||||
* @param[in] get_error Function pointer to get CUDA error function.
|
||||
* @return \ref pva_error The completion status of the teardown operation.
|
||||
*/
|
||||
typedef enum pva_error (*pva_cuextend_teardown)(
|
||||
void *callback_args, void *teardown_ctx,
|
||||
pva_cuextend_get_error get_error);
|
||||
|
||||
/**
|
||||
* @brief Structure for cuExtend callbacks provided by the caller during cuExtend initialization.
|
||||
*/
|
||||
struct pva_cuextend_callbacks {
|
||||
/*! Holds the register memory callback */
|
||||
pva_cuextend_memory_register mem_reg;
|
||||
/*! Holds the unregister memory callback */
|
||||
pva_cuextend_memory_unregister mem_unreg;
|
||||
/*! Holds the register stream callback */
|
||||
pva_cuextend_stream_register stream_reg;
|
||||
/*! Holds the unregister stream callback */
|
||||
pva_cuextend_stream_unregister stream_unreg;
|
||||
/*! Holds the acquire queue callback */
|
||||
pva_cuextend_queue_acquire queue_acquire;
|
||||
/*! Holds the release queue callback */
|
||||
pva_cuextend_queue_release queue_release;
|
||||
/*! Holds the teardown callback */
|
||||
pva_cuextend_teardown teardown;
|
||||
/*! Pointer to the callback arguments provided by client during cuExtend initialization */
|
||||
void *args;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Initialize cuExtend context.
|
||||
*
|
||||
* This function must be called before any other cuExtend functions. It does the following:
|
||||
*
|
||||
* 1. Loads the cuExtend library and retrieves function pointers to the library's exported functions.
* 2. Adds PVA to the CUDA unified context model.
* 3. Initializes the opaque cuExtend impl pointer.
|
||||
*
|
||||
* @param[in] ctx Pointer to a PVA context object.
|
||||
* @param[in] callbacks Pointer to CUDA interop callbacks.
|
||||
* @return \ref pva_error The completion status of the initialization operation.
|
||||
*/
|
||||
enum pva_error pva_cuextend_init(struct pva_context *ctx,
|
||||
struct pva_cuextend_callbacks *callbacks);
|
||||
|
||||
/**
|
||||
* @brief De-initialize cuExtend context.
|
||||
*
|
||||
* This function must be called from the client's context destructor. It does the following:
*
* 1. Clears the opaque cuExtend impl pointer in the PVA context object.
* 2. Removes PVA from the cuExtend context.
* 3. Unloads the cuExtend library and clears all the function pointers.
|
||||
*
|
||||
* @param[in] ctx Pointer to a PVA context object.
|
||||
* @return \ref pva_error The completion status of the de-initialization operation.
|
||||
*/
|
||||
enum pva_error pva_cuextend_deinit(struct pva_context *ctx);
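
/*
 * Usage sketch (illustrative, not part of this header): a client fills a
 * struct pva_cuextend_callbacks with its own callback implementations before
 * calling pva_cuextend_init(), and tears the context down with
 * pva_cuextend_deinit(). The helper below is hypothetical; the callbacks
 * structure is assumed to be fully populated by the caller.
 */
static inline enum pva_error
pva_example_cuextend_session(struct pva_context *ctx,
			     struct pva_cuextend_callbacks *cbs)
{
	/* cbs->args is echoed back to every callback as callback_args. */
	enum pva_error err = pva_cuextend_init(ctx, cbs);

	if (err != PVA_SUCCESS)
		return err;

	/* ... import memory, register streams, submit command buffers ... */

	return pva_cuextend_deinit(ctx);
}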
|
||||
|
||||
/**
|
||||
* @brief Import a memory region from a CUDA context into a PVA context.
|
||||
*
|
||||
* @param[in] ctx Pointer to a PVA context structure.
|
||||
* @param[in] cuda_ptr Pointer to CUDA memory provided by client.
|
||||
* @param[in] size Size of the memory region.
|
||||
* @param[in] access_mode Access mode flag provided by the client.
|
||||
* @param[out] out_mem Pointer to the imported memory object.
|
||||
* @param[out] cached_flags Output cached flags for the memory.
|
||||
* @return \ref pva_error The completion status of the import operation.
|
||||
*/
|
||||
enum pva_error pva_cuextend_memory_import(struct pva_context *ctx,
|
||||
void *cuda_ptr, uint64_t size,
|
||||
uint32_t access_mode,
|
||||
struct pva_memory **out_mem,
|
||||
uint32_t *cached_flags);
|
||||
|
||||
/**
|
||||
* @brief Submit a batch of command buffers via a CUDA stream.
|
||||
*
|
||||
* @param[in] queue Pointer to the queue. If queue is not NULL, this API will try to submit the client tasks to this queue directly.
|
||||
* Otherwise, it will call queue_acquire callback to query a pva_queue object from stream payload, and then submit
|
||||
* the tasks to the queried queue.
|
||||
* @param[in] stream A CUDA stream.
|
||||
* @param[in] submit_infos Array of submit info structures.
|
||||
* @param[in] count Number of submit info structures.
|
||||
* @param[in] timeout_ms Timeout in milliseconds. PVA_TIMEOUT_INF for infinite.
|
||||
* @return \ref pva_error The completion status of the submit operation.
|
||||
*
|
||||
* @note Concurrent submission to the same queue needs to be serialized by the
|
||||
* caller.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_cuextend_cmdbuf_batch_submit(struct pva_queue *queue, CUstream stream,
|
||||
struct pva_cmdbuf_submit_info *submit_infos,
|
||||
uint32_t count, uint64_t timeout_ms);
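
/*
 * Usage sketch (illustrative, not part of this header): submitting a single
 * command buffer through a CUDA stream. The first-chunk fields describe where
 * the client built the command buffer; the parameter values here are
 * placeholders supplied by the caller.
 */
static inline enum pva_error
pva_example_cuextend_submit_one(struct pva_queue *queue, CUstream stream,
				uint32_t chunk_resource_id,
				uint64_t chunk_offset, uint16_t chunk_size)
{
	struct pva_cmdbuf_submit_info info = { 0 };

	info.engine_affinity = PVA_ENGINE_AFFINITY_ANY;
	info.execution_timeout_ms = PVA_EXEC_TIMEOUT_INF;
	info.first_chunk_resource_id = chunk_resource_id;
	info.first_chunk_offset = chunk_offset;
	info.first_chunk_size = chunk_size;

	/* Passing queue == NULL would instead make cuExtend resolve the queue
	 * from the stream payload via the queue_acquire callback. */
	return pva_cuextend_cmdbuf_batch_submit(queue, stream, &info, 1U,
						PVA_TIMEOUT_INF);
}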
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // PVA_API_CUDA_H
|
||||
343
drivers/video/tegra/host/pva/src/include/pva_api_dma.h
Normal file
@@ -0,0 +1,343 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_API_DMA_H
|
||||
#define PVA_API_DMA_H
|
||||
#include "pva_api_types.h"
|
||||
|
||||
/** Bit indices for VPU GPIO triggers */
|
||||
enum pva_gpio_bit {
|
||||
GPIO_VPU_CFG_BIT = 4U,
|
||||
GPIO_READ0_BIT = 16U,
|
||||
GPIO_READ1_BIT = 17U,
|
||||
GPIO_READ2_BIT = 18U,
|
||||
GPIO_READ3_BIT = 19U,
|
||||
GPIO_READ4_BIT = 20U,
|
||||
GPIO_READ5_BIT = 21U,
|
||||
GPIO_READ6_BIT = 22U,
|
||||
GPIO_WRITE0_BIT = 23U,
|
||||
GPIO_WRITE1_BIT = 24U,
|
||||
GPIO_WRITE2_BIT = 25U,
|
||||
GPIO_WRITE3_BIT = 26U,
|
||||
GPIO_WRITE4_BIT = 27U,
|
||||
GPIO_WRITE5_BIT = 28U,
|
||||
GPIO_WRITE6_BIT = 29U
|
||||
};
|
||||
|
||||
enum pva_dma_descriptor_id {
|
||||
PVA_DMA_DESC_NONE = 0,
|
||||
PVA_DMA_DESC0 = 1,
|
||||
PVA_DMA_DESC1 = 2,
|
||||
PVA_DMA_DESC2 = 3,
|
||||
PVA_DMA_DESC3 = 4,
|
||||
PVA_DMA_DESC4 = 5,
|
||||
PVA_DMA_DESC5 = 6,
|
||||
PVA_DMA_DESC6 = 7,
|
||||
PVA_DMA_DESC7 = 8,
|
||||
PVA_DMA_DESC8 = 9,
|
||||
PVA_DMA_DESC9 = 10,
|
||||
PVA_DMA_DESC10 = 11,
|
||||
PVA_DMA_DESC11 = 12,
|
||||
PVA_DMA_DESC12 = 13,
|
||||
PVA_DMA_DESC13 = 14,
|
||||
PVA_DMA_DESC14 = 15,
|
||||
PVA_DMA_DESC15 = 16,
|
||||
PVA_DMA_DESC16 = 17,
|
||||
PVA_DMA_DESC17 = 18,
|
||||
PVA_DMA_DESC18 = 19,
|
||||
PVA_DMA_DESC19 = 20,
|
||||
PVA_DMA_DESC20 = 21,
|
||||
PVA_DMA_DESC21 = 22,
|
||||
PVA_DMA_DESC22 = 23,
|
||||
PVA_DMA_DESC23 = 24,
|
||||
PVA_DMA_DESC24 = 25,
|
||||
PVA_DMA_DESC25 = 26,
|
||||
PVA_DMA_DESC26 = 27,
|
||||
PVA_DMA_DESC27 = 28,
|
||||
PVA_DMA_DESC28 = 29,
|
||||
PVA_DMA_DESC29 = 30,
|
||||
PVA_DMA_DESC30 = 31,
|
||||
PVA_DMA_DESC31 = 32,
|
||||
PVA_DMA_DESC32 = 33,
|
||||
PVA_DMA_DESC33 = 34,
|
||||
PVA_DMA_DESC34 = 35,
|
||||
PVA_DMA_DESC35 = 36,
|
||||
PVA_DMA_DESC36 = 37,
|
||||
PVA_DMA_DESC37 = 38,
|
||||
PVA_DMA_DESC38 = 39,
|
||||
PVA_DMA_DESC39 = 40,
|
||||
PVA_DMA_DESC40 = 41,
|
||||
PVA_DMA_DESC41 = 42,
|
||||
PVA_DMA_DESC42 = 43,
|
||||
PVA_DMA_DESC43 = 44,
|
||||
PVA_DMA_DESC44 = 45,
|
||||
PVA_DMA_DESC45 = 46,
|
||||
PVA_DMA_DESC46 = 47,
|
||||
PVA_DMA_DESC47 = 48,
|
||||
PVA_DMA_DESC48 = 49,
|
||||
PVA_DMA_DESC49 = 50,
|
||||
PVA_DMA_DESC50 = 51,
|
||||
PVA_DMA_DESC51 = 52,
|
||||
PVA_DMA_DESC52 = 53,
|
||||
PVA_DMA_DESC53 = 54,
|
||||
PVA_DMA_DESC54 = 55,
|
||||
PVA_DMA_DESC55 = 56,
|
||||
PVA_DMA_DESC56 = 57,
|
||||
PVA_DMA_DESC57 = 58,
|
||||
PVA_DMA_DESC58 = 59,
|
||||
PVA_DMA_DESC59 = 60,
|
||||
PVA_DMA_DESC60 = 61,
|
||||
PVA_DMA_DESC61 = 62,
|
||||
PVA_DMA_DESC62 = 63,
|
||||
PVA_DMA_DESC63 = 64
|
||||
};
|
||||
|
||||
/**
|
||||
* The values of the enum members conform to the definitions of DMA descriptors'
|
||||
* trig_vpu_events field. Therefore, they can be assigned to trig_vpu_events
|
||||
* directly.
|
||||
*/
|
||||
enum pva_dma_trigger {
|
||||
PVA_DMA_NO_TRIG = 0,
|
||||
PVA_DMA_TRIG_READ0,
|
||||
PVA_DMA_TRIG_WRITE0,
|
||||
PVA_DMA_TRIG_VPU_CFG,
|
||||
PVA_DMA_TRIG_READ1,
|
||||
PVA_DMA_TRIG_WRITE1,
|
||||
PVA_DMA_TRIG_READ2,
|
||||
PVA_DMA_TRIG_WRITE2,
|
||||
PVA_DMA_TRIG_READ3,
|
||||
PVA_DMA_TRIG_WRITE3,
|
||||
PVA_DMA_TRIG_READ4,
|
||||
PVA_DMA_TRIG_WRITE4,
|
||||
PVA_DMA_TRIG_READ5,
|
||||
PVA_DMA_TRIG_WRITE5,
|
||||
PVA_DMA_TRIG_READ6,
|
||||
PVA_DMA_TRIG_WRITE6,
|
||||
PVA_DMA_TRIG_HWSEQ_RD,
|
||||
PVA_DMA_TRIG_HWSEQ_WR,
|
||||
};
|
||||
|
||||
enum pva_dma_trigger_mode {
|
||||
PVA_DMA_TRIG_MODE_DIS = 0,
|
||||
PVA_DMA_TRIG_MODE_4TH_DIM,
|
||||
PVA_DMA_TRIG_MODE_3RD_DIM,
|
||||
PVA_DMA_TRIG_MODE_TILE
|
||||
};
|
||||
|
||||
enum pva_dma_transfer_mode {
|
||||
PVA_DMA_TRANS_MODE_INVALID = 0,
|
||||
PVA_DMA_TRANS_MODE_DRAM = 1,
|
||||
PVA_DMA_TRANS_MODE_VMEM = 2,
|
||||
PVA_DMA_TRANS_MODE_L2SRAM = 3,
|
||||
PVA_DMA_TRANS_MODE_TCM = 4,
|
||||
/** MMIO is valid as dst in VPU config mode only */
|
||||
PVA_DMA_TRANS_MODE_MMIO = 5,
|
||||
PVA_DMA_TRANS_MODE_RSVD = 5,
|
||||
/** VPU config mode, valid for src only */
|
||||
PVA_DMA_TRANS_MODE_VPUCFG = 7
|
||||
};
|
||||
|
||||
struct pva_dma_transfer_attr {
|
||||
uint8_t rpt1;
|
||||
uint8_t rpt2;
|
||||
uint8_t rpt3;
|
||||
uint8_t cb_enable;
|
||||
uint8_t transfer_mode;
|
||||
/** When dynamic slot flag is set, it means the memory location will be
|
||||
* relocated by commands.
|
||||
*/
|
||||
#define PVA_DMA_DYNAMIC_SLOT (1 << 15)
|
||||
#define PVA_DMA_STATIC_SLOT (1 << 14)
|
||||
#define PVA_DMA_SLOT_INVALID 0
|
||||
#define PVA_DMA_SLOT_ID_MASK 0xFF
|
||||
#define PVA_DMA_MAX_NUM_SLOTS 256
|
||||
uint16_t slot;
|
||||
/** Line pitch in pixels */
|
||||
uint16_t line_pitch;
|
||||
uint32_t cb_start;
|
||||
uint32_t cb_size;
|
||||
int32_t adv1;
|
||||
int32_t adv2;
|
||||
int32_t adv3;
|
||||
uint64_t offset;
|
||||
};
|
||||
|
||||
struct pva_dma_descriptor {
|
||||
/**
|
||||
* Linked descriptor ID
|
||||
*
|
||||
* - 0: No linked descriptor
|
||||
* - N (> 0): Linking to descriptor N - 1 in the descriptor array
|
||||
*/
|
||||
uint8_t link_desc_id;
|
||||
uint8_t px;
|
||||
uint8_t py;
|
||||
/** enum pva_dma_trigger_mode */
|
||||
uint8_t trig_event_mode;
|
||||
/** Trigger from enum pva_dma_trigger */
|
||||
uint8_t trig_vpu_events;
|
||||
uint8_t desc_reload_enable;
|
||||
/**
|
||||
* Log2(number bytes per pixel).
|
||||
*
|
||||
* - 0: 1 byte per pixel
|
||||
* - 1: 2 bytes per pixel
|
||||
* - 2: 4 bytes per pixel
|
||||
* - others: invalid
|
||||
*/
|
||||
uint8_t log2_pixel_size;
|
||||
uint8_t px_direction;
|
||||
uint8_t py_direction;
|
||||
uint8_t boundary_pixel_extension;
|
||||
/** TCM transfer size */
|
||||
uint8_t tts;
|
||||
/**
|
||||
* - 0: transfer true completion disabled
|
||||
* - 1: transfer true completion enabled
|
||||
*/
|
||||
uint8_t trans_true_completion;
|
||||
uint8_t prefetch_enable;
|
||||
|
||||
uint16_t tx;
|
||||
uint16_t ty;
|
||||
uint16_t dst2_slot;
|
||||
uint32_t dst2_offset;
|
||||
struct pva_dma_transfer_attr src;
|
||||
struct pva_dma_transfer_attr dst;
|
||||
};
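
/*
 * Illustrative sketch (not part of this header): chaining two descriptors in a
 * descriptor array. Note the off-by-one convention documented above:
 * link_desc_id == 0 means "no linked descriptor" and N > 0 links to
 * descriptors[N - 1].
 */
static inline void
pva_example_link_two_descriptors(struct pva_dma_descriptor *descs)
{
	descs[0].link_desc_id = 2U; /* chain descs[0] -> descs[1] */
	descs[1].link_desc_id = 0U; /* descs[1] ends the chain */
}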
|
||||
|
||||
struct pva_dma_channel {
|
||||
/**
|
||||
* Starting descriptor index in the descriptor array
|
||||
*
|
||||
* Valid range is [0, max_num_descriptors - 1]. This is different from
|
||||
* link_desc_id field, where 0 means no linked descriptor.
|
||||
*/
|
||||
uint8_t desc_index;
|
||||
uint8_t vdb_count;
|
||||
uint8_t vdb_offset;
|
||||
uint8_t req_per_grant;
|
||||
uint8_t prefetch_enable;
|
||||
uint8_t ch_rep_factor;
|
||||
uint8_t hwseq_enable;
|
||||
uint8_t hwseq_traversal_order;
|
||||
uint8_t hwseq_tx_select;
|
||||
uint8_t hwseq_trigger_done;
|
||||
uint8_t hwseq_frame_count;
|
||||
uint8_t hwseq_con_frame_seq;
|
||||
uint16_t hwseq_start;
|
||||
uint16_t hwseq_end;
|
||||
uint16_t adb_count;
|
||||
uint16_t adb_offset;
|
||||
/*!
|
||||
* Holds the trigger signal this channel will react to.
|
||||
*
|
||||
* IAS:
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLEn (4 Bytes)
|
||||
*
|
||||
* Mapping:
|
||||
* chanId corresponding to this structure is allocated by KMD.
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE0.bit[chanId] = outputEnableMask.bit[0];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE0.bit[16 + chanId] = outputEnableMask.bit[1];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE1.bit[chanId] = outputEnableMask.bit[2];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE1.bit[16 + chanId] = outputEnableMask.bit[3];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE2.bit[chanId] = outputEnableMask.bit[4];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE2.bit[16 + chanId] = outputEnableMask.bit[5];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE3.bit[chanId] = outputEnableMask.bit[6];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE3.bit[16 + chanId] = outputEnableMask.bit[7];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE4.bit[chanId] = outputEnableMask.bit[8];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE4.bit[16 + chanId] = outputEnableMask.bit[9];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE5.bit[chanId] = outputEnableMask.bit[10];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE5.bit[16 + chanId] = outputEnableMask.bit[11];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE6.bit[chanId] = outputEnableMask.bit[12];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE6.bit[16 + chanId] = outputEnableMask.bit[13];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE7.bit[chanId] = outputEnableMask.bit[14];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE8.bit[chanId] = outputEnableMask.bit[15];
|
||||
* DMA_COMMON_DMA_OUTPUT_ENABLE8.bit[16 + chanId] = outputEnableMask.bit[16];
|
||||
*/
|
||||
uint32_t output_enable_mask;
|
||||
uint32_t pad_value;
|
||||
};
|
||||
|
||||
struct pva_dma_config_header {
|
||||
/* In order to make efficient the allocation and tracking of DMA resources, DMA resources
|
||||
* are allocated in groups. For example, descriptors may be allocated in groups of 4, which
|
||||
* means that every allocation of descriptors will start at an alignment of 4. The following
|
||||
* macros control the alignment/grouping requirement of DMA resources.
|
||||
*/
|
||||
// TODO: Add compile-time asserts to ensure the following alignment requirements don't result
// in fractional resource partitions?
|
||||
#define PVA_DMA_CHANNEL_ALIGNMENT 1
|
||||
#define PVA_DMA_DESCRIPTOR_ALIGNMENT 4
|
||||
#define PVA_DMA_ADB_ALIGNMENT 16
|
||||
#define PVA_DMA_HWSEQ_WORD_ALIGNMENT 128
|
||||
uint8_t base_channel;
|
||||
uint8_t base_descriptor;
|
||||
uint8_t num_channels;
|
||||
uint8_t num_descriptors;
|
||||
|
||||
uint16_t num_static_slots;
|
||||
uint16_t num_dynamic_slots;
|
||||
|
||||
uint16_t base_hwseq_word;
|
||||
uint16_t num_hwseq_words;
|
||||
uint32_t vpu_exec_resource_id;
|
||||
|
||||
/* For serialized version of pva_dma_config, the following fields follow
|
||||
* immediately after this header. The starting addresses of these fields
|
||||
* must be aligned to 8 bytes */
|
||||
|
||||
/* An array of hwseq words */
|
||||
/* An array of pva_dma_channel */
|
||||
/* An array of pva_dma_descriptor */
|
||||
/* An array of pva_dma_slot_buffer */
|
||||
};
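
/*
 * Illustrative sketch (not part of this header): because DMA resources are
 * allocated in aligned groups, a requested descriptor count is effectively
 * rounded up to the grouping defined by PVA_DMA_DESCRIPTOR_ALIGNMENT. The
 * helper below is hypothetical and only shows the rounding arithmetic.
 */
static inline uint32_t pva_example_aligned_descriptor_count(uint32_t requested)
{
	return (requested + (PVA_DMA_DESCRIPTOR_ALIGNMENT - 1U)) &
	       ~(uint32_t)(PVA_DMA_DESCRIPTOR_ALIGNMENT - 1U);
}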
|
||||
|
||||
enum pva_dma_static_binding_type {
|
||||
PVA_DMA_STATIC_BINDING_INVALID = 0,
|
||||
PVA_DMA_STATIC_BINDING_DRAM,
|
||||
PVA_DMA_STATIC_BINDING_VMEM,
|
||||
};
|
||||
|
||||
/** Max block height is 32 GOB */
|
||||
#define PVA_DMA_MAX_LOG2_BLOCK_HEIGHT 5
|
||||
|
||||
struct pva_dma_dram_binding {
|
||||
/** enum pva_surface_format */
|
||||
uint8_t surface_format;
|
||||
uint8_t log2_block_height;
|
||||
uint32_t resource_id;
|
||||
uint64_t surface_base_offset;
|
||||
uint64_t slot_offset;
|
||||
};
|
||||
|
||||
struct pva_dma_vmem_binding {
|
||||
struct pva_vmem_addr addr;
|
||||
};
|
||||
|
||||
struct pva_dma_static_binding {
|
||||
/** enum pva_dma_static_binding_type */
|
||||
uint8_t type;
|
||||
union {
|
||||
struct pva_dma_dram_binding dram;
|
||||
struct pva_dma_vmem_binding vmem;
|
||||
};
|
||||
};
|
||||
|
||||
struct pva_dma_config {
|
||||
struct pva_dma_config_header header;
|
||||
uint32_t *hwseq_words;
|
||||
struct pva_dma_channel *channels;
|
||||
struct pva_dma_descriptor *descriptors;
|
||||
struct pva_dma_static_binding *static_bindings;
|
||||
};
|
||||
|
||||
#endif // PVA_API_DMA_H
|
||||
202
drivers/video/tegra/host/pva/src/include/pva_api_nvsci.h
Normal file
@@ -0,0 +1,202 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_API_NVSCI_H
|
||||
#define PVA_API_NVSCI_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "pva_api_types.h"
|
||||
#include "nvscibuf.h"
|
||||
#include "nvscisync.h"
|
||||
|
||||
/**
|
||||
* @brief Fill NvSciBuf attributes required by PVA.
|
||||
*
|
||||
* @param[out] scibuf_attr The NvSciBuf attribute list to be filled with PVA-specific attributes.
|
||||
*/
|
||||
enum pva_error pva_nvsci_buf_fill_attrs(NvSciBufAttrList scibuf_attr);
|
||||
|
||||
/**
|
||||
* @brief Fill NvSciSync attributes required by PVA.
|
||||
*
|
||||
* @param[in] access_mode Access mode for the sync object, determining how PVA
|
||||
* will interact with the sync object (read, write, etc.)
|
||||
* @param[out] attr_list The NvSciSync attribute list to be populated with attributes.
|
||||
*/
|
||||
enum pva_error pva_nvsci_sync_fill_attrs(uint32_t access_mode,
|
||||
NvSciSyncAttrList attr_list);
|
||||
|
||||
/**
|
||||
* @brief Holds the metadata for a NvSci plane.
|
||||
*/
|
||||
struct pva_plane_attrs {
|
||||
uint32_t line_pitch;
|
||||
uint32_t width_in_bytes;
|
||||
uint32_t height;
|
||||
uint64_t offset;
|
||||
};
|
||||
|
||||
#define PVA_SURFACE_ATTRS_MAX_NUM_PLANES 6U
|
||||
|
||||
/**
|
||||
* @brief Holds the metadata for a NvSci surface.
|
||||
*/
|
||||
struct pva_surface_attrs {
|
||||
bool is_surface;
|
||||
enum pva_surface_format format;
|
||||
uint32_t n_planes;
|
||||
uint64_t size;
|
||||
struct pva_plane_attrs planes[PVA_SURFACE_ATTRS_MAX_NUM_PLANES];
|
||||
uint8_t log2_gobs_per_block_y[PVA_SURFACE_ATTRS_MAX_NUM_PLANES];
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Import an NvSciBuf object into PVA.
|
||||
*
|
||||
* This function imports an NvSciBuf buffer object into PVA for further
|
||||
* operations. It creates a PVA memory object representing the buffer and
|
||||
* retrieves surface information about the buffer.
|
||||
*
|
||||
* The caller is responsible for freeing the PVA memory object.
|
||||
*
|
||||
* @param[in] obj The NvSciBuf object to be imported.
|
||||
* @param[in] access_mode Access mode for the buffer, determining the PVA's permissions for interaction.
|
||||
* @param[out] out_obj A pointer to the PVA memory object representing the imported buffer.
|
||||
* @param[out] out_surf_info Surface metadata of the buffer
|
||||
*/
|
||||
enum pva_error pva_nvsci_buf_import(NvSciBufObj obj, uint32_t access_mode,
|
||||
struct pva_memory **out_obj,
|
||||
struct pva_surface_attrs *out_surf_info);
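
/*
 * Usage sketch (illustrative, not part of this header): importing an existing
 * NvSciBufObj for read-only access. The caller owns the resulting pva_memory
 * object and is responsible for releasing it; the surface attributes report
 * whether the buffer is an image surface and how its planes are laid out.
 */
static inline enum pva_error
pva_example_import_scibuf(NvSciBufObj obj, struct pva_memory **out_mem)
{
	struct pva_surface_attrs surf;
	enum pva_error err;

	/* PVA_ACCESS_RO restricts the engine to reading this buffer. */
	err = pva_nvsci_buf_import(obj, PVA_ACCESS_RO, out_mem, &surf);
	if (err != PVA_SUCCESS)
		return err;

	if (surf.is_surface) {
		/* Layout details are in surf.planes[] and surf.format;
		 * plain buffers report is_surface == false. */
	}
	return PVA_SUCCESS;
}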
|
||||
|
||||
/**
|
||||
* @brief An opaque object representing an imported NvSciSync object.
|
||||
*/
|
||||
struct pva_nvsci_syncobj;
|
||||
|
||||
/**
|
||||
* @brief Describes the attributes of an imported NvSciSync object.
|
||||
*
|
||||
* This structure contains details about the memory buffers associated with the
|
||||
* imported NvSciSync object.
|
||||
*/
|
||||
struct pva_nvsci_syncobj_attrs {
|
||||
struct pva_memory *
|
||||
semaphore_buf; /**< Pointer to the semaphore memory buffer; NULL if syncpoints are used. */
|
||||
struct pva_memory *
|
||||
timestamp_buf; /**< Pointer to the timestamp memory buffer; NULL if unused. */
|
||||
struct pva_memory
|
||||
*status_buf; /**< Pointer to the status memory buffer. */
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Import an NvSciSync object into the PVA.
|
||||
*
|
||||
* This function imports an NvSciSync object into PVA, enabling it to be used
|
||||
* for synchronization of operations.
|
||||
*
|
||||
* @param[in] ctx The PVA context in which the sync object is to be used.
|
||||
* @param[in] nvsci_obj The NvSciSync object to be imported.
|
||||
* @param[in] access_mode The access mode for the sync object, indicating how PVA will use it.
|
||||
* @param[out] out_obj A pointer to the resulting PVA sync object handle.
|
||||
*/
|
||||
enum pva_error pva_nvsci_syncobj_import(struct pva_context *ctx,
|
||||
NvSciSyncObj nvsci_obj,
|
||||
uint32_t access_mode,
|
||||
struct pva_nvsci_syncobj **out_obj);
|
||||
|
||||
/**
|
||||
* @brief Retrieve the attributes of an imported NvSciSync object.
|
||||
*
|
||||
* This function fills in the provided attribute structure with details from
|
||||
* the imported NvSciSync object, including information relevant for semaphores,
|
||||
* timestamps, and status.
|
||||
*
|
||||
* @param[in] syncobj The NvSciSync object whose attributes are to be retrieved.
|
||||
* @param[out] out_attrs The structure to be filled with the sync object's attributes.
|
||||
*/
|
||||
void pva_nvsci_syncobj_get_attrs(struct pva_nvsci_syncobj const *syncobj,
|
||||
struct pva_nvsci_syncobj_attrs *out_attrs);
|
||||
|
||||
/**
|
||||
* @brief Free an imported NvSciSync object.
|
||||
*
|
||||
* This function releases the resources associated with a PVA NvSciSync object,
|
||||
* including PVA memory objects for semaphores, timestamps and statuses.
|
||||
*
|
||||
* @param[in] syncobj The PVA sync object to be freed.
|
||||
*/
|
||||
void pva_nvsci_syncobj_free(struct pva_nvsci_syncobj *syncobj);
|
||||
|
||||
/**
|
||||
* @brief Get the next status slot for a new fence.
|
||||
*
|
||||
* @param[in] syncobj The imported NvSciSyncObj
|
||||
* @param[out] out_status_slot The status slot index for the next fence.
|
||||
*/
|
||||
enum pva_error pva_nvsci_syncobj_next_status(struct pva_nvsci_syncobj *syncobj,
|
||||
uint32_t *out_status_slot);
|
||||
|
||||
/**
|
||||
* @brief Get the next timestamp slot for a new fence.
|
||||
*
|
||||
* @param[in] syncobj The imported NvSciSyncObj
|
||||
* @param[out] out_timestamp_slot The timestamp slot index for the next fence.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_nvsci_syncobj_next_timestamp(struct pva_nvsci_syncobj *syncobj,
|
||||
uint32_t *out_timestamp_slot);
|
||||
|
||||
/**
|
||||
* @brief Fence data for import and export.
|
||||
*/
|
||||
struct pva_nvsci_fence_info {
|
||||
uint32_t index; /**< The index of the fence. */
|
||||
uint32_t value; /**< The value of the fence. */
|
||||
uint32_t status_slot; /**< The slot index for the status. */
|
||||
uint32_t timestamp_slot; /**< The slot index for the timestamp. */
|
||||
};
|
||||
/**
|
||||
* @brief Import a NvSciSync fence into a PVA fence.
|
||||
*
|
||||
* @param[in] nvsci_fence The NvSciSync fence to be imported.
|
||||
* @param[in] pva_syncobj The previously imported NvSciSyncObj that's associated with the fence.
|
||||
* @param[out] out_fence_info The information about the NvSci fence. It can be used to fill a pva_fence.
|
||||
*
|
||||
* @note This function only fills the index and value fields of the pva_fence.
|
||||
* The user needs to set the semaphore resource ID if the sync object is a
|
||||
* semaphore.
|
||||
*
|
||||
*/
|
||||
enum pva_error
|
||||
pva_nvsci_fence_import(NvSciSyncFence const *nvsci_fence,
|
||||
struct pva_nvsci_syncobj const *pva_syncobj,
|
||||
struct pva_nvsci_fence_info *out_fence_info);
|
||||
|
||||
/**
|
||||
* @brief Export a PVA fence into an NvSciSync fence.
|
||||
*
|
||||
* @param[in] fence_info The information about the fence to be exported.
|
||||
* @param[in] syncobj The previously imported NvSciSyncObj that's associated with the fence.
|
||||
* @param[out] out_nvsci_fence The resulting NvSciSync fence object.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_nvsci_fence_export(struct pva_nvsci_fence_info const *fence_info,
|
||||
struct pva_nvsci_syncobj const *syncobj,
|
||||
NvSciSyncFence *out_nvsci_fence);
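
/*
 * Usage sketch (illustrative, not part of this header): converting an
 * NvSciSync prefence into a struct pva_fence for submission. Only index and
 * value come from the import; per the note above, the caller sets the
 * semaphore resource ID when the underlying primitive is a semaphore (the
 * syncpoint case is shown here).
 */
static inline enum pva_error
pva_example_import_prefence(NvSciSyncFence const *nvsci_fence,
			    struct pva_nvsci_syncobj const *syncobj,
			    struct pva_fence *out_prefence)
{
	struct pva_nvsci_fence_info info;
	enum pva_error err;

	err = pva_nvsci_fence_import(nvsci_fence, syncobj, &info);
	if (err != PVA_SUCCESS)
		return err;

	out_prefence->semaphore_resource_id = PVA_RESOURCE_ID_INVALID;
	out_prefence->index = info.index;
	out_prefence->value = info.value;
	return PVA_SUCCESS;
}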
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // PVA_API_NVSCI_H
|
||||
396
drivers/video/tegra/host/pva/src/include/pva_api_types.h
Normal file
@@ -0,0 +1,396 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_API_TYPES_H
|
||||
#define PVA_API_TYPES_H
|
||||
#if !defined(__KERNEL__)
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#define container_of(ptr, type, member) \
|
||||
(type *)((char *)(ptr) - (char *)&((type *)0)->member)
|
||||
#else
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#define UINT64_MAX U64_MAX
|
||||
#define UINT32_MAX U32_MAX
|
||||
#endif
|
||||
|
||||
#ifndef NULL
|
||||
#define NULL ((void *)0)
|
||||
#endif
|
||||
|
||||
#define FOREACH_ERR(ACT) \
|
||||
ACT(PVA_SUCCESS) \
|
||||
ACT(PVA_UNKNOWN_ERROR) \
|
||||
ACT(PVA_BAD_PARAMETER_ERROR) \
|
||||
ACT(PVA_NOT_IMPL) \
|
||||
ACT(PVA_NOENT) \
|
||||
ACT(PVA_NOMEM) \
|
||||
ACT(PVA_INVAL) \
|
||||
ACT(PVA_TIMEDOUT) \
|
||||
ACT(PVA_INTERNAL) \
|
||||
ACT(PVA_CMDBUF_NOT_FOUND) \
|
||||
ACT(PVA_CMDBUF_INVALID) \
|
||||
ACT(PVA_CMDBUF_TOO_LARGE) \
|
||||
ACT(PVA_RES_OUT_OF_RANGE) \
|
||||
ACT(PVA_AGAIN) \
|
||||
ACT(PVA_NO_RESOURCE_ID) \
|
||||
ACT(PVA_INVALID_RESOURCE) \
|
||||
ACT(PVA_INVALID_RESOURCE_SIZE) \
|
||||
ACT(PVA_INVALID_RESOURCE_ALIGNMENT) \
|
||||
ACT(PVA_QUEUE_FULL) \
|
||||
ACT(PVA_INVALID_IOVA) \
|
||||
ACT(PVA_NO_PERM) \
|
||||
ACT(PVA_INVALID_CMD_OPCODE) \
|
||||
ACT(PVA_BUF_OUT_OF_RANGE) \
|
||||
ACT(PVA_CMDBUF_NO_BEGIN) \
|
||||
ACT(PVA_NO_CCQ) \
|
||||
ACT(PVA_INPUT_STATUS_ERROR) \
|
||||
ACT(PVA_ENOSPC) \
|
||||
ACT(PVA_EACCES) \
|
||||
ACT(PVA_ERANGE) \
|
||||
ACT(PVA_BAD_SURFACE_BASE_ALIGNMENT) \
|
||||
ACT(PVA_BAD_DESC_ADDR_ALIGNMENT) \
|
||||
ACT(PVA_INVALID_DMA_CONFIG) \
|
||||
ACT(PVA_INVALID_SYMBOL) \
|
||||
ACT(PVA_INVALID_BINDING) \
|
||||
ACT(PVA_EINTR) \
|
||||
ACT(PVA_FILL_NVSCIBUF_ATTRS_FAILED) \
|
||||
ACT(PVA_NVSCIBUF_SET_ATTR_FAILED) \
|
||||
ACT(PVA_IMPORT_FROM_NVSCIBUF_FAILED) \
|
||||
ACT(PVA_NVSCISYNC_SET_ATTR_FAILED) \
|
||||
ACT(PVA_RETRIEVE_DATA_FROM_NVSCISYNC_FAILED) \
|
||||
ACT(PVA_UPDATE_DATA_TO_NVSCISYNC_FAILED) \
|
||||
ACT(PVA_UNSUPPORTED_NVSCISYNC_TIMESTAMP_FORMAT) \
|
||||
ACT(PVA_INVALID_NVSCISYNC_FENCE) \
|
||||
ACT(PVA_ERR_CMD_NOT_SUPPORTED) \
|
||||
ACT(PVA_CUDA_INITIALIZED) \
|
||||
ACT(PVA_CUDA_LOAD_LIBRARY_FAILED) \
|
||||
ACT(PVA_CUDA_ADD_CLIENT_FAILED) \
|
||||
ACT(PVA_CUDA_REMOVE_CLIENT_FAILED) \
|
||||
ACT(PVA_CUDA_INIT_FAILED) \
|
||||
ACT(PVA_CUDA_SUBMIT_FAILED) \
|
||||
ACT(PVA_CUDA_GET_RM_HANDLE_FAILED) \
|
||||
ACT(PVA_CUDA_INTERNAL_ERROR) \
|
||||
ACT(PVA_ERR_CMD_INVALID_VPU_STATE) \
|
||||
ACT(PVA_ERR_CMD_VMEM_BUF_OUT_OF_RANGE) \
|
||||
ACT(PVA_ERR_CMD_L2SRAM_BUF_OUT_OF_RANGE) \
|
||||
ACT(PVA_ERR_CMD_DRAM_BUF_OUT_OF_RANGE) \
|
||||
ACT(PVA_ERR_CMD_INVALID_BLOCK_HEIGHT) \
|
||||
ACT(PVA_ERR_CMD_PAYLOAD_TOO_SMALL) \
|
||||
ACT(PVA_ERR_CMD_ENGINE_NOT_ACQUIRED) \
|
||||
ACT(PVA_ERR_CMD_INVALID_SYMBOL_TYPE) \
|
||||
ACT(PVA_ERR_CMD_INVALID_ENGINE) \
|
||||
ACT(PVA_ERR_CMD_INVALID_DMA_SET_ID) \
|
||||
ACT(PVA_ERR_CMD_INVALID_DMA_SLOT_ID) \
|
||||
ACT(PVA_ERR_CMD_INVALID_DMA_SLOT_TYPE) \
|
||||
ACT(PVA_ERR_CMD_INVALID_USER_ALLOWANCE) \
|
||||
ACT(PVA_ERR_CMD_INCOMPATIBLE_RESOURCE) \
|
||||
ACT(PVA_ERR_CMD_INSUFFICIENT_PRIVILEGE) \
|
||||
ACT(PVA_ERR_CMD_INVALID_BARRIER_ID) \
|
||||
ACT(PVA_ERR_CMD_CAPTURE_SLOTS_EXCEEDED) \
|
||||
ACT(PVA_ERR_CMD_INVALID_CAPTURE_MODE) \
|
||||
ACT(PVA_ERR_CMD_INVALID_L2SRAM_POLICY) \
|
||||
ACT(PVA_ERR_FW_DMA0_IRQ_ENABLE_FAILED) \
|
||||
ACT(PVA_ERR_FW_DMA1_IRQ_ENABLE_FAILED) \
|
||||
ACT(PVA_ERR_FW_BAD_DMA_STATE) \
|
||||
ACT(PVA_ERR_FW_RESOURCE_IN_USE) \
|
||||
ACT(PVA_ERR_FW_VPU_ERROR_STATE) \
|
||||
ACT(PVA_ERR_FW_VPU_RETCODE_NONZERO) \
|
||||
ACT(PVA_ERR_FW_INVALID_CMD_OPCODE) \
|
||||
ACT(PVA_ERR_FW_INVALID_VPU_CMD_SEQ) \
|
||||
ACT(PVA_ERR_FW_INVALID_DMA_CMD_SEQ) \
|
||||
ACT(PVA_ERR_FW_INVALID_L2SRAM_CMD_SEQ) \
|
||||
ACT(PVA_ERR_FW_ENGINE_NOT_RELEASED) \
|
||||
ACT(PVA_ERR_FW_UTEST) \
|
||||
ACT(PVA_ERR_VPU_ERROR_STATE) \
|
||||
ACT(PVA_ERR_VPU_RETCODE_NONZERO) \
|
||||
ACT(PVA_ERR_VPU_ILLEGAL_INSTR) \
|
||||
ACT(PVA_ERR_VPU_DIVIDE_BY_0) \
|
||||
ACT(PVA_ERR_VPU_FP_NAN) \
|
||||
ACT(PVA_ERR_VPU_IN_DEBUG) \
|
||||
ACT(PVA_ERR_VPU_DLUT_CFG) \
|
||||
ACT(PVA_ERR_VPU_DLUT_MISS) \
|
||||
ACT(PVA_ERR_VPU_CP_ACCESS) \
|
||||
ACT(PVA_ERR_PPE_ILLEGAL_INSTR) \
|
||||
ACT(PVA_ERR_MATH_OP) \
|
||||
ACT(PVA_ERR_HWSEQ_INVALID) \
|
||||
ACT(PVA_ERR_CODE_COUNT)
|
||||
|
||||
enum pva_error {
|
||||
#define ADD_COMMA(name) name,
|
||||
FOREACH_ERR(ADD_COMMA)
|
||||
#undef ADD_COMMA
|
||||
};
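
/*
 * Illustrative sketch (not part of this header): the FOREACH_ERR X-macro above
 * can also expand to a lookup of error names, which is convenient for logging.
 * The helper below is hypothetical.
 */
static inline const char *pva_example_error_name(enum pva_error err)
{
	switch (err) {
#define ADD_CASE(name) \
	case name:     \
		return #name;
	FOREACH_ERR(ADD_CASE)
#undef ADD_CASE
	default:
		return "PVA_ERROR_UNKNOWN";
	}
}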
|
||||
|
||||
enum pva_chip_id {
|
||||
PVA_CHIP_T19X,
|
||||
PVA_CHIP_T23X,
|
||||
PVA_CHIP_T26X,
|
||||
PVA_CHIP_OTHERS
|
||||
};
|
||||
|
||||
enum pva_hw_gen {
|
||||
PVA_HW_GEN1,
|
||||
PVA_HW_GEN2,
|
||||
PVA_HW_GEN3,
|
||||
};
|
||||
|
||||
/* Opaque API data types */
|
||||
struct pva_context;
|
||||
struct pva_queue;
|
||||
struct pva_memory;
|
||||
|
||||
struct pva_memory_attrs {
|
||||
uint32_t access_mode;
|
||||
uint64_t offset;
|
||||
uint64_t size;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A memory address accessible by PVA.
|
||||
*/
|
||||
struct pva_dram_addr {
|
||||
uint32_t resource_id;
|
||||
uint64_t offset;
|
||||
};
|
||||
|
||||
struct pva_vmem_addr {
|
||||
uint32_t symbol_id;
|
||||
uint32_t offset;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Represents a synchronization fence, which can be associated with
|
||||
* either a memory semaphore or a syncpoint for signaling or waiting operations.
|
||||
*
|
||||
* The UMD handles semaphores and syncpoints differently when used as
|
||||
* postfences:
|
||||
* - Semaphores: UMD does not track future values.
|
||||
* - Syncpoints: UMD tracks future values.
|
||||
*
|
||||
* To use semaphore for either prefences and postfences:
|
||||
* - Set `semaphore_resource_id` to the resource ID of the memory backing the semaphore.
|
||||
* - Set `index` to the byte offset divided by the semaphore size (`sizeof(uint32_t)`).
|
||||
* - Set `value` to the semaphore's signaling or waiting value.
|
||||
*
|
||||
* To use syncpoint for prefences:
|
||||
* - Set `semaphore_resource_id` to `PVA_RESOURCE_ID_INVALID`.
|
||||
* - Set `index` to the syncpoint ID to wait for.
|
||||
* - Set `value` to the waiting value.
|
||||
*
|
||||
* To use syncpoint for postfences:
|
||||
* - Set `semaphore_resource_id` to `PVA_RESOURCE_ID_INVALID`.
|
||||
* - Do not set `index` or `value`.
|
||||
* - After submission, UMD will assign `index` to the queue syncpoint ID and `value` to the expected future value.
|
||||
*/
|
||||
struct pva_fence {
|
||||
/** Resource ID of the memory semaphore. If resource ID is
|
||||
* PVA_RESOURCE_ID_INVALID, then the sync object primitive is assumed to
|
||||
* be syncpoint. */
|
||||
uint32_t semaphore_resource_id;
|
||||
/** Represents either the semaphore index or the syncpoint ID, depending
|
||||
* on the sync object primitive type.
|
||||
*/
|
||||
uint32_t index;
|
||||
/** Represents the semaphore or syncpoint value used for signaling or
|
||||
* waiting. */
|
||||
uint32_t value;
|
||||
};
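
/*
 * Illustrative sketch (not part of this header): filling a fence for the two
 * primitive types described above. 0U is the invalid resource ID
 * (PVA_RESOURCE_ID_INVALID, defined later in this header); the helper names
 * are hypothetical.
 */
static inline void pva_example_fill_semaphore_fence(struct pva_fence *f,
						    uint32_t resource_id,
						    uint64_t byte_offset,
						    uint32_t value)
{
	f->semaphore_resource_id = resource_id;
	/* index is the semaphore slot: byte offset / sizeof(uint32_t) */
	f->index = (uint32_t)(byte_offset / sizeof(uint32_t));
	f->value = value;
}

static inline void pva_example_fill_syncpoint_prefence(struct pva_fence *f,
							uint32_t syncpoint_id,
							uint32_t wait_value)
{
	f->semaphore_resource_id = 0U; /* PVA_RESOURCE_ID_INVALID */
	f->index = syncpoint_id;
	f->value = wait_value;
}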
|
||||
|
||||
struct pva_fw_vpu_ptr_symbol {
|
||||
uint64_t base;
|
||||
uint64_t offset;
|
||||
uint64_t size;
|
||||
};
|
||||
|
||||
struct pva_fw_vpu_legacy_ptr_symbol {
|
||||
uint64_t base;
|
||||
uint32_t offset;
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
enum pva_surface_format {
|
||||
PVA_SURF_FMT_PITCH_LINEAR = 0,
|
||||
PVA_SURF_FMT_BLOCK_LINEAR
|
||||
};
|
||||
|
||||
enum pva_memory_segment {
|
||||
/** Memory segment directly reachable by R5. Command buffer chunk
|
||||
* memories need to be allocated from this segment */
|
||||
PVA_MEMORY_SEGMENT_R5 = 1,
|
||||
/** Memory segment reachable only by DMA. User buffers should be
|
||||
* allocated from this segment */
|
||||
PVA_MEMORY_SEGMENT_DMA = 2,
|
||||
};
|
||||
|
||||
enum pva_symbol_type {
|
||||
/*! Specifies an invalid symbol type */
|
||||
PVA_SYM_TYPE_INVALID = 0,
|
||||
/*! Specifies a data symbol */
|
||||
PVA_SYM_TYPE_DATA,
|
||||
/*! Specifies a VPU config table symbol */
|
||||
PVA_SYM_TYPE_VPUC_TABLE,
|
||||
/*! Specifies a Pointer symbol */
|
||||
PVA_SYM_TYPE_POINTER,
|
||||
/*! Specifies a System symbol */
|
||||
PVA_SYM_TYPE_SYSTEM,
|
||||
/*! Specifies an extended Pointer symbol */
|
||||
PVA_SYM_TYPE_POINTER_EX,
|
||||
PVA_SYM_TYPE_MAX,
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Holds PVA Sync Client Type.
|
||||
* Currently NvSciSync supports NvSciSyncFences with syncpoint primitive type only.
|
||||
*/
|
||||
enum pva_sync_client_type {
|
||||
/*! For a given SyncObj PVA acts as a signaler. This type corresponds to
|
||||
* postfences from PVA. */
|
||||
PVA_SYNC_CLIENT_TYPE_SIGNALER,
|
||||
/*! For a given SyncObj PVA acts as a waiter. This type corresponds to
|
||||
* prefences to PVA. */
|
||||
PVA_SYNC_CLIENT_TYPE_WAITER,
|
||||
/*! For a given SyncObj PVA acts as both signaler and waiter. */
|
||||
PVA_SYNC_CLIENT_TYPE_SIGNALER_WAITER,
|
||||
/*! Specifies the non inclusive upper bound of valid values. */
|
||||
PVA_SYNC_CLIENT_TYPE_MAX,
|
||||
/*! Reserved bound of valid values. */
|
||||
PVA_SYNC_CLIENT_TYPE_RESERVED = 0x7FFFFFFF,
|
||||
};
|
||||
|
||||
#define PVA_SYMBOL_ID_INVALID 0U
|
||||
#define PVA_SYMBOL_ID_BASE 1U
|
||||
#define PVA_MAX_SYMBOL_NAME_LEN 64U
|
||||
struct pva_symbol_info {
|
||||
char name[PVA_MAX_SYMBOL_NAME_LEN + 1U];
|
||||
enum pva_symbol_type symbol_type;
|
||||
uint32_t size;
|
||||
uint32_t vmem_addr;
|
||||
/** Symbol ID local to this executable */
|
||||
uint32_t symbol_id; /**< Starting from PVA_SYMBOL_ID_BASE */
|
||||
};
|
||||
|
||||
#define PVA_RESOURCE_ID_INVALID 0U
|
||||
#define PVA_RESOURCE_ID_BASE 1U
|
||||
struct pva_resource_entry {
|
||||
#define PVA_RESOURCE_TYPE_INVALID 0U
|
||||
#define PVA_RESOURCE_TYPE_DRAM 1U
|
||||
#define PVA_RESOURCE_TYPE_EXEC_BIN 2U
|
||||
#define PVA_RESOURCE_TYPE_DMA_CONFIG 3U
|
||||
uint8_t type;
|
||||
uint8_t smmu_context_id;
|
||||
uint8_t addr_hi;
|
||||
uint8_t size_hi;
|
||||
uint32_t addr_lo;
|
||||
uint32_t size_lo;
|
||||
};
|
||||
|
||||
/** \brief Maximum number of queues per context */
|
||||
#define PVA_MAX_QUEUES_PER_CONTEXT (8)
|
||||
|
||||
/** \brief Specifies the memory is GPU CACHED. */
|
||||
#define PVA_GPU_CACHED_MEMORY (1u << 1u)
|
||||
|
||||
#define PVA_ACCESS_RO (1U << 0) /**< Read only access */
|
||||
#define PVA_ACCESS_WO (1U << 1) /**< Write only access */
|
||||
#define PVA_ACCESS_RW \
|
||||
(PVA_ACCESS_RO | PVA_ACCESS_WO) /**< Read and write access */
|
||||
|
||||
#define PVA_TIMEOUT_INF UINT64_MAX /**< Infinite timeout */
|
||||
|
||||
#define PVA_MAX_NUM_INPUT_STATUS 2 /**< Maximum number of input statuses */
|
||||
#define PVA_MAX_NUM_OUTPUT_STATUS 2 /**< Maximum number of output statuses */
|
||||
#define PVA_MAX_NUM_PREFENCES 2 /**< Maximum number of pre-fences */
|
||||
#define PVA_MAX_NUM_POSTFENCES 2 /**< Maximum number of post-fences */
|
||||
/** Maximum number of timestamps */
|
||||
#define PVA_MAX_NUM_TIMESTAMPS PVA_MAX_NUM_POSTFENCES
|
||||
|
||||
struct pva_cmdbuf_submit_info {
|
||||
uint8_t num_prefences;
|
||||
uint8_t num_postfences;
|
||||
uint8_t num_input_status;
|
||||
uint8_t num_output_status;
|
||||
uint8_t num_timestamps;
|
||||
#define PVA_ENGINE_AFFINITY_NONE 0
|
||||
#define PVA_ENGINE_AFFINITY_ENGINE0 (1 << 0)
|
||||
#define PVA_ENGINE_AFFINITY_ENGINE1 (1 << 1)
|
||||
#define PVA_ENGINE_AFFINITY_ANY \
|
||||
(PVA_ENGINE_AFFINITY_ENGINE0 | PVA_ENGINE_AFFINITY_ENGINE1)
|
||||
uint8_t engine_affinity;
|
||||
/** Size of the first chunk */
|
||||
uint16_t first_chunk_size;
|
||||
/** Resource ID of the first chunk */
|
||||
uint32_t first_chunk_resource_id;
|
||||
/** Offset of the first chunk within the resource */
|
||||
uint64_t first_chunk_offset;
|
||||
#define PVA_EXEC_TIMEOUT_REUSE 0xFFFFFFFFU
|
||||
#define PVA_EXEC_TIMEOUT_INF 0U
|
||||
/** Execution Timeout */
|
||||
uint32_t execution_timeout_ms;
|
||||
struct pva_fence prefences[PVA_MAX_NUM_PREFENCES];
|
||||
struct pva_fence postfences[PVA_MAX_NUM_POSTFENCES];
|
||||
struct pva_dram_addr input_statuses[PVA_MAX_NUM_INPUT_STATUS];
|
||||
struct pva_dram_addr output_statuses[PVA_MAX_NUM_OUTPUT_STATUS];
|
||||
struct pva_dram_addr timestamps[PVA_MAX_NUM_TIMESTAMPS];
|
||||
};
|
||||
|
||||
struct pva_ops_buffer {
|
||||
void *base; /**< Buffer holding a list of async operations */
|
||||
uint32_t offset; /**< First unused byte in the buffer */
|
||||
uint32_t size; /**< Size of the buffer */
|
||||
};
|
||||
|
||||
struct pva_cmdbuf_status {
|
||||
/** Timestamp reflecting when the status was updated. This is in resolution of ns */
|
||||
uint64_t timestamp;
|
||||
/** Additional status information for the engine state */
|
||||
uint32_t info32;
|
||||
/** Additional status information for the engine state */
|
||||
uint16_t info16;
|
||||
/** Error code. Type: enum pva_error */
|
||||
uint16_t status;
|
||||
};
|
||||
|
||||
/** \brief Holds the PVA capabilities. */
|
||||
struct pva_characteristics {
|
||||
/*! Holds the number of PVA engines. */
|
||||
uint32_t pva_engine_count;
|
||||
/*! Holds the number of VPUs per PVA engine. */
|
||||
uint32_t pva_pve_count;
|
||||
/*! Holds the PVA generation information */
|
||||
enum pva_hw_gen hw_version;
|
||||
uint16_t max_desc_count;
|
||||
uint16_t max_ch_count;
|
||||
uint16_t max_adb_count;
|
||||
uint16_t max_hwseq_word_count;
|
||||
uint16_t max_vmem_region_count;
|
||||
uint16_t reserved_desc_start;
|
||||
uint16_t reserved_desc_count;
|
||||
uint16_t reserved_adb_start;
|
||||
uint16_t reserved_adb_count;
|
||||
};
|
||||
|
||||
enum pva_error_inject_codes {
|
||||
PVA_ERR_INJECT_WDT_HW_ERR, // watchdog Hardware error
|
||||
PVA_ERR_INJECT_WDT_TIMEOUT, // watchdog Timeout error
|
||||
};
|
||||
|
||||
/*
|
||||
* !!!! DO NOT MODIFY !!!!!!
|
||||
* These values are defined as per DriveOS guidelines
|
||||
*/
|
||||
#define PVA_INPUT_STATUS_SUCCESS (0)
|
||||
#define PVA_INPUT_STATUS_INVALID (0xFFFF)
|
||||
|
||||
#endif // PVA_API_TYPES_H
|
||||
33
drivers/video/tegra/host/pva/src/include/pva_api_vpu.h
Normal file
@@ -0,0 +1,33 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_API_VPU_H
|
||||
#define PVA_API_VPU_H
|
||||
#include "pva_api_types.h"
|
||||
|
||||
/**
|
||||
* @brief Information of the VPU instance data passed to VPU kernel.
|
||||
*/
|
||||
struct pva_vpu_instance_data {
|
||||
/** @brief ID of the VPU assigned to the task */
|
||||
uint16_t engine_id;
|
||||
/** @brief Variable to indicate that ppe task was launched or not */
|
||||
uint16_t ppe_task_launched;
|
||||
/** @brief Base of the VMEM memory */
|
||||
uint32_t vmem_base;
|
||||
/** @brief Base of the DMA descriptor SRAM memory */
|
||||
uint32_t dma_descriptor_base;
|
||||
/** @brief Base of L2SRAM allocated for the task executed */
|
||||
uint32_t l2ram_base;
|
||||
/** @brief Size of L2SRAM allocated for the task executed */
|
||||
uint32_t l2ram_size;
|
||||
};
|
||||
|
||||
#endif // PVA_API_VPU_H
|
||||
@@ -0,0 +1,125 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_api.h"
|
||||
|
||||
#define INVALID_ID 0xFFFFFFFF
|
||||
enum pva_error
|
||||
pva_kmd_block_allocator_init(struct pva_kmd_block_allocator *allocator,
|
||||
void *block_mem, uint32_t base_id,
|
||||
uint32_t block_size, uint32_t max_num_blocks)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
allocator->free_slot_head = INVALID_ID;
|
||||
allocator->next_free_slot = 0;
|
||||
allocator->max_num_blocks = max_num_blocks;
|
||||
allocator->block_size = block_size;
|
||||
allocator->base_id = base_id;
|
||||
|
||||
allocator->blocks = block_mem;
|
||||
|
||||
allocator->slot_in_use = pva_kmd_zalloc(
|
||||
sizeof(*allocator->slot_in_use) * max_num_blocks);
|
||||
if (!allocator->slot_in_use) {
|
||||
err = PVA_NOMEM;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_block_allocator_deinit(struct pva_kmd_block_allocator *allocator)
|
||||
{
|
||||
pva_kmd_free(allocator->slot_in_use);
|
||||
}
|
||||
|
||||
static inline void *get_block(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t slot)
|
||||
{
|
||||
uintptr_t base = (uintptr_t)allocator->blocks;
|
||||
uintptr_t addr = base + (slot * allocator->block_size);
|
||||
return (void *)addr;
|
||||
}
|
||||
|
||||
static inline uint32_t next_slot(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t slot)
|
||||
{
|
||||
uint32_t *next = (uint32_t *)get_block(allocator, slot);
|
||||
return *next;
|
||||
}
|
||||
|
||||
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t *out_id)
|
||||
{
|
||||
void *block = NULL;
|
||||
uint32_t slot = INVALID_ID;
|
||||
|
||||
if (allocator->free_slot_head != INVALID_ID) {
|
||||
slot = allocator->free_slot_head;
|
||||
allocator->free_slot_head =
|
||||
next_slot(allocator, allocator->free_slot_head);
|
||||
} else {
|
||||
if (allocator->next_free_slot < allocator->max_num_blocks) {
|
||||
slot = allocator->next_free_slot;
|
||||
allocator->next_free_slot++;
|
||||
} else {
|
||||
goto err_out;
|
||||
}
|
||||
}
|
||||
allocator->slot_in_use[slot] = true;
|
||||
|
||||
*out_id = slot + allocator->base_id;
|
||||
block = get_block(allocator, slot);
|
||||
return block;
|
||||
err_out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool is_slot_valid(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t slot)
|
||||
{
|
||||
if (slot >= allocator->max_num_blocks) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return allocator->slot_in_use[slot];
|
||||
}
|
||||
|
||||
void *pva_kmd_get_block(struct pva_kmd_block_allocator *allocator, uint32_t id)
|
||||
{
|
||||
uint32_t slot = id - allocator->base_id;
|
||||
if (!is_slot_valid(allocator, slot)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return get_block(allocator, slot);
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t id)
|
||||
{
|
||||
uint32_t slot = id - allocator->base_id;
|
||||
uint32_t *next;
|
||||
if (!is_slot_valid(allocator, slot)) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
allocator->slot_in_use[slot] = false;
|
||||
next = (uint32_t *)get_block(allocator, slot);
|
||||
*next = allocator->free_slot_head;
|
||||
allocator->free_slot_head = slot;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
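
/*
 * Usage sketch (illustrative, not compiled): a typical allocate/lookup/free
 * cycle. Freed blocks are threaded onto an intrusive free list by storing the
 * next free slot index in their first 4 bytes, so block_size must be at least
 * sizeof(uint32_t).
 */
#if 0
static void example_block_allocator_use(void *backing_mem)
{
	struct pva_kmd_block_allocator alloc;
	uint32_t id;
	void *block;

	/* 64-byte blocks, IDs handed out starting at 1, up to 16 blocks. */
	if (pva_kmd_block_allocator_init(&alloc, backing_mem, 1U, 64U, 16U) !=
	    PVA_SUCCESS)
		return;

	block = pva_kmd_zalloc_block(&alloc, &id);
	if (block != NULL) {
		block = pva_kmd_get_block(&alloc, id); /* look up by ID */
		pva_kmd_free_block(&alloc, id);
	}

	pva_kmd_block_allocator_deinit(&alloc);
}
#endif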
|
||||
@@ -0,0 +1,50 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_BLOCK_ALLOCATOR_H
|
||||
#define PVA_KMD_BLOCK_ALLOCATOR_H
|
||||
|
||||
#include "pva_api.h"
|
||||
|
||||
struct pva_kmd_block_allocator {
|
||||
uint32_t free_slot_head;
|
||||
uint32_t base_id;
|
||||
uint32_t max_num_blocks;
|
||||
uint32_t next_free_slot;
|
||||
uint32_t block_size;
|
||||
void *blocks;
|
||||
bool *slot_in_use;
|
||||
};
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_block_allocator_init(struct pva_kmd_block_allocator *allocator,
|
||||
void *chunk_mem, uint32_t base_id,
|
||||
uint32_t chunk_size, uint32_t max_num_chunks);
|
||||
|
||||
void *pva_kmd_alloc_block(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t *out_id);
|
||||
static inline void *
|
||||
pva_kmd_zalloc_block(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t *out_id)
|
||||
{
|
||||
void *ptr = pva_kmd_alloc_block(allocator, out_id);
|
||||
if (ptr != NULL) {
|
||||
memset(ptr, 0, allocator->block_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *pva_kmd_get_block(struct pva_kmd_block_allocator *allocator, uint32_t id);
|
||||
enum pva_error pva_kmd_free_block(struct pva_kmd_block_allocator *allocator,
|
||||
uint32_t id);
|
||||
|
||||
void pva_kmd_block_allocator_deinit(struct pva_kmd_block_allocator *allocator);
|
||||
|
||||
#endif // PVA_KMD_BLOCK_ALLOCATOR_H
|
||||
280
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_cmdbuf.c
Normal file
@@ -0,0 +1,280 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_cmdbuf.h"
|
||||
#include "pva_api_cmdbuf.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
#define CHUNK_STATE_INVALID 0
|
||||
#define CHUNK_STATE_FENCE_TRIGGERED 1
|
||||
|
||||
static uint32_t *
|
||||
get_chunk_states(struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool)
|
||||
{
|
||||
return (uint32_t *)pva_offset_pointer(
|
||||
cmdbuf_chunk_pool->mem_base_va,
|
||||
cmdbuf_chunk_pool->chunk_states_offset);
|
||||
}
|
||||
|
||||
static void *get_chunk(struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool,
|
||||
uint32_t chunk_id)
|
||||
{
|
||||
return pva_offset_pointer(cmdbuf_chunk_pool->mem_base_va,
|
||||
cmdbuf_chunk_pool->chunk_size * chunk_id);
|
||||
}
|
||||
|
||||
static uint32_t get_chunk_id_from_res_offset(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool, uint64_t offset)
|
||||
{
|
||||
ASSERT(offset >= cmdbuf_chunk_pool->mem_offset);
|
||||
offset -= cmdbuf_chunk_pool->mem_offset;
|
||||
return offset / cmdbuf_chunk_pool->chunk_size;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_cmdbuf_chunk_pool_init(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool,
|
||||
uint32_t mem_resource_id, uint64_t mem_offset, uint32_t mem_size,
|
||||
uint16_t chunk_size, uint32_t num_chunks, void *mem_base_va)
|
||||
{
|
||||
uint32_t *chunk_states;
|
||||
uint32_t i;
|
||||
enum pva_error err;
|
||||
|
||||
ASSERT(mem_size >= pva_kmd_cmdbuf_pool_get_required_mem_size(
|
||||
chunk_size, num_chunks));
|
||||
|
||||
cmdbuf_chunk_pool->mem_resource_id = mem_resource_id;
|
||||
cmdbuf_chunk_pool->mem_offset = mem_offset;
|
||||
cmdbuf_chunk_pool->mem_size = mem_size;
|
||||
cmdbuf_chunk_pool->chunk_size = chunk_size;
|
||||
cmdbuf_chunk_pool->num_chunks = num_chunks;
|
||||
cmdbuf_chunk_pool->mem_base_va = mem_base_va;
|
||||
cmdbuf_chunk_pool->chunk_states_offset = chunk_size * num_chunks;
|
||||
chunk_states = get_chunk_states(cmdbuf_chunk_pool);
|
||||
for (i = 0; i < num_chunks; i++) {
|
||||
chunk_states[i] = CHUNK_STATE_INVALID;
|
||||
}
|
||||
|
||||
err = pva_kmd_block_allocator_init(&cmdbuf_chunk_pool->block_allocator,
|
||||
mem_base_va, 0, chunk_size,
|
||||
num_chunks);
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_cmdbuf_chunk_pool_deinit(struct pva_kmd_cmdbuf_chunk_pool *pool)
|
||||
{
|
||||
pva_kmd_block_allocator_deinit(&pool->block_allocator);
|
||||
}
|
||||
|
||||
void pva_kmd_free_linked_cmdbuf_chunks(struct pva_kmd_cmdbuf_chunk_pool *pool,
|
||||
uint32_t chunk_id)
|
||||
{
|
||||
struct pva_cmd_link_chunk *begin;
|
||||
uint32_t *chunk_states;
|
||||
uint64_t offset;
|
||||
uint32_t resource_id;
|
||||
|
||||
chunk_states = get_chunk_states(pool);
|
||||
while (true) {
|
||||
begin = get_chunk(pool, chunk_id);
|
||||
chunk_states[chunk_id] = CHUNK_STATE_INVALID;
|
||||
offset = assemble_addr(begin->next_chunk_offset_hi,
|
||||
begin->next_chunk_offset_lo);
|
||||
resource_id = begin->next_chunk_resource_id;
|
||||
pva_kmd_free_block(&pool->block_allocator, chunk_id);
|
||||
if (resource_id == PVA_RESOURCE_ID_INVALID) {
|
||||
break;
|
||||
}
|
||||
ASSERT(resource_id == pool->mem_resource_id);
|
||||
/* Free next chunk */
|
||||
chunk_id = get_chunk_id_from_res_offset(pool, offset);
|
||||
}
|
||||
}
|
||||
|
||||
static bool recycle_chunks(struct pva_kmd_cmdbuf_chunk_pool *pool)
|
||||
{
|
||||
uint32_t *chunk_states;
|
||||
uint32_t i;
|
||||
bool freed = false;
|
||||
|
||||
chunk_states = get_chunk_states(pool);
|
||||
for (i = 0; i < pool->num_chunks; i++) {
|
||||
if (chunk_states[i] == CHUNK_STATE_FENCE_TRIGGERED) {
|
||||
pva_kmd_free_linked_cmdbuf_chunks(pool, i);
|
||||
freed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return freed;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_alloc_cmdbuf_chunk(struct pva_kmd_cmdbuf_chunk_pool *pool,
|
||||
uint32_t *out_chunk_id)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
void *chunk;
|
||||
|
||||
chunk = pva_kmd_alloc_block(&pool->block_allocator, out_chunk_id);
|
||||
if (chunk == NULL) {
|
||||
if (recycle_chunks(pool)) {
|
||||
chunk = pva_kmd_alloc_block(&pool->block_allocator,
|
||||
out_chunk_id);
|
||||
ASSERT(chunk != NULL);
|
||||
} else {
|
||||
err = PVA_NOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_get_free_notifier_fence(struct pva_kmd_cmdbuf_chunk_pool *pool,
|
||||
uint32_t chunk_id,
|
||||
struct pva_fw_postfence *fence)
|
||||
{
|
||||
uint64_t offset_sum =
|
||||
safe_addu64(pool->mem_offset, pool->chunk_states_offset);
|
||||
uint64_t chunk_size =
|
||||
(uint64_t)safe_mulu32((uint32_t)sizeof(uint32_t), chunk_id);
|
||||
uint64_t state_offset = safe_addu64(offset_sum, chunk_size);
|
||||
memset(fence, 0, sizeof(*fence));
|
||||
fence->resource_id = pool->mem_resource_id;
|
||||
fence->offset_lo = iova_lo(state_offset);
|
||||
fence->offset_hi = iova_hi(state_offset);
|
||||
fence->value = CHUNK_STATE_FENCE_TRIGGERED;
|
||||
fence->ts_resource_id = PVA_RESOURCE_ID_INVALID;
|
||||
}
|
||||
|
||||
static void *current_cmd(struct pva_kmd_cmdbuf_builder *builder)
|
||||
{
|
||||
return pva_offset_pointer(
|
||||
pva_kmd_get_cmdbuf_chunk_va(builder->pool,
|
||||
builder->current_chunk_id),
|
||||
builder->current_chunk_offset);
|
||||
}
|
||||
|
||||
static void begin_chunk(struct pva_kmd_cmdbuf_builder *builder)
|
||||
{
|
||||
struct pva_cmd_link_chunk *cmd = pva_kmd_get_cmdbuf_chunk_va(
|
||||
builder->pool, builder->current_chunk_id);
|
||||
memset(cmd, 0, sizeof(*cmd));
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_LINK_CHUNK;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->next_chunk_resource_id = PVA_RESOURCE_ID_INVALID;
|
||||
builder->current_chunk_offset = sizeof(*cmd);
|
||||
}
|
||||
|
||||
static void end_chunk(struct pva_kmd_cmdbuf_builder *builder)
|
||||
{
|
||||
/* Size of this chunk is now known. Update the header of the previous chunk. */
|
||||
*builder->chunk_size_ptr = builder->current_chunk_offset;
|
||||
}
|
||||
|
||||
static void link_chunk(struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint32_t new_chunk_id)
|
||||
{
|
||||
struct pva_cmd_link_chunk *old_link;
|
||||
uint64_t new_chunk_offset;
|
||||
|
||||
old_link = (struct pva_cmd_link_chunk *)pva_kmd_get_cmdbuf_chunk_va(
|
||||
builder->pool, builder->current_chunk_id);
|
||||
new_chunk_offset = pva_kmd_get_cmdbuf_chunk_res_offset(builder->pool,
|
||||
new_chunk_id);
|
||||
old_link->next_chunk_resource_id = builder->pool->mem_resource_id;
|
||||
old_link->next_chunk_offset_lo = iova_lo(new_chunk_offset);
|
||||
old_link->next_chunk_offset_hi = iova_hi(new_chunk_offset);
|
||||
/* The new chunk size is still unknown. We record the pointer here. */
|
||||
builder->chunk_size_ptr = &old_link->next_chunk_size;
|
||||
}
|
||||
|
||||
void *pva_kmd_reserve_cmd_space(struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint16_t size)
|
||||
{
|
||||
uint16_t max_size;
|
||||
enum pva_error err;
|
||||
void *cmd_start;
|
||||
|
||||
max_size = safe_subu16(builder->pool->chunk_size,
|
||||
(uint16_t)sizeof(struct pva_cmd_link_chunk));
|
||||
|
||||
ASSERT(size <= max_size);
|
||||
|
||||
if ((builder->current_chunk_offset + size) >
|
||||
builder->pool->chunk_size) {
|
||||
/* Not enough space in the current chunk. Allocate a new one. */
|
||||
uint32_t new_chunk_id;
|
||||
|
||||
err = pva_kmd_alloc_cmdbuf_chunk(builder->pool, &new_chunk_id);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("No more chunk in the pool");
|
||||
goto err_out;
|
||||
}
|
||||
end_chunk(builder);
|
||||
link_chunk(builder, new_chunk_id);
|
||||
|
||||
builder->current_chunk_id = new_chunk_id;
|
||||
builder->current_chunk_offset = 0;
|
||||
begin_chunk(builder);
|
||||
}
|
||||
|
||||
cmd_start = current_cmd(builder);
|
||||
(void)memset(cmd_start, 0, size);
|
||||
|
||||
builder->current_chunk_offset += size;
|
||||
|
||||
return cmd_start;
|
||||
err_out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_cmdbuf_builder_init(struct pva_kmd_cmdbuf_builder *builder,
|
||||
struct pva_kmd_cmdbuf_chunk_pool *chunk_pool)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
uint32_t const min_chunk_size = sizeof(struct pva_cmd_link_chunk);
|
||||
|
||||
ASSERT(chunk_pool->chunk_size >= min_chunk_size);
|
||||
|
||||
builder->pool = chunk_pool;
|
||||
err = pva_kmd_alloc_cmdbuf_chunk(chunk_pool,
|
||||
&builder->current_chunk_id);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
builder->current_chunk_offset = 0;
|
||||
builder->first_chunk_size = 0;
|
||||
builder->first_chunk_id = builder->current_chunk_id;
|
||||
builder->chunk_size_ptr = &builder->first_chunk_size;
|
||||
|
||||
begin_chunk(builder);
|
||||
|
||||
return PVA_SUCCESS;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_cmdbuf_builder_finalize(struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint32_t *out_first_chunk_id,
|
||||
uint16_t *out_first_chunk_size)
|
||||
{
|
||||
end_chunk(builder);
|
||||
*out_first_chunk_id = builder->first_chunk_id;
|
||||
*out_first_chunk_size = builder->first_chunk_size;
|
||||
}
|
||||
|
||||
void pva_kmd_cmdbuf_builder_cancel(struct pva_kmd_cmdbuf_builder *builder)
|
||||
{
|
||||
pva_kmd_free_linked_cmdbuf_chunks(builder->pool,
|
||||
builder->first_chunk_id);
|
||||
}
|
||||
265
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_cmdbuf.h
Normal file
265
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_cmdbuf.h
Normal file
@@ -0,0 +1,265 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_CMDBUF_H
|
||||
#define PVA_KMD_CMDBUF_H
|
||||
#include "pva_fw.h"
|
||||
#include "pva_resource.h"
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_kmd_mutex.h"
|
||||
#include "pva_api_cmdbuf.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
struct pva_kmd_queue;
|
||||
|
||||
/**
|
||||
* A fixed-size pool of command buffer chunks.
|
||||
*
|
||||
* We can allocate chunks from this pool. When submitting the chunks, we should
|
||||
* request a post fence from the pool for the first chunk. When the post fence
|
||||
* is triggered, the chain of chunks will be considered free by the pool.
|
||||
*/
|
||||
struct pva_kmd_cmdbuf_chunk_pool {
|
||||
uint16_t chunk_size;
|
||||
uint32_t num_chunks;
|
||||
uint32_t mem_resource_id;
|
||||
uint64_t mem_size;
|
||||
uint64_t mem_offset; /**< Starting offset in the resource that can be
|
||||
* used by this pool */
|
||||
uint64_t chunk_states_offset;
|
||||
void *mem_base_va;
|
||||
struct pva_kmd_block_allocator block_allocator;
|
||||
};
|
||||
|
||||
static inline uint64_t
|
||||
pva_kmd_cmdbuf_pool_get_required_mem_size(uint16_t chunk_size,
|
||||
uint32_t num_chunks)
|
||||
{
|
||||
/* Add storage required for free notifier fences */
|
||||
return (chunk_size + sizeof(uint32_t)) * num_chunks;
|
||||
}
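
/*
 * Sizing sketch (illustrative, not part of the driver sources): the pool
 * stores the chunks themselves followed by one uint32_t free-notifier state
 * word per chunk. With an assumed chunk size of 4 KiB and 32 chunks:
 *
 *   uint64_t pool_bytes =
 *       pva_kmd_cmdbuf_pool_get_required_mem_size(4096, 32);
 *   // pool_bytes == 32 * (4096 + 4) == 131200
 */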
|
||||
|
||||
/**
|
||||
* Initialize the chunk pool.
|
||||
*
|
||||
 * @param[out] cmdbuf_chunk_pool Pointer to the pool.
|
||||
*
|
||||
* @param[in] mem_resource_id Resource ID of the memory to be used for the pool.
|
||||
*
|
||||
* @param[in] mem_offset Offset of the memory to be used for the pool.
|
||||
|
||||
* @param[in] mem_size Size of the memory to be used for the pool.
|
||||
*
|
||||
* @param[in] chunk_size Size of each chunk in the pool.
|
||||
*
|
||||
* @param[in] num_chunks Number of chunks in the pool.
|
||||
*
|
||||
* @param[in] mem_base_va Virtual address of the memory to be used for the pool.
|
||||
* The virtual address is the base address of the resource.
|
||||
*/
|
||||
enum pva_error pva_kmd_cmdbuf_chunk_pool_init(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool,
|
||||
uint32_t mem_resource_id, uint64_t mem_offset, uint32_t mem_size,
|
||||
uint16_t chunk_size, uint32_t num_chunks, void *mem_base_va);
|
||||
|
||||
void pva_kmd_cmdbuf_chunk_pool_deinit(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool);
|
||||
|
||||
/**
|
||||
* Allocate a chunk from the pool.
|
||||
*
|
||||
* If the chunk is submitted, then free will be done automatically when
|
||||
* free-notifier fence is triggered.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_kmd_alloc_cmdbuf_chunk(struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool,
|
||||
uint32_t *out_chunk_id);
|
||||
|
||||
/**
|
||||
* Free a linked list of chunks.
|
||||
*
|
||||
* We only need to call this function if we decide not to submit the chunks,
|
||||
* usually in error path.
|
||||
*/
|
||||
void pva_kmd_free_linked_cmdbuf_chunks(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool, uint32_t chunk_id);
|
||||
|
||||
/**
|
||||
* Get the free-notifier fence.
|
||||
*
|
||||
 * @param[in] chunk_id The first chunk of the command buffer to be submitted.
 *
 * @param[out] fence The free-notifier fence that should be submitted with the
 * command buffer.
|
||||
*/
|
||||
void pva_kmd_get_free_notifier_fence(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool, uint32_t chunk_id,
|
||||
struct pva_fw_postfence *fence);
|
||||
|
||||
static inline void *
|
||||
pva_kmd_get_cmdbuf_chunk_va(struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool,
|
||||
uint32_t chunk_id)
|
||||
{
|
||||
return (void *)((uintptr_t)cmdbuf_chunk_pool->mem_base_va +
|
||||
chunk_id * cmdbuf_chunk_pool->chunk_size);
|
||||
}
|
||||
|
||||
static inline uint64_t pva_kmd_get_cmdbuf_chunk_res_offset(
|
||||
struct pva_kmd_cmdbuf_chunk_pool *cmdbuf_chunk_pool, uint32_t chunk_id)
|
||||
{
|
||||
uint64_t chunk_size = (uint64_t)safe_mulu32(
|
||||
chunk_id, (uint32_t)cmdbuf_chunk_pool->chunk_size);
|
||||
return safe_addu64(cmdbuf_chunk_pool->mem_offset, chunk_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility for building a command buffer with multiple chunks.
|
||||
*
|
||||
* The builder will automatically allocate chunks from the pool when the current
|
||||
* chunk is full.
|
||||
*/
|
||||
struct pva_kmd_cmdbuf_builder {
|
||||
uint16_t first_chunk_size;
|
||||
uint16_t current_chunk_offset;
|
||||
uint32_t first_chunk_id;
|
||||
uint32_t current_chunk_id;
|
||||
struct pva_kmd_cmdbuf_chunk_pool *pool;
|
||||
uint16_t *chunk_size_ptr; /**< Pointer to the chunk size field of the previous link_chunk command */
|
||||
};
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_cmdbuf_builder_init(struct pva_kmd_cmdbuf_builder *builder,
|
||||
struct pva_kmd_cmdbuf_chunk_pool *chunk_pool);
|
||||
|
||||
void *pva_kmd_reserve_cmd_space(struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint16_t size);
|
||||
void pva_kmd_cmdbuf_builder_finalize(struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint32_t *out_first_chunk_id,
|
||||
uint16_t *out_first_chunk_size);
|
||||
|
||||
void pva_kmd_cmdbuf_builder_cancel(struct pva_kmd_cmdbuf_builder *builder);
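
/*
 * Illustrative usage sketch, not part of this commit: build a command buffer
 * from a pool, fetch the free-notifier post fence for its first chunk, and
 * hand both to the submission path. The submit step itself is elided (it goes
 * through the submitter), and command encoding via the pva_kmd_set_cmd_*
 * helpers further below is skipped so the sketch only uses declarations
 * already visible at this point in the header.
 */
static inline enum pva_error
pva_kmd_cmdbuf_usage_sketch(struct pva_kmd_cmdbuf_chunk_pool *pool,
			    struct pva_fw_postfence *free_fence)
{
	struct pva_kmd_cmdbuf_builder builder;
	uint32_t first_chunk_id;
	uint16_t first_chunk_size;
	void *cmd_space;
	enum pva_error err;

	err = pva_kmd_cmdbuf_builder_init(&builder, pool);
	if (err != PVA_SUCCESS) {
		return err;
	}

	/* Space is returned zeroed; a real caller encodes a command here. */
	cmd_space = pva_kmd_reserve_cmd_space(
		&builder, (uint16_t)sizeof(struct pva_cmd_deinit_queue));
	if (cmd_space == NULL) {
		/* Pool exhausted: release the chunks linked so far. */
		pva_kmd_cmdbuf_builder_cancel(&builder);
		return PVA_NOMEM;
	}

	pva_kmd_cmdbuf_builder_finalize(&builder, &first_chunk_id,
					&first_chunk_size);
	/* Submitting this fence with the chunks lets the pool recycle the
	 * whole chain once FW writes CHUNK_STATE_FENCE_TRIGGERED to it. */
	pva_kmd_get_free_notifier_fence(pool, first_chunk_id, free_fence);
	return PVA_SUCCESS;
}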
|
||||
|
||||
static inline void pva_kmd_set_cmd_init_resource_table(
|
||||
struct pva_cmd_init_resource_table *cmd, uint8_t resource_table_id,
|
||||
uint64_t iova_addr, uint32_t max_num_entries)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_INIT_RESOURCE_TABLE;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->resource_table_id = resource_table_id;
|
||||
cmd->resource_table_addr_lo = iova_lo(iova_addr);
|
||||
cmd->resource_table_addr_hi = iova_hi(iova_addr);
|
||||
cmd->max_n_entries = max_num_entries;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_kmd_set_cmd_deinit_resource_table(struct pva_cmd_deinit_resource_table *cmd,
|
||||
uint8_t resource_table_id)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_RESOURCE_TABLE;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->resource_table_id = resource_table_id;
|
||||
}
|
||||
|
||||
static inline void pva_kmd_set_cmd_init_queue(struct pva_cmd_init_queue *cmd,
|
||||
uint8_t ccq_id, uint8_t queue_id,
|
||||
uint64_t iova_addr,
|
||||
uint32_t max_num_submit)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_INIT_QUEUE;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->ccq_id = ccq_id;
|
||||
cmd->queue_id = queue_id;
|
||||
cmd->queue_addr_lo = iova_lo(iova_addr);
|
||||
cmd->queue_addr_hi = iova_hi(iova_addr);
|
||||
cmd->max_n_submits = max_num_submit;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_kmd_set_cmd_deinit_queue(struct pva_cmd_deinit_queue *cmd, uint8_t ccq_id,
|
||||
uint8_t queue_id)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_DEINIT_QUEUE;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->ccq_id = ccq_id;
|
||||
cmd->queue_id = queue_id;
|
||||
}
|
||||
|
||||
static inline void pva_kmd_set_cmd_update_resource_table(
|
||||
struct pva_cmd_update_resource_table *cmd, uint32_t resource_table_id,
|
||||
uint32_t resource_id, struct pva_resource_entry const *entry)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_UPDATE_RESOURCE_TABLE;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->resource_table_id = resource_table_id;
|
||||
cmd->resource_id = resource_id;
|
||||
cmd->entry = *entry;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_kmd_set_cmd_unregister_resource(struct pva_cmd_unregister_resource *cmd,
|
||||
uint32_t resource_id)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_UNREGISTER_RESOURCE;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->resource_id = resource_id;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_kmd_set_cmd_enable_fw_profiling(struct pva_cmd_enable_fw_profiling *cmd,
|
||||
uint32_t buffer_resource_id,
|
||||
uint32_t buffer_size, uint64_t offset,
|
||||
uint32_t filter, uint8_t timestamp_type)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_ENABLE_FW_PROFILING;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->buffer_resource_id = buffer_resource_id;
|
||||
cmd->buffer_offset_hi = iova_hi(offset);
|
||||
cmd->buffer_offset_lo = iova_lo(offset);
|
||||
cmd->buffer_size = buffer_size;
|
||||
cmd->filter = filter;
|
||||
cmd->timestamp_type = timestamp_type;
|
||||
}
|
||||
|
||||
static inline void
|
||||
pva_kmd_set_cmd_disable_fw_profiling(struct pva_cmd_disable_fw_profiling *cmd)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_DISABLE_FW_PROFILING;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
}
|
||||
|
||||
static inline void pva_kmd_set_cmd_get_tegra_stats(
|
||||
struct pva_cmd_get_tegra_stats *cmd, uint32_t buffer_resource_id,
|
||||
uint32_t buffer_size, uint64_t offset, bool enabled)
|
||||
{
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_GET_TEGRA_STATS;
|
||||
cmd->header.len = sizeof(*cmd) / sizeof(uint32_t);
|
||||
cmd->buffer_resource_id = buffer_resource_id;
|
||||
cmd->buffer_offset_hi = iova_hi(offset);
|
||||
cmd->buffer_offset_lo = iova_lo(offset);
|
||||
cmd->buffer_size = buffer_size;
|
||||
cmd->enabled = enabled;
|
||||
}
|
||||
|
||||
static inline void pva_kmd_set_cmd_suspend_fw(struct pva_cmd_suspend_fw *cmd)
|
||||
{
|
||||
uint64_t len = (sizeof(*cmd) / sizeof(uint32_t));
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_SUSPEND_FW;
|
||||
ASSERT(len <= 255u);
|
||||
cmd->header.len = (uint8_t)(len);
|
||||
}
|
||||
|
||||
static inline void pva_kmd_set_cmd_resume_fw(struct pva_cmd_resume_fw *cmd)
|
||||
{
|
||||
uint64_t len = (sizeof(*cmd) / sizeof(uint32_t));
|
||||
cmd->header.opcode = PVA_CMD_OPCODE_RESUME_FW;
|
||||
ASSERT(len <= 255u);
|
||||
cmd->header.len = (uint8_t)(len);
|
||||
}
|
||||
#endif // PVA_KMD_CMDBUF_H
|
||||
@@ -0,0 +1,62 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_CONSTANTS_H
|
||||
#define PVA_KMD_CONSTANTS_H
|
||||
#include "pva_constants.h"
|
||||
/* Limits related to KMD's own submission*/
|
||||
#define PVA_KMD_MAX_NUM_KMD_RESOURCES 32
|
||||
#define PVA_KMD_MAX_NUM_KMD_DMA_CONFIGS 1
|
||||
#define PVA_KMD_MAX_NUM_KMD_CHUNKS 32
|
||||
#define PVA_KMD_MAX_NUM_KMD_SUBMITS 32
|
||||
|
||||
/* Limits related to User's privileged submission */
|
||||
#define PVA_KMD_MAX_NUM_PRIV_CHUNKS 256
|
||||
#define PVA_KMD_MAX_NUM_PRIV_SUBMITS 256
|
||||
|
||||
#define PVA_KMD_USER_CONTEXT_ID_BASE 1u
|
||||
#define PVA_KMD_PVA0_T23x_REG_BASE 0x16000000
|
||||
#define PVA_KMD_PVA0_T23x_REG_SIZE 0x800000
|
||||
|
||||
#define PVA_KMD_TIMEOUT_INF UINT64_MAX
|
||||
|
||||
// clang-format off
|
||||
#if PVA_BUILD_MODE == PVA_BUILD_MODE_SIM
|
||||
#define PVA_KMD_TIMEOUT_FACTOR 100
|
||||
#else
|
||||
#define PVA_KMD_TIMEOUT_FACTOR 1
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
#define PVA_KMD_TIMEOUT(val) (val * PVA_KMD_TIMEOUT_FACTOR)
|
||||
|
||||
#define PVA_KMD_TIMEOUT_RESOURCE_SEMA_MS PVA_KMD_TIMEOUT(100) /*< 100 ms */
|
||||
#define PVA_KMD_WAIT_FW_TIMEOUT_US PVA_KMD_TIMEOUT(1000000) /*< 1 second*/
|
||||
#define PVA_KMD_WAIT_FW_POLL_INTERVAL_US PVA_KMD_TIMEOUT(100) /*< 100 us*/
|
||||
#define PVA_KMD_FW_BOOT_TIMEOUT_MS PVA_KMD_TIMEOUT(1000) /*< 1 second */
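
/*
 * Worked example of the factor above: on silicon PVA_KMD_TIMEOUT_FACTOR is 1,
 * so PVA_KMD_FW_BOOT_TIMEOUT_MS expands to 1000 ms; in a PVA_BUILD_MODE_SIM
 * build the factor is 100 and the same macro yields 100000 ms to absorb the
 * slower simulation platform.
 */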
|
||||
|
||||
#define PVA_NUM_RW_SYNCPTS 56
|
||||
|
||||
// clang-format off
|
||||
#if PVA_DEV_MAIN_COMPATIBLE == 1
|
||||
#define PVA_KMD_LOAD_FROM_GSC_DEFAULT true
|
||||
#if PVA_SAFETY == 1
|
||||
#define PVA_KMD_APP_AUTH_DEFAULT true
|
||||
#else
|
||||
#define PVA_KMD_APP_AUTH_DEFAULT false
|
||||
#endif
|
||||
#else
|
||||
#define PVA_KMD_LOAD_FROM_GSC_DEFAULT false
|
||||
#define PVA_KMD_APP_AUTH_DEFAULT false
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
#endif // PVA_KMD_CONSTANTS_H
|
||||
363
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_context.c
Normal file
363
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_context.c
Normal file
@@ -0,0 +1,363 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_constants.h"
|
||||
#include "pva_api_cmdbuf.h"
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_context.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
|
||||
struct pva_kmd_context *pva_kmd_context_create(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint32_t alloc_id;
|
||||
enum pva_error err;
|
||||
struct pva_kmd_context *ctx;
|
||||
|
||||
ctx = pva_kmd_zalloc_block(&pva->context_allocator, &alloc_id);
|
||||
if (ctx == NULL) {
|
||||
goto err_out;
|
||||
}
|
||||
ctx->ccq_id = alloc_id;
|
||||
ctx->resource_table_id = ctx->ccq_id;
|
||||
ctx->smmu_ctx_id = ctx->ccq_id;
|
||||
ctx->pva = pva;
|
||||
ctx->max_n_queues = PVA_MAX_NUM_QUEUES_PER_CONTEXT;
|
||||
ctx->ccq0_lock_ptr = &pva->ccq0_lock;
|
||||
pva_kmd_mutex_init(&ctx->ccq_lock);
|
||||
pva_kmd_mutex_init(&ctx->resource_table_lock);
|
||||
ctx->queue_allocator_mem = pva_kmd_zalloc(sizeof(struct pva_kmd_queue) *
|
||||
ctx->max_n_queues);
|
||||
if (ctx->queue_allocator_mem == NULL) {
|
||||
goto free_ctx;
|
||||
}
|
||||
|
||||
err = pva_kmd_block_allocator_init(&ctx->queue_allocator,
|
||||
ctx->queue_allocator_mem, 0,
|
||||
sizeof(struct pva_kmd_queue),
|
||||
ctx->max_n_queues);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_queue_mem;
|
||||
}
|
||||
|
||||
return ctx;
|
||||
free_queue_mem:
|
||||
pva_kmd_free(ctx->queue_allocator_mem);
|
||||
free_ctx:
|
||||
pva_kmd_free(ctx);
|
||||
err_out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static enum pva_error notify_fw_context_init(struct pva_kmd_context *ctx)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter;
|
||||
struct pva_cmd_init_resource_table *res_cmd;
|
||||
struct pva_cmd_init_queue *queue_cmd;
|
||||
struct pva_cmd_update_resource_table *update_cmd;
|
||||
struct pva_resource_entry entry = { 0 };
|
||||
uint32_t fence_val;
|
||||
enum pva_error err;
|
||||
|
||||
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
res_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*res_cmd));
|
||||
ASSERT(res_cmd != NULL);
|
||||
|
||||
pva_kmd_set_cmd_init_resource_table(
|
||||
res_cmd, ctx->resource_table_id,
|
||||
ctx->ctx_resource_table.table_mem->iova,
|
||||
ctx->ctx_resource_table.n_entries);
|
||||
|
||||
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
|
||||
ASSERT(queue_cmd != NULL);
|
||||
|
||||
pva_kmd_set_cmd_init_queue(
|
||||
queue_cmd, PVA_PRIV_CCQ_ID,
|
||||
ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/
|
||||
ctx->ctx_queue.queue_memory->iova,
|
||||
ctx->ctx_queue.max_num_submit);
|
||||
|
||||
update_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*update_cmd));
|
||||
ASSERT(update_cmd != NULL);
|
||||
|
||||
err = pva_kmd_make_resource_entry(&ctx->pva->dev_resource_table,
|
||||
ctx->submit_memory_resource_id,
|
||||
&entry);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
pva_kmd_set_cmd_update_resource_table(update_cmd,
|
||||
0, /* KMD's resource table ID */
|
||||
ctx->submit_memory_resource_id,
|
||||
&entry);
|
||||
|
||||
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
// Error is either QUEUE_FULL or TIMEDOUT
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"Waiting for FW timed out when initializing context");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
|
||||
cancel_builder:
|
||||
pva_kmd_cmdbuf_builder_cancel(&builder);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error notify_fw_context_deinit(struct pva_kmd_context *ctx)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_kmd_submitter *dev_submitter = &ctx->pva->submitter;
|
||||
struct pva_cmd_deinit_resource_table *deinit_table_cmd;
|
||||
struct pva_cmd_deinit_queue *deinit_queue_cmd;
|
||||
uint32_t fence_val;
|
||||
enum pva_error err;
|
||||
|
||||
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
deinit_queue_cmd =
|
||||
pva_kmd_reserve_cmd_space(&builder, sizeof(*deinit_queue_cmd));
|
||||
ASSERT(deinit_queue_cmd != NULL);
|
||||
pva_kmd_set_cmd_deinit_queue(
|
||||
deinit_queue_cmd, PVA_PRIV_CCQ_ID,
|
||||
ctx->ccq_id /* For privileged queues, queue ID == user CCQ ID*/
|
||||
);
|
||||
|
||||
deinit_table_cmd =
|
||||
pva_kmd_reserve_cmd_space(&builder, sizeof(*deinit_table_cmd));
|
||||
ASSERT(deinit_table_cmd != NULL);
|
||||
pva_kmd_set_cmd_deinit_resource_table(deinit_table_cmd,
|
||||
ctx->resource_table_id);
|
||||
|
||||
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"Waiting for FW timed out when deinitializing context");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
cancel_builder:
|
||||
pva_kmd_cmdbuf_builder_cancel(&builder);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
|
||||
uint32_t res_table_capacity)
|
||||
{
|
||||
enum pva_error err;
|
||||
uint32_t queue_mem_size;
|
||||
uint64_t chunk_mem_size;
|
||||
struct pva_fw_postfence post_fence = { 0 };
|
||||
struct pva_syncpt_rw_info *syncpts;
|
||||
uint64_t size;
|
||||
|
||||
/* Power on PVA if not already */
|
||||
err = pva_kmd_device_busy(ctx->pva);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/* Allocate RW syncpoints for this context */
|
||||
syncpts = (struct pva_syncpt_rw_info *)pva_kmd_alloc_block(
|
||||
&ctx->pva->syncpt_allocator, &ctx->syncpt_block_index);
|
||||
ASSERT(syncpts != NULL);
|
||||
|
||||
/* Init resource table for this context */
|
||||
err = pva_kmd_resource_table_init(&ctx->ctx_resource_table, ctx->pva,
|
||||
ctx->smmu_ctx_id, res_table_capacity,
|
||||
res_table_capacity);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto drop_device;
|
||||
}
|
||||
|
||||
/* Init privileged queue for this context */
|
||||
queue_mem_size = pva_get_submission_queue_memory_size(
|
||||
PVA_KMD_MAX_NUM_PRIV_SUBMITS);
|
||||
ctx->ctx_queue_mem =
|
||||
pva_kmd_device_memory_alloc_map(queue_mem_size, ctx->pva,
|
||||
PVA_ACCESS_RW,
|
||||
PVA_R5_SMMU_CONTEXT_ID);
|
||||
if (ctx->ctx_queue_mem == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto deinit_table;
|
||||
}
|
||||
|
||||
pva_kmd_queue_init(
|
||||
&ctx->ctx_queue, ctx->pva, PVA_PRIV_CCQ_ID,
|
||||
ctx->ccq_id, /* Context's PRIV queue ID is identical to CCQ ID */
|
||||
&ctx->pva->ccq0_lock, ctx->ctx_queue_mem,
|
||||
PVA_KMD_MAX_NUM_PRIV_SUBMITS);
|
||||
|
||||
/* Allocate memory for submission */
|
||||
chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
|
||||
PVA_MAX_CMDBUF_CHUNK_SIZE, PVA_KMD_MAX_NUM_PRIV_CHUNKS);
|
||||
/* Allocate one post fence at the end. This memory will be added to
|
||||
* KMD's own resource table. We don't need to explicitly free it. It
|
||||
* will be freed after we drop the resource. */
|
||||
size = safe_addu64(chunk_mem_size, (uint64_t)sizeof(uint32_t));
|
||||
ctx->submit_memory = pva_kmd_device_memory_alloc_map(
|
||||
size, ctx->pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
if (ctx->submit_memory == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto queue_deinit;
|
||||
}
|
||||
|
||||
/* Add submit memory to resource table */
|
||||
pva_kmd_mutex_lock(&ctx->pva->resource_table_lock);
|
||||
err = pva_kmd_add_dram_buffer_resource(&ctx->pva->dev_resource_table,
|
||||
ctx->submit_memory,
|
||||
&ctx->submit_memory_resource_id);
|
||||
pva_kmd_mutex_unlock(&ctx->pva->resource_table_lock);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_submit_memory;
|
||||
}
|
||||
|
||||
/* Init chunk pool */
|
||||
err = pva_kmd_cmdbuf_chunk_pool_init(
|
||||
&ctx->chunk_pool, ctx->submit_memory_resource_id,
|
||||
0 /* offset */, chunk_mem_size, PVA_MAX_CMDBUF_CHUNK_SIZE,
|
||||
PVA_KMD_MAX_NUM_PRIV_CHUNKS, ctx->submit_memory->va);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_dram_buffer_resource;
|
||||
}
|
||||
|
||||
/* Init fence */
|
||||
ctx->fence_offset = chunk_mem_size;
|
||||
|
||||
/* Init submitter */
|
||||
pva_kmd_mutex_init(&ctx->submit_lock);
|
||||
pva_kmd_mutex_init(&ctx->chunk_pool_lock);
|
||||
post_fence.resource_id = ctx->submit_memory_resource_id;
|
||||
post_fence.offset_lo = iova_lo(ctx->fence_offset);
|
||||
post_fence.offset_hi = iova_hi(ctx->fence_offset);
|
||||
post_fence.ts_resource_id = PVA_RESOURCE_ID_INVALID;
|
||||
pva_kmd_submitter_init(
|
||||
&ctx->submitter, &ctx->ctx_queue, &ctx->submit_lock,
|
||||
&ctx->chunk_pool, &ctx->chunk_pool_lock,
|
||||
pva_offset_pointer(ctx->submit_memory->va, ctx->fence_offset),
|
||||
&post_fence);
|
||||
|
||||
/* Use KMD's queue to inform FW */
|
||||
err = notify_fw_context_init(ctx);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto deinit_submitter;
|
||||
}
|
||||
ctx->inited = true;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
|
||||
deinit_submitter:
|
||||
pva_kmd_mutex_deinit(&ctx->chunk_pool_lock);
|
||||
pva_kmd_mutex_deinit(&ctx->submit_lock);
|
||||
pva_kmd_cmdbuf_chunk_pool_deinit(&ctx->chunk_pool);
|
||||
free_dram_buffer_resource:
|
||||
pva_kmd_drop_resource(&ctx->pva->dev_resource_table,
|
||||
ctx->submit_memory_resource_id);
|
||||
free_submit_memory:
|
||||
pva_kmd_device_memory_free(ctx->submit_memory);
|
||||
queue_deinit:
|
||||
pva_kmd_queue_deinit(&ctx->ctx_queue);
|
||||
pva_kmd_device_memory_free(ctx->ctx_queue_mem);
|
||||
deinit_table:
|
||||
pva_kmd_resource_table_deinit(&ctx->ctx_resource_table);
|
||||
drop_device:
|
||||
pva_kmd_device_idle(ctx->pva);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_context_deinit(struct pva_kmd_context *ctx)
|
||||
{
|
||||
enum pva_error err;
|
||||
|
||||
if (ctx->inited) {
|
||||
err = notify_fw_context_deinit(ctx);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_verify_all_resources_free(&ctx->ctx_resource_table);
|
||||
pva_kmd_device_idle(ctx->pva);
|
||||
pva_kmd_mutex_deinit(&ctx->submit_lock);
|
||||
pva_kmd_mutex_deinit(&ctx->chunk_pool_lock);
|
||||
pva_kmd_cmdbuf_chunk_pool_deinit(&ctx->chunk_pool);
|
||||
pva_kmd_mutex_lock(&ctx->pva->resource_table_lock);
|
||||
pva_kmd_drop_resource(&ctx->pva->dev_resource_table,
|
||||
ctx->submit_memory_resource_id);
|
||||
pva_kmd_mutex_unlock(&ctx->pva->resource_table_lock);
|
||||
pva_kmd_queue_deinit(&ctx->ctx_queue);
|
||||
pva_kmd_device_memory_free(ctx->ctx_queue_mem);
|
||||
pva_kmd_resource_table_deinit(&ctx->ctx_resource_table);
|
||||
pva_kmd_free_block(&ctx->pva->syncpt_allocator,
|
||||
ctx->syncpt_block_index);
|
||||
ctx->inited = false;
|
||||
}
|
||||
}
|
||||
|
||||
static void pva_kmd_destroy_all_queues(struct pva_kmd_context *ctx)
|
||||
{
|
||||
enum pva_error err;
|
||||
struct pva_kmd_queue_destroy_in_args args;
|
||||
|
||||
for (uint32_t queue_id = 0u; queue_id < ctx->max_n_queues; queue_id++) {
|
||||
struct pva_kmd_queue *queue =
|
||||
pva_kmd_get_block(&ctx->queue_allocator, queue_id);
|
||||
if (queue != NULL) {
|
||||
args.queue_id = queue_id;
|
||||
err = pva_kmd_queue_destroy(ctx, &args);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_context_destroy(struct pva_kmd_context *ctx)
|
||||
{
|
||||
enum pva_error err;
|
||||
|
||||
pva_kmd_destroy_all_queues(ctx);
|
||||
pva_kmd_context_deinit(ctx);
|
||||
pva_kmd_block_allocator_deinit(&ctx->queue_allocator);
|
||||
pva_kmd_free(ctx->queue_allocator_mem);
|
||||
pva_kmd_mutex_deinit(&ctx->ccq_lock);
|
||||
pva_kmd_mutex_deinit(&ctx->resource_table_lock);
|
||||
err = pva_kmd_free_block(&ctx->pva->context_allocator, ctx->ccq_id);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
}
|
||||
|
||||
struct pva_kmd_context *pva_kmd_get_context(struct pva_kmd_device *pva,
|
||||
uint8_t alloc_id)
|
||||
{
|
||||
return pva_kmd_get_block(&pva->context_allocator, alloc_id);
|
||||
}
|
||||
104
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_context.h
Normal file
104
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_context.h
Normal file
@@ -0,0 +1,104 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_CONTEXT_H
|
||||
#define PVA_KMD_CONTEXT_H
|
||||
#include "pva_api.h"
|
||||
#include "pva_constants.h"
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_mutex.h"
|
||||
#include "pva_kmd_submitter.h"
|
||||
|
||||
struct pva_kmd_device;
|
||||
|
||||
/**
|
||||
* @brief This struct manages a user context in KMD.
|
||||
*
|
||||
* One KMD user context is uniquely mapped to a UMD user context. Each context
|
||||
* is assigned a unique CCQ block and, on QNX and Linux, a unique file
|
||||
* descriptor.
|
||||
*/
|
||||
struct pva_kmd_context {
|
||||
struct pva_kmd_device *pva;
|
||||
uint8_t resource_table_id;
|
||||
uint8_t ccq_id;
|
||||
uint8_t smmu_ctx_id;
|
||||
|
||||
bool inited;
|
||||
|
||||
pva_kmd_mutex_t resource_table_lock;
|
||||
struct pva_kmd_resource_table ctx_resource_table;
|
||||
|
||||
struct pva_kmd_submitter submitter;
|
||||
/** The lock protects the submission to the queue, including
|
||||
* incrementing the post fence */
|
||||
pva_kmd_mutex_t submit_lock;
|
||||
/** Privileged queue owned by this context. It uses the privileged
|
||||
* resource table (ID 0). */
|
||||
struct pva_kmd_device_memory *ctx_queue_mem;
|
||||
|
||||
/** Privileged queue owned by the context */
|
||||
struct pva_kmd_queue ctx_queue;
|
||||
/** Pointer to the ccq0 lock owned by device*/
|
||||
pva_kmd_mutex_t *ccq0_lock_ptr;
|
||||
|
||||
/** memory needed for submission: including command buffer chunks and fences */
|
||||
struct pva_kmd_device_memory *submit_memory;
|
||||
/** Resource ID of the submission memory, registered with the privileged resource table (ID 0) */
|
||||
uint32_t submit_memory_resource_id;
|
||||
uint64_t fence_offset; /**< fence offset within submit_memory*/
|
||||
|
||||
pva_kmd_mutex_t chunk_pool_lock;
|
||||
struct pva_kmd_cmdbuf_chunk_pool chunk_pool;
|
||||
|
||||
uint32_t max_n_queues;
|
||||
void *queue_allocator_mem;
|
||||
struct pva_kmd_block_allocator queue_allocator;
|
||||
|
||||
/** This lock protects the context's own CCQ access. We don't really use
|
||||
* it because we don't do user queue submission in KMD.
|
||||
*/
|
||||
pva_kmd_mutex_t ccq_lock;
|
||||
void *plat_data;
|
||||
uint64_t ccq_shm_handle;
|
||||
|
||||
/** Index of block of syncpoints allocated for this context */
|
||||
uint32_t syncpt_block_index;
|
||||
uint32_t syncpt_ids[PVA_NUM_RW_SYNCPTS_PER_CONTEXT];
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Allocate a KMD context.
|
||||
*/
|
||||
struct pva_kmd_context *pva_kmd_context_create(struct pva_kmd_device *pva);
|
||||
|
||||
/**
|
||||
* @brief Destroy a KMD context.
|
||||
*/
|
||||
void pva_kmd_context_destroy(struct pva_kmd_context *client);
|
||||
|
||||
/**
|
||||
* @brief Initialize a KMD context.
|
||||
*
|
||||
 * Sets up the context's resource table (sized by res_table_capacity), its
 * privileged queue, command buffer chunk pool and submitter, and then
 * notifies the FW through the KMD submission path.
|
||||
*/
|
||||
enum pva_error pva_kmd_context_init(struct pva_kmd_context *ctx,
|
||||
uint32_t res_table_capacity);
|
||||
|
||||
void pva_kmd_context_deinit(struct pva_kmd_context *ctx);
|
||||
|
||||
struct pva_kmd_context *pva_kmd_get_context(struct pva_kmd_device *pva,
|
||||
uint8_t alloc_id);
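
/*
 * Lifecycle sketch (illustrative, not part of this commit): how a platform
 * layer is expected to drive a context with the functions declared above.
 * The resource table capacity of 64 is an assumption made for the example.
 */
static inline struct pva_kmd_context *
pva_kmd_context_lifecycle_sketch(struct pva_kmd_device *pva)
{
	struct pva_kmd_context *ctx;

	ctx = pva_kmd_context_create(pva);
	if (ctx == NULL) {
		return NULL;
	}
	/* Powers on the device, sets up the resource table, privileged queue,
	 * chunk pool and submitter, and informs the FW. */
	if (pva_kmd_context_init(ctx, 64U) != PVA_SUCCESS) {
		/* Destroy also handles a context that was never initialized. */
		pva_kmd_context_destroy(ctx);
		return NULL;
	}
	/* ... serve user submissions; on teardown call
	 * pva_kmd_context_destroy(), which deinitializes first ... */
	return ctx;
}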
|
||||
|
||||
#endif // PVA_KMD_CONTEXT_H
|
||||
142
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_debugfs.c
Normal file
142
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_debugfs.c
Normal file
@@ -0,0 +1,142 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_debugfs.h"
|
||||
#include "pva_kmd_fw_profiler.h"
|
||||
#include "pva_kmd_silicon_utils.h"
|
||||
#include "pva_kmd_vpu_ocd.h"
|
||||
#include "pva_kmd_tegra_stats.h"
|
||||
#include "pva_kmd_vpu_app_auth.h"
|
||||
|
||||
void pva_kmd_debugfs_create_nodes(struct pva_kmd_device *pva)
|
||||
{
|
||||
static const char *vpu_ocd_names[NUM_VPU_BLOCKS] = { "ocd_vpu0_v3",
|
||||
"ocd_vpu1_v3" };
|
||||
pva_kmd_debugfs_create_bool(pva, "stats_enable",
|
||||
&pva->debugfs_context.stats_enable);
|
||||
pva_kmd_debugfs_create_bool(pva, "vpu_debug",
|
||||
&pva->debugfs_context.vpu_debug);
|
||||
pva_kmd_debugfs_create_u32(pva, "profile_level",
|
||||
&pva->debugfs_context.profile_level);
|
||||
pva->debugfs_context.vpu_fops.read = &update_vpu_stats;
|
||||
pva->debugfs_context.vpu_fops.pdev = pva;
|
||||
pva_kmd_debugfs_create_file(pva, "vpu_stats",
|
||||
&pva->debugfs_context.vpu_fops);
|
||||
for (uint32_t i = 0; i < NUM_VPU_BLOCKS; i++) {
|
||||
pva->debugfs_context.vpu_ocd_fops[i].open =
|
||||
&pva_kmd_vpu_ocd_open;
|
||||
pva->debugfs_context.vpu_ocd_fops[i].release =
|
||||
&pva_kmd_vpu_ocd_release;
|
||||
pva->debugfs_context.vpu_ocd_fops[i].read =
|
||||
&pva_kmd_vpu_ocd_read;
|
||||
pva->debugfs_context.vpu_ocd_fops[i].write =
|
||||
&pva_kmd_vpu_ocd_write;
|
||||
pva->debugfs_context.vpu_ocd_fops[i].pdev = pva;
|
||||
pva->debugfs_context.vpu_ocd_fops[i].file_data =
|
||||
(void *)&pva->regspec.vpu_dbg_instr_reg_offset[i];
|
||||
pva_kmd_debugfs_create_file(
|
||||
pva, vpu_ocd_names[i],
|
||||
&pva->debugfs_context.vpu_ocd_fops[i]);
|
||||
}
|
||||
|
||||
pva->debugfs_context.allowlist_fops.write = &update_vpu_allowlist;
|
||||
pva->debugfs_context.allowlist_fops.pdev = pva;
|
||||
pva_kmd_debugfs_create_file(pva, "vpu_app_authentication",
|
||||
&pva->debugfs_context.allowlist_fops);
|
||||
|
||||
pva_kmd_device_init_profiler(pva);
|
||||
pva_kmd_device_init_tegra_stats(pva);
|
||||
}
|
||||
|
||||
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_device_deinit_tegra_stats(pva);
|
||||
pva_kmd_device_deinit_profiler(pva);
|
||||
pva_kmd_debugfs_remove_nodes(pva);
|
||||
}
|
||||
|
||||
static int64_t print_vpu_stats(struct pva_kmd_tegrastats *kmd_tegra_stats,
|
||||
uint8_t *out_buffer, uint64_t len)
|
||||
{
|
||||
char kernel_buffer[256];
|
||||
int64_t formatted_len;
|
||||
|
||||
formatted_len = snprintf(
|
||||
kernel_buffer, sizeof(kernel_buffer),
|
||||
"%llu\n%llu\n%llu\n%llu\n",
|
||||
(long long unsigned int)(kmd_tegra_stats->window_start_time),
|
||||
(long long unsigned int)(kmd_tegra_stats->window_end_time),
|
||||
(long long unsigned int)
|
||||
kmd_tegra_stats->average_vpu_utilization[0],
|
||||
(long long unsigned int)
|
||||
kmd_tegra_stats->average_vpu_utilization[1]);
|
||||
|
||||
if (formatted_len <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
formatted_len++; //accounting for null terminating character
|
||||
|
||||
if (len < (uint64_t)formatted_len) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Copy the formatted string from kernel buffer to user buffer
|
||||
if (pva_kmd_copy_data_to_user(out_buffer, kernel_buffer,
|
||||
formatted_len)) {
|
||||
pva_kmd_log_err("failed to copy read buffer to user");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return formatted_len;
|
||||
}
|
||||
|
||||
int64_t update_vpu_stats(struct pva_kmd_device *dev, void *file_data,
|
||||
uint8_t *out_buffer, uint64_t offset, uint64_t size)
|
||||
{
|
||||
uint64_t size_read = 0U;
|
||||
struct pva_kmd_tegrastats kmd_tegra_stats;
|
||||
|
||||
kmd_tegra_stats.window_start_time = 0;
|
||||
kmd_tegra_stats.window_end_time = 0;
|
||||
kmd_tegra_stats.average_vpu_utilization[0] = 0;
|
||||
kmd_tegra_stats.average_vpu_utilization[1] = 0;
|
||||
|
||||
pva_kmd_log_err("Reading VPU stats");
|
||||
pva_kmd_notify_fw_get_tegra_stats(dev, &kmd_tegra_stats);
|
||||
|
||||
size_read = print_vpu_stats(&kmd_tegra_stats, out_buffer, size);
|
||||
|
||||
return size_read;
|
||||
}
|
||||
|
||||
int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
|
||||
const uint8_t *in_buffer, uint64_t offset,
|
||||
uint64_t size)
|
||||
{
|
||||
char strbuf[2]; // 1 byte for '0' or '1' and another 1 byte for the Null character
|
||||
uint32_t pva_auth_enable;
|
||||
unsigned long retval;
|
||||
retval = pva_kmd_copy_data_from_user(strbuf, in_buffer, sizeof(strbuf));
|
||||
if (retval != 0u) {
|
||||
pva_kmd_log_err("Failed to copy write buffer from user");
|
||||
return -1;
|
||||
}
|
||||
|
||||
pva_auth_enable = pva_kmd_strtol(strbuf, 16);
|
||||
|
||||
pva->pva_auth->pva_auth_enable = (pva_auth_enable == 1) ? true : false;
|
||||
|
||||
if (pva->pva_auth->pva_auth_enable)
|
||||
pva->pva_auth->pva_auth_allow_list_parsed = false;
|
||||
|
||||
return 2;
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_DEBUGFS_H
|
||||
#define PVA_KMD_DEBUGFS_H
|
||||
#include "pva_kmd.h"
|
||||
#include "pva_kmd_shim_debugfs.h"
|
||||
#include "pva_kmd_fw_profiler.h"
|
||||
|
||||
#define NUM_VPU_BLOCKS 2U
|
||||
|
||||
/**
|
||||
* Maximum length of file operation
|
||||
*/
|
||||
#define MAX_FILE_LEN 256U
|
||||
|
||||
struct pva_kmd_file_ops {
|
||||
int (*open)(struct pva_kmd_device *dev);
|
||||
int (*release)(struct pva_kmd_device *dev);
|
||||
int64_t (*read)(struct pva_kmd_device *dev, void *file_data,
|
||||
uint8_t *data, uint64_t offset, uint64_t size);
|
||||
int64_t (*write)(struct pva_kmd_device *dev, void *file_data,
|
||||
const uint8_t *data, uint64_t offset, uint64_t size);
|
||||
void *pdev;
|
||||
void *file_data;
|
||||
};
|
||||
|
||||
struct pva_kmd_debugfs_context {
|
||||
bool stats_enable;
|
||||
bool vpu_debug;
|
||||
bool vpu_print_enable;
|
||||
char *allowlist_path;
|
||||
uint32_t profile_level;
|
||||
struct pva_kmd_file_ops vpu_fops;
|
||||
struct pva_kmd_file_ops allowlist_fops;
|
||||
struct pva_kmd_file_ops hwpm_fops;
|
||||
void *data_hwpm;
|
||||
struct pva_kmd_file_ops vpu_ocd_fops[NUM_VPU_BLOCKS];
|
||||
struct pva_kmd_fw_profiling_config g_fw_profiling_config;
|
||||
};
|
||||
|
||||
void pva_kmd_debugfs_create_nodes(struct pva_kmd_device *dev);
|
||||
void pva_kmd_debugfs_destroy_nodes(struct pva_kmd_device *dev);
|
||||
int64_t update_vpu_stats(struct pva_kmd_device *dev, void *file_data,
|
||||
uint8_t *out_buffer, uint64_t offset, uint64_t size);
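
/*
 * Wiring sketch (illustrative): how a read-only debugfs node is hooked up
 * through pva_kmd_file_ops, mirroring what pva_kmd_debugfs_create_nodes()
 * does for "vpu_stats". It assumes pva_kmd_debugfs_create_file() from the
 * shim header included above; the node name is just an example.
 */
static inline void
pva_kmd_debugfs_wire_stats_sketch(struct pva_kmd_device *pva,
				  struct pva_kmd_file_ops *fops)
{
	fops->read = &update_vpu_stats; /* handler declared above */
	fops->pdev = pva;		/* passed back as 'dev' on each read */
	pva_kmd_debugfs_create_file(pva, "vpu_stats", fops);
}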
|
||||
int64_t update_vpu_allowlist(struct pva_kmd_device *pva, void *file_data,
|
||||
const uint8_t *in_buffer, uint64_t offset,
|
||||
uint64_t size);
|
||||
#endif //PVA_KMD_DEBUGFS_H
|
||||
338
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_device.c
Normal file
338
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_device.c
Normal file
@@ -0,0 +1,338 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_api_types.h"
|
||||
#include "pva_kmd_fw_debug.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_api_cmdbuf.h"
|
||||
#include "pva_api.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_bit.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_context.h"
|
||||
#include "pva_kmd_t23x.h"
|
||||
#include "pva_kmd_t26x.h"
|
||||
#include "pva_kmd_regs.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_fw_profiler.h"
|
||||
#include "pva_kmd_vpu_app_auth.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_kmd_debugfs.h"
|
||||
#include "pva_kmd_tegra_stats.h"
|
||||
#include "pva_kmd_shim_silicon.h"
|
||||
|
||||
/**
|
||||
* @brief Send address and size of the resource table to FW through CCQ.
|
||||
*
|
||||
* Initialization through CCQ is only intended for KMD's own resource table (the
|
||||
* first resource table created).
|
||||
*/
|
||||
void pva_kmd_send_resource_table_info_by_ccq(
|
||||
struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table)
|
||||
{
|
||||
enum pva_error err;
|
||||
uint64_t addr = res_table->table_mem->iova;
|
||||
uint32_t n_entries = res_table->n_entries;
|
||||
uint64_t ccq_entry =
|
||||
PVA_INSERT64(PVA_FW_CCQ_OP_SET_RESOURCE_TABLE,
|
||||
PVA_FW_CCQ_OPCODE_MSB, PVA_FW_CCQ_OPCODE_LSB) |
|
||||
PVA_INSERT64(addr, PVA_FW_CCQ_RESOURCE_TABLE_ADDR_MSB,
|
||||
PVA_FW_CCQ_RESOURCE_TABLE_ADDR_LSB) |
|
||||
PVA_INSERT64(n_entries, PVA_FW_CCQ_RESOURCE_TABLE_N_ENTRIES_MSB,
|
||||
PVA_FW_CCQ_RESOURCE_TABLE_N_ENTRIES_LSB);
|
||||
|
||||
pva_kmd_mutex_lock(&pva->ccq0_lock);
|
||||
err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_mutex_unlock(&pva->ccq0_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Send address and size of the queue to FW through CCQ.
|
||||
*
|
||||
* Initialization through CCQ is only intended for KMD's own queue (the first
|
||||
* queue created).
|
||||
*/
|
||||
void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
|
||||
struct pva_kmd_queue *queue)
|
||||
{
|
||||
enum pva_error err;
|
||||
uint64_t addr = queue->queue_memory->iova;
|
||||
uint32_t max_submit = queue->max_num_submit;
|
||||
uint64_t ccq_entry =
|
||||
PVA_INSERT64(PVA_FW_CCQ_OP_SET_SUBMISSION_QUEUE,
|
||||
PVA_FW_CCQ_OPCODE_MSB, PVA_FW_CCQ_OPCODE_LSB) |
|
||||
PVA_INSERT64(addr, PVA_FW_CCQ_QUEUE_ADDR_MSB,
|
||||
PVA_FW_CCQ_QUEUE_ADDR_LSB) |
|
||||
PVA_INSERT64(max_submit, PVA_FW_CCQ_QUEUE_N_ENTRIES_MSB,
|
||||
PVA_FW_CCQ_QUEUE_N_ENTRIES_LSB);
|
||||
pva_kmd_mutex_lock(&pva->ccq0_lock);
|
||||
err = pva_kmd_ccq_push_with_timeout(pva, PVA_PRIV_CCQ_ID, ccq_entry,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_mutex_unlock(&pva->ccq0_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize submission related data structures for this device.
|
||||
*
|
||||
* - Create a resource table.
|
||||
* - Add DRAM resources to the resource table. These are used for command buffer
|
||||
* chunks and post fences.
|
||||
* - Create a queue.
|
||||
*/
|
||||
static void pva_kmd_device_init_submission(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint32_t queue_mem_size;
|
||||
uint64_t chunk_mem_size;
|
||||
uint64_t size;
|
||||
enum pva_error err;
|
||||
struct pva_fw_postfence post_fence = { 0 };
|
||||
|
||||
/* Init KMD's queue */
|
||||
queue_mem_size = pva_get_submission_queue_memory_size(
|
||||
PVA_KMD_MAX_NUM_KMD_SUBMITS);
|
||||
|
||||
pva->queue_memory = pva_kmd_device_memory_alloc_map(
|
||||
queue_mem_size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
ASSERT(pva->queue_memory != NULL);
|
||||
|
||||
pva_kmd_queue_init(&pva->dev_queue, pva, PVA_PRIV_CCQ_ID,
|
||||
0 /* KMD's queue ID is 0 */, &pva->ccq0_lock,
|
||||
pva->queue_memory, PVA_KMD_MAX_NUM_KMD_SUBMITS);
|
||||
|
||||
/* Init KMD's resource table */
|
||||
err = pva_kmd_resource_table_init(&pva->dev_resource_table, pva,
|
||||
PVA_R5_SMMU_CONTEXT_ID,
|
||||
PVA_KMD_MAX_NUM_KMD_RESOURCES,
|
||||
PVA_KMD_MAX_NUM_KMD_DMA_CONFIGS);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
/* Allocate memory for submission*/
|
||||
chunk_mem_size = pva_kmd_cmdbuf_pool_get_required_mem_size(
|
||||
PVA_MAX_CMDBUF_CHUNK_SIZE, PVA_KMD_MAX_NUM_KMD_CHUNKS);
|
||||
|
||||
size = safe_addu64(chunk_mem_size, (uint64_t)sizeof(uint32_t));
|
||||
/* Allocate one post fence at the end. We don't need to free this memory
|
||||
* explicitly as it will be freed after we drop the resource. */
|
||||
pva->submit_memory = pva_kmd_device_memory_alloc_map(
|
||||
size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
ASSERT(pva->submit_memory != NULL);
|
||||
|
||||
/* Add submit memory to resource table */
|
||||
err = pva_kmd_add_dram_buffer_resource(&pva->dev_resource_table,
|
||||
pva->submit_memory,
|
||||
&pva->submit_memory_resource_id);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_update_fw_resource_table(&pva->dev_resource_table);
|
||||
|
||||
/* Init chunk pool */
|
||||
pva_kmd_cmdbuf_chunk_pool_init(
|
||||
&pva->chunk_pool, pva->submit_memory_resource_id, 0,
|
||||
chunk_mem_size, PVA_MAX_CMDBUF_CHUNK_SIZE,
|
||||
PVA_KMD_MAX_NUM_KMD_CHUNKS, pva->submit_memory->va);
|
||||
|
||||
/* Init fence */
|
||||
pva->fence_offset = chunk_mem_size;
|
||||
|
||||
/* Init submitter */
|
||||
pva_kmd_mutex_init(&pva->submit_lock);
|
||||
pva_kmd_mutex_init(&pva->chunk_pool_lock);
|
||||
post_fence.resource_id = pva->submit_memory_resource_id;
|
||||
post_fence.offset_lo = iova_lo(pva->fence_offset);
|
||||
post_fence.offset_hi = iova_hi(pva->fence_offset);
|
||||
post_fence.ts_resource_id = PVA_RESOURCE_ID_INVALID;
|
||||
pva_kmd_submitter_init(
|
||||
&pva->submitter, &pva->dev_queue, &pva->submit_lock,
|
||||
&pva->chunk_pool, &pva->chunk_pool_lock,
|
||||
pva_offset_pointer(pva->submit_memory->va, pva->fence_offset),
|
||||
&post_fence);
|
||||
}
|
||||
|
||||
static void pva_kmd_device_deinit_submission(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_mutex_deinit(&pva->chunk_pool_lock);
|
||||
pva_kmd_mutex_deinit(&pva->submit_lock);
|
||||
pva_kmd_cmdbuf_chunk_pool_deinit(&pva->chunk_pool);
|
||||
/* Submit memory will be freed after dropping the resource */
|
||||
pva_kmd_drop_resource(&pva->dev_resource_table,
|
||||
pva->submit_memory_resource_id);
|
||||
pva_kmd_resource_table_deinit(&pva->dev_resource_table);
|
||||
pva_kmd_queue_deinit(&pva->dev_queue);
|
||||
pva_kmd_device_memory_free(pva->queue_memory);
|
||||
}
|
||||
|
||||
struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
|
||||
uint32_t device_index,
|
||||
bool app_authenticate)
|
||||
{
|
||||
struct pva_kmd_device *pva;
|
||||
enum pva_error err;
|
||||
uint32_t chunk_size;
|
||||
uint32_t size;
|
||||
|
||||
pva = pva_kmd_zalloc_nofail(sizeof(*pva));
|
||||
|
||||
pva->device_index = device_index;
|
||||
pva->load_from_gsc = false;
|
||||
pva->is_hv_mode = true;
|
||||
pva->max_n_contexts = PVA_MAX_NUM_USER_CONTEXTS;
|
||||
pva_kmd_mutex_init(&pva->powercycle_lock);
|
||||
pva_kmd_mutex_init(&pva->ccq0_lock);
|
||||
pva_kmd_mutex_init(&pva->resource_table_lock);
|
||||
pva_kmd_sema_init(&pva->fw_boot_sema, 0);
|
||||
size = safe_mulu32((uint32_t)sizeof(struct pva_kmd_context),
|
||||
pva->max_n_contexts);
|
||||
pva->context_mem = pva_kmd_zalloc(size);
|
||||
ASSERT(pva->context_mem != NULL);
|
||||
|
||||
err = pva_kmd_block_allocator_init(&pva->context_allocator,
|
||||
pva->context_mem,
|
||||
PVA_KMD_USER_CONTEXT_ID_BASE,
|
||||
sizeof(struct pva_kmd_context),
|
||||
pva->max_n_contexts);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
if (chip_id == PVA_CHIP_T23X) {
|
||||
pva_kmd_device_init_t23x(pva);
|
||||
} else if (chip_id == PVA_CHIP_T26X) {
|
||||
pva_kmd_device_init_t26x(pva);
|
||||
} else {
|
||||
FAULT("SOC not supported");
|
||||
}
|
||||
|
||||
pva_kmd_device_plat_init(pva);
|
||||
|
||||
chunk_size = safe_mulu32((uint32_t)sizeof(struct pva_syncpt_rw_info),
|
||||
(uint32_t)PVA_NUM_RW_SYNCPTS_PER_CONTEXT);
|
||||
err = pva_kmd_block_allocator_init(&pva->syncpt_allocator,
|
||||
pva->syncpt_rw, 0, chunk_size,
|
||||
PVA_MAX_NUM_USER_CONTEXTS);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
pva_kmd_device_init_submission(pva);
|
||||
|
||||
err = pva_kmd_init_vpu_app_auth(pva, app_authenticate);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
pva->is_suspended = false;
|
||||
|
||||
return pva;
|
||||
}
|
||||
|
||||
static void pva_kmd_wait_for_active_contexts(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint8_t allocated = 0;
|
||||
|
||||
/* Make sure no context is active by allocating all contexts here. */
|
||||
while (allocated < pva->max_n_contexts) {
|
||||
uint32_t unused_id;
|
||||
struct pva_kmd_context *ctx;
|
||||
|
||||
ctx = pva_kmd_alloc_block(&pva->context_allocator, &unused_id);
|
||||
if (ctx != NULL) {
|
||||
allocated = safe_addu32(allocated, 1U);
|
||||
} else {
|
||||
pva_kmd_sleep_us(1000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_device_destroy(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_wait_for_active_contexts(pva);
|
||||
pva_kmd_device_deinit_submission(pva);
|
||||
pva_kmd_device_plat_deinit(pva);
|
||||
pva_kmd_block_allocator_deinit(&pva->syncpt_allocator);
|
||||
pva_kmd_block_allocator_deinit(&pva->context_allocator);
|
||||
pva_kmd_free(pva->context_mem);
|
||||
pva_kmd_mutex_deinit(&pva->ccq0_lock);
|
||||
pva_kmd_mutex_deinit(&pva->resource_table_lock);
|
||||
pva_kmd_mutex_deinit(&pva->powercycle_lock);
|
||||
pva_kmd_free(pva->pva_auth);
|
||||
pva_kmd_free(pva);
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
pva_kmd_mutex_lock(&pva->powercycle_lock);
|
||||
if (pva->refcount == 0) {
|
||||
pva_kmd_allocate_syncpts(pva);
|
||||
|
||||
err = pva_kmd_power_on(pva);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
err = pva_kmd_init_fw(pva);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto unlock;
|
||||
}
|
||||
/* Reset KMD queue */
|
||||
pva->dev_queue.queue_header->cb_head = 0;
|
||||
pva->dev_queue.queue_header->cb_tail = 0;
|
||||
|
||||
pva_kmd_send_resource_table_info_by_ccq(
|
||||
pva, &pva->dev_resource_table);
|
||||
pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
|
||||
pva_kmd_notify_fw_enable_profiling(pva);
|
||||
}
|
||||
pva->refcount = safe_addu32(pva->refcount, 1U);
|
||||
|
||||
unlock:
|
||||
pva_kmd_mutex_unlock(&pva->powercycle_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_device_idle(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_mutex_lock(&pva->powercycle_lock);
|
||||
ASSERT(pva->refcount > 0);
|
||||
pva->refcount--;
|
||||
if (pva->refcount == 0) {
|
||||
/* Disable FW profiling */
|
||||
/* TODO: once debugfs is up, move these calls */
|
||||
// pva_kmd_notify_fw_disable_profiling(pva);
|
||||
// pva_kmd_drain_fw_profiling_buffer(pva,
|
||||
// &pva->fw_profiling_buffer);
|
||||
pva_kmd_deinit_fw(pva);
|
||||
pva_kmd_power_off(pva);
|
||||
}
|
||||
pva_kmd_mutex_unlock(&pva->powercycle_lock);
|
||||
}
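
/*
 * Usage sketch (illustrative, not part of this commit): busy()/idle() form a
 * refcounted pair; the first busy call powers the cluster on and boots FW,
 * the last idle call shuts it down. This mirrors how pva_kmd_context_init()
 * and pva_kmd_context_deinit() bracket their FW traffic.
 */
static inline enum pva_error
pva_kmd_device_power_sketch(struct pva_kmd_device *pva)
{
	enum pva_error err;

	err = pva_kmd_device_busy(pva);
	if (err != PVA_SUCCESS) {
		return err;
	}
	/* ... submit commands / talk to FW while the reference is held ... */
	pva_kmd_device_idle(pva);
	return PVA_SUCCESS;
}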
|
||||
|
||||
enum pva_error pva_kmd_ccq_push_with_timeout(struct pva_kmd_device *pva,
|
||||
uint8_t ccq_id, uint64_t ccq_entry,
|
||||
uint64_t sleep_interval_us,
|
||||
uint64_t timeout_us)
|
||||
{
|
||||
/* spin until we have space or timeout reached */
|
||||
while (pva_kmd_get_ccq_space(pva, ccq_id) == 0) {
|
||||
if (timeout_us == 0) {
|
||||
pva_kmd_log_err(
|
||||
"pva_kmd_ccq_push_with_timeout Timed out");
|
||||
return PVA_TIMEDOUT;
|
||||
}
|
||||
pva_kmd_sleep_us(sleep_interval_us);
|
||||
timeout_us = sat_sub64(timeout_us, sleep_interval_us);
|
||||
}
|
||||
/* TODO: memory write barrier is needed here */
|
||||
pva_kmd_ccq_push(pva, ccq_id, ccq_entry);
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
158
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_device.h
Normal file
158
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_device.h
Normal file
@@ -0,0 +1,158 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_DEVICE_H
|
||||
#define PVA_KMD_DEVICE_H
|
||||
#include "pva_constants.h"
|
||||
#include "pva_kmd_cmdbuf.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_kmd_mutex.h"
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_submitter.h"
|
||||
#include "pva_kmd_regs.h"
|
||||
#include "pva_kmd_thread_sema.h"
|
||||
#include "pva_kmd_fw_debug.h"
|
||||
#include "pva_kmd_shim_init.h"
|
||||
#include "pva_kmd_shim_ccq.h"
|
||||
#include "pva_kmd_fw_profiler.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_kmd_debugfs.h"
|
||||
|
||||
struct pva_syncpt_rw_info {
|
||||
	/** Don't change the field order: syncpt_id and syncpt_iova are prefilled
	 * during KMD boot, and the first field gets overwritten by the block
	 * allocator every time the block is freed */
|
||||
uint32_t syncpt_value;
|
||||
uint32_t syncpt_id;
|
||||
uint64_t syncpt_iova;
|
||||
};
|
||||
|
||||
/** A struct to maintain start and end address of vmem region */
|
||||
struct vmem_region {
|
||||
/**! Start address of vmem region */
|
||||
uint32_t start;
|
||||
/**! End address of vmem region */
|
||||
uint32_t end;
|
||||
};
|
||||
|
||||
struct pva_kmd_hw_constants {
|
||||
enum pva_hw_gen hw_gen;
|
||||
uint8_t n_vmem_regions;
|
||||
uint32_t n_dma_descriptors;
|
||||
uint32_t n_user_dma_channels;
|
||||
uint32_t n_hwseq_words;
|
||||
uint32_t n_dynamic_adb_buffs;
|
||||
uint32_t n_smmu_contexts;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief This struct manages a single PVA cluster.
|
||||
*
|
||||
* Fields in this struct should be common across all platforms. Platform
|
||||
* specific data is stored in plat_data field.
|
||||
*/
|
||||
struct pva_kmd_device {
|
||||
uint32_t device_index;
|
||||
uint32_t r5_image_smmu_context_id;
|
||||
uint32_t stream_ids[PVA_MAX_NUM_SMMU_CONTEXTS];
|
||||
|
||||
struct pva_kmd_hw_constants hw_consts;
|
||||
|
||||
uint64_t reg_phy_base[PVA_KMD_APERTURE_COUNT];
|
||||
uint64_t reg_size[PVA_KMD_APERTURE_COUNT];
|
||||
|
||||
struct pva_kmd_regspec regspec;
|
||||
|
||||
uint8_t max_n_contexts;
|
||||
void *context_mem;
|
||||
struct pva_kmd_block_allocator context_allocator;
|
||||
|
||||
pva_kmd_mutex_t resource_table_lock;
|
||||
struct pva_kmd_resource_table dev_resource_table;
|
||||
|
||||
struct pva_kmd_submitter submitter;
|
||||
/** The lock protects the submission to the queue, including
|
||||
* incrementing the post fence */
|
||||
pva_kmd_mutex_t submit_lock;
|
||||
struct pva_kmd_device_memory *queue_memory;
|
||||
struct pva_kmd_queue dev_queue;
|
||||
pva_kmd_mutex_t ccq0_lock;
|
||||
|
||||
/** memory needed for submission: including command buffer chunks and fences */
|
||||
struct pva_kmd_device_memory *submit_memory;
|
||||
uint32_t submit_memory_resource_id;
|
||||
uint64_t fence_offset; /**< fence offset within submit_memory */
|
||||
|
||||
pva_kmd_mutex_t chunk_pool_lock;
|
||||
struct pva_kmd_cmdbuf_chunk_pool chunk_pool;
|
||||
|
||||
pva_kmd_mutex_t powercycle_lock;
|
||||
uint32_t refcount;
|
||||
|
||||
/** ISR posts this semaphore when FW completes boot */
|
||||
pva_kmd_sema_t fw_boot_sema;
|
||||
|
||||
struct pva_kmd_device_memory *fw_debug_mem;
|
||||
struct pva_kmd_device_memory *fw_bin_mem;
|
||||
struct pva_kmd_device_memory *fw_profiling_buffer_memory;
|
||||
uint32_t fw_profiling_buffer_resource_id;
|
||||
struct pva_kmd_fw_profiling_buffer fw_profiling_buffer;
|
||||
struct pva_kmd_fw_print_buffer fw_print_buffer;
|
||||
|
||||
struct pva_kmd_device_memory *tegra_stats_memory;
|
||||
uint32_t tegra_stats_resource_id;
|
||||
uint32_t tegra_stats_buf_size;
|
||||
|
||||
bool load_from_gsc;
|
||||
bool is_hv_mode;
|
||||
struct pva_kmd_debugfs_context debugfs_context;
|
||||
/** Sector packing format for block linear surfaces */
|
||||
uint8_t bl_sector_pack_format;
|
||||
|
||||
/** Offset between 2 syncpoints */
|
||||
uint32_t syncpt_offset;
|
||||
uint64_t syncpt_ro_iova;
|
||||
uint64_t syncpt_rw_iova;
|
||||
uint32_t num_syncpts;
|
||||
struct pva_syncpt_rw_info syncpt_rw[PVA_NUM_RW_SYNCPTS];
|
||||
struct pva_kmd_block_allocator syncpt_allocator;
|
||||
|
||||
struct vmem_region *vmem_regions_tab;
|
||||
bool support_hwseq_frame_linking;
|
||||
|
||||
void *plat_data;
|
||||
void *fw_handle;
|
||||
|
||||
struct pva_vpu_auth *pva_auth;
|
||||
bool is_suspended;
|
||||
};
|
||||
|
||||
struct pva_kmd_device *pva_kmd_device_create(enum pva_chip_id chip_id,
|
||||
uint32_t device_index,
|
||||
bool app_authenticate);
|
||||
|
||||
void pva_kmd_device_destroy(struct pva_kmd_device *pva);
|
||||
|
||||
enum pva_error pva_kmd_device_busy(struct pva_kmd_device *pva);
|
||||
void pva_kmd_device_idle(struct pva_kmd_device *pva);
|
||||
|
||||
enum pva_error pva_kmd_ccq_push_with_timeout(struct pva_kmd_device *pva,
|
||||
uint8_t ccq_id, uint64_t ccq_entry,
|
||||
uint64_t sleep_interval_us,
|
||||
uint64_t timeout_us);
|
||||
|
||||
void pva_kmd_send_resource_table_info_by_ccq(
|
||||
struct pva_kmd_device *pva, struct pva_kmd_resource_table *res_table);
|
||||
|
||||
void pva_kmd_send_queue_info_by_ccq(struct pva_kmd_device *pva,
|
||||
struct pva_kmd_queue *queue);
|
||||
#endif // PVA_KMD_DEVICE_H
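A minimal usage sketch of the lifecycle API declared above, assuming an illustrative chip id value PVA_CHIP_ID_T23X (the actual members of enum pva_chip_id are not visible in this header) and device index 0:

/* Sketch only, not part of the driver. */
static enum pva_error example_device_power_cycle(void)
{
	struct pva_kmd_device *pva;
	enum pva_error err;

	/* PVA_CHIP_ID_T23X is assumed here for illustration. */
	pva = pva_kmd_device_create(PVA_CHIP_ID_T23X, 0U, true);
	if (pva == NULL)
		return PVA_INTERNAL;

	err = pva_kmd_device_busy(pva);    /* take a power/boot reference */
	if (err == PVA_SUCCESS)
		pva_kmd_device_idle(pva);  /* drop the reference again */

	pva_kmd_device_destroy(pva);
	return err;
}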
|
||||
148
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_dma_cfg.c
Normal file
@@ -0,0 +1,148 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_dma_cfg.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device.h"
|
||||
|
||||
#define PVA_KMD_INVALID_CH_IDX 0xFF
|
||||
|
||||
void pva_kmd_unload_dma_config(struct pva_kmd_dma_resource_aux *dma_aux)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < dma_aux->dram_res_count; i++) {
|
||||
pva_kmd_drop_resource(dma_aux->res_table,
|
||||
dma_aux->static_dram_res_ids[i]);
|
||||
}
|
||||
|
||||
if (dma_aux->vpu_bin_res_id != PVA_RESOURCE_ID_INVALID) {
|
||||
pva_kmd_drop_resource(dma_aux->res_table,
|
||||
dma_aux->vpu_bin_res_id);
|
||||
}
|
||||
}
|
||||
|
||||
static void trace_dma_channels(struct pva_dma_config const *dma_config,
|
||||
uint8_t *desc_to_ch)
|
||||
{
|
||||
uint32_t ch_index;
|
||||
struct pva_dma_config_header const *cfg_hdr = &dma_config->header;
|
||||
struct pva_dma_channel *channel;
|
||||
uint32_t num_descs = dma_config->header.num_descriptors;
|
||||
|
||||
for (ch_index = 0; ch_index < cfg_hdr->num_channels; ch_index++) {
|
||||
uint8_t desc_index;
|
||||
|
||||
channel = &dma_config->channels[ch_index];
|
||||
desc_index = channel->desc_index;
|
||||
for (uint32_t i = 0; i < PVA_MAX_NUM_DMA_DESC; i++) {
|
||||
desc_index = array_index_nospec(desc_index, num_descs);
|
||||
if (desc_to_ch[desc_index] != PVA_KMD_INVALID_CH_IDX) {
|
||||
//Already traced this descriptor
|
||||
break;
|
||||
}
|
||||
desc_to_ch[desc_index] = ch_index;
|
||||
desc_index = sat_sub8(
|
||||
dma_config->descriptors[desc_index].link_desc_id,
|
||||
1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
|
||||
void *dma_config_payload, uint32_t dma_config_size,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux,
|
||||
void *fw_dma_cfg, uint32_t *out_fw_fetch_size)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
uint32_t fw_fetch_size;
|
||||
struct pva_dma_config dma_config;
|
||||
struct pva_fw_dma_slot *dyn_slots;
|
||||
struct pva_fw_dma_reloc *dyn_relocs;
|
||||
struct pva_fw_dma_slot *static_slots = dma_aux->static_slots;
|
||||
struct pva_fw_dma_reloc *static_relocs = dma_aux->static_relocs;
|
||||
struct pva_kmd_dma_access *access_sizes = dma_aux->access_sizes;
|
||||
// Mapping descriptor index to channel index
|
||||
uint8_t desc_to_ch[PVA_MAX_NUM_DMA_DESC];
|
||||
|
||||
for (uint32_t i = 0; i < PVA_MAX_NUM_DMA_DESC; i++) {
|
||||
desc_to_ch[i] = PVA_KMD_INVALID_CH_IDX;
|
||||
}
|
||||
|
||||
//set access_sizes to 0 by default
|
||||
(void)memset(
|
||||
access_sizes, 0,
|
||||
(PVA_MAX_NUM_DMA_DESC * sizeof(struct pva_kmd_dma_access)));
|
||||
|
||||
err = pva_kmd_parse_dma_config(dma_config_payload, dma_config_size,
|
||||
&dma_config,
|
||||
&resource_table->pva->hw_consts);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = pva_kmd_validate_dma_config(&dma_config,
|
||||
&resource_table->pva->hw_consts,
|
||||
access_sizes,
|
||||
dma_aux->hw_dma_descs_mask);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
trace_dma_channels(&dma_config, desc_to_ch);
|
||||
|
||||
err = pva_kmd_compute_dma_access(&dma_config, access_sizes,
|
||||
dma_aux->hw_dma_descs_mask);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
dyn_slots = pva_offset_pointer(fw_dma_cfg,
|
||||
sizeof(struct pva_dma_config_resource));
|
||||
|
||||
dyn_relocs = pva_offset_pointer(dyn_slots,
|
||||
dma_config.header.num_dynamic_slots *
|
||||
sizeof(*dyn_slots));
|
||||
|
||||
pva_kmd_collect_relocs(&dma_config, access_sizes, static_slots,
|
||||
dma_config.header.num_static_slots,
|
||||
static_relocs, dyn_slots,
|
||||
dma_config.header.num_dynamic_slots, dyn_relocs,
|
||||
desc_to_ch);
|
||||
|
||||
pva_kmd_write_fw_dma_config(
|
||||
&dma_config, fw_dma_cfg, &fw_fetch_size,
|
||||
resource_table->pva->support_hwseq_frame_linking);
|
||||
|
||||
dma_aux->res_table = resource_table;
|
||||
err = pva_kmd_dma_use_resources(&dma_config, dma_aux);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = pva_kmd_bind_static_buffers(fw_dma_cfg, dma_aux, static_slots,
|
||||
dma_config.header.num_static_slots,
|
||||
static_relocs,
|
||||
dma_config.static_bindings,
|
||||
dma_config.header.num_static_slots);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto drop_res;
|
||||
}
|
||||
|
||||
*out_fw_fetch_size = fw_fetch_size;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
drop_res:
|
||||
pva_kmd_unload_dma_config(dma_aux);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
139
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_dma_cfg.h
Normal file
@@ -0,0 +1,139 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_DMA_CFG_H
|
||||
#define PVA_KMD_DMA_CFG_H
|
||||
|
||||
#include "pva_kmd.h"
|
||||
#include "pva_resource.h"
|
||||
|
||||
/* Mask to extract the GOB offset from the Surface address */
|
||||
#define PVA_DMA_BL_GOB_OFFSET_MASK 0x3E00U
|
||||
|
||||
/* Right shift value for moving GOB offset value extracted from surface address to LSB */
|
||||
#define PVA_DMA_BL_GOB_OFFSET_MASK_RSH 6U
|
||||
|
||||
struct pva_kmd_dma_access_entry {
|
||||
int64_t start_addr;
|
||||
int64_t end_addr;
|
||||
};
|
||||
struct pva_kmd_dma_access {
|
||||
struct pva_kmd_dma_access_entry src;
|
||||
struct pva_kmd_dma_access_entry dst;
|
||||
struct pva_kmd_dma_access_entry dst2;
|
||||
};
|
||||
|
||||
struct pva_kmd_resource_table;
|
||||
struct pva_kmd_hw_constants;
|
||||
|
||||
/** Auxiliary information needed for managing DMA resources:
|
||||
*
|
||||
* - Hold references to DRAM buffers and VPU bin used by the DMA configuration.
|
||||
* - Scratch buffers needed during DMA configuration loading.
|
||||
*/
|
||||
struct pva_kmd_dma_resource_aux {
|
||||
struct pva_kmd_resource_table *res_table;
|
||||
uint32_t vpu_bin_res_id;
|
||||
|
||||
uint32_t dram_res_count;
|
||||
/** DRAM buffers statically referenced by the DMA configuration */
|
||||
uint32_t static_dram_res_ids[PVA_KMD_MAX_NUM_DMA_DRAM_SLOTS];
|
||||
|
||||
/* Below are work buffers needed during DMA configuration loading. They
|
||||
* don't fit on stack. */
|
||||
struct pva_fw_dma_slot static_slots[PVA_KMD_MAX_NUM_DMA_SLOTS];
|
||||
struct pva_fw_dma_reloc static_relocs[PVA_KMD_MAX_NUM_DMA_SLOTS];
|
||||
struct pva_kmd_dma_access access_sizes[PVA_MAX_NUM_DMA_DESC];
|
||||
uint64_t hw_dma_descs_mask[((PVA_MAX_NUM_DMA_DESC / 64ULL) + 1ULL)];
|
||||
};
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_parse_dma_config(void *dma_config, uint32_t dma_config_size,
|
||||
struct pva_dma_config *out_cfg,
|
||||
struct pva_kmd_hw_constants const *hw_consts);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_validate_dma_config(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_hw_constants const *hw_consts,
|
||||
struct pva_kmd_dma_access *access_sizes,
|
||||
uint64_t *hw_dma_descs_mask);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_compute_dma_access(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_access *access_sizes,
|
||||
uint64_t *hw_dma_descs_mask);
|
||||
|
||||
void pva_kmd_collect_relocs(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_access const *access_sizes,
|
||||
struct pva_fw_dma_slot *out_static_slots,
|
||||
uint16_t num_static_slots,
|
||||
struct pva_fw_dma_reloc *out_static_relocs,
|
||||
struct pva_fw_dma_slot *out_dyn_slots,
|
||||
uint16_t num_dyn_slots,
|
||||
struct pva_fw_dma_reloc *out_dyn_relocs,
|
||||
uint8_t const *desc_to_ch);
|
||||
|
||||
/**
|
||||
* @brief Bind static buffers to the DMA configuration.
|
||||
*
|
||||
* When binding static buffers, we edit pva_dma_config in-place and replace the
|
||||
* offset field with the final addresses of static buffers.
|
||||
*
|
||||
* We also validate that the DMA configuration does not access those static
|
||||
* buffers out of range.
|
||||
*/
|
||||
enum pva_error pva_kmd_bind_static_buffers(
|
||||
struct pva_dma_config_resource *fw_dma_cfg,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux,
|
||||
struct pva_fw_dma_slot const *static_slots, uint16_t num_static_slots,
|
||||
struct pva_fw_dma_reloc const *static_relocs,
|
||||
struct pva_dma_static_binding const *static_bindings,
|
||||
uint32_t num_static_bindings);
|
||||
|
||||
/**
|
||||
* @brief Convert user DMA configuration to firmware format.
|
||||
*/
|
||||
void pva_kmd_write_fw_dma_config(struct pva_dma_config const *dma_cfg,
|
||||
void *fw_dma_config,
|
||||
uint32_t *out_fw_fetch_size,
|
||||
bool support_hwseq_frame_linking);
|
||||
|
||||
/**
|
||||
* @brief Load DMA configuration into firmware format.
|
||||
*
|
||||
* This function mostly does the following things:
|
||||
*
|
||||
* - Validate the DMA configuration.
|
||||
* - Bind static resources (buffers) and embed their addresses directly in the
|
||||
* firmware DMA configuration.
|
||||
* - Hold references to DRAM buffers and VPU bin used by the DMA configuration.
|
||||
* - Convert the DMA configuration into firmware format.
|
||||
*
|
||||
* @param resource_table the resource table for the context.
|
||||
* @param dma_config DMA configuration from user space.
|
||||
* @param dma_config_size Size of the dma_config buffer.
|
||||
* @param dma_aux Auxiliary information needed for loading the DMA
|
||||
* configuration.
|
||||
* @param fw_dma_cfg Output buffer for the firmware DMA configuration.
|
||||
* @param out_fw_fetch_size Size of the firmware DMA configuration that needs to
|
||||
* be fetched into TCM.
|
||||
*/
|
||||
enum pva_error
|
||||
pva_kmd_load_dma_config(struct pva_kmd_resource_table *resource_table,
|
||||
void *dma_config, uint32_t dma_config_size,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux,
|
||||
void *fw_dma_cfg, uint32_t *out_fw_fetch_size);
|
||||
|
||||
void pva_kmd_unload_dma_config(struct pva_kmd_dma_resource_aux *dma_aux);
|
||||
#endif // PVA_KMD_DMA_CFG_H
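A rough sketch of the call sequence implied by the prototypes above; the payload and output buffers are placeholders supplied by the caller, and this is not a verbatim excerpt from the driver:

/* Illustrative only: user_payload comes from the submitting context and
 * fw_cfg_buf is KMD-owned memory large enough for the converted firmware
 * DMA configuration. */
static enum pva_error example_load_and_unload(struct pva_kmd_resource_table *tbl,
					      void *user_payload, uint32_t user_size,
					      struct pva_kmd_dma_resource_aux *aux,
					      void *fw_cfg_buf)
{
	uint32_t fetch_size = 0U;
	enum pva_error err;

	err = pva_kmd_load_dma_config(tbl, user_payload, user_size, aux,
				      fw_cfg_buf, &fetch_size);
	if (err != PVA_SUCCESS)
		return err;

	/* ... submit work that references fw_cfg_buf; the first fetch_size
	 * bytes are what the firmware fetches into TCM ... */

	pva_kmd_unload_dma_config(aux); /* drop DRAM/VPU-bin references */
	return PVA_SUCCESS;
}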
|
||||
@@ -0,0 +1,369 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_api.h"
|
||||
#include "pva_kmd_dma_cfg.h"
|
||||
#include "pva_api_dma.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_kmd_device.h"
|
||||
|
||||
static uint32_t get_slot_line_pitch(struct pva_fw_dma_descriptor *descs,
|
||||
struct pva_fw_dma_reloc const *relocs,
|
||||
struct pva_fw_dma_slot const *slot)
|
||||
{
|
||||
struct pva_fw_dma_reloc const *reloc = &relocs[slot->reloc_start_idx];
|
||||
uint32_t first_desc_index = reloc->desc_index;
|
||||
struct pva_fw_dma_descriptor *first_desc = &descs[first_desc_index];
|
||||
uint8_t log2_bpp =
|
||||
PVA_EXTRACT(first_desc->transfer_control1, 1, 0, uint8_t);
|
||||
|
||||
if (reloc->field == PVA_FW_DMA_RELOC_FIELD_SRC) {
|
||||
return first_desc->slp_adv << log2_bpp;
|
||||
} else {
|
||||
return first_desc->dlp_adv << log2_bpp;
|
||||
}
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
set_channel_block_height(struct pva_dma_config_resource *dma_config,
|
||||
uint16_t ch_mask, uint8_t log2_block_height)
|
||||
{
|
||||
struct pva_fw_dma_channel *channels =
|
||||
pva_dma_config_get_channels(dma_config);
|
||||
|
||||
// max block height is 32 GOB
|
||||
if (log2_block_height > PVA_DMA_MAX_LOG2_BLOCK_HEIGHT) {
|
||||
pva_kmd_log_err("Invalid block height");
|
||||
return PVA_ERR_CMD_INVALID_BLOCK_HEIGHT;
|
||||
}
|
||||
|
||||
while (ch_mask > 0) {
|
||||
uint8_t ch_index = __builtin_ctz(ch_mask);
|
||||
if (dma_config->ch_block_height_fixed_mask & (1 << ch_index)) {
|
||||
/* If this bit is already set, it means block height cannot be changed. */
|
||||
uint8_t set_bh = PVA_EXTRACT(channels[ch_index].cntl0,
|
||||
27, 25, uint8_t);
|
||||
if (set_bh != log2_block_height) {
|
||||
pva_kmd_log_err("Conflicting block height");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
} else {
|
||||
channels[ch_index].cntl0 &= ~PVA_MASK(27, 25);
|
||||
channels[ch_index].cntl0 |=
|
||||
PVA_INSERT(log2_block_height, 27, 25);
|
||||
|
||||
dma_config->ch_block_height_fixed_mask |=
|
||||
(1 << ch_index);
|
||||
}
|
||||
|
||||
ch_mask &= ~(1 << ch_index);
|
||||
}
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
bind_static_dram_slot(struct pva_dma_config_resource *dma_config,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux,
|
||||
struct pva_fw_dma_slot const *slot,
|
||||
struct pva_fw_dma_reloc const *static_relocs,
|
||||
struct pva_dma_dram_binding const *dram_bd)
|
||||
{
|
||||
struct pva_fw_dma_descriptor *descs =
|
||||
pva_dma_config_get_descriptors(dma_config);
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_fw_dma_reloc const *relocs;
|
||||
bool is_block_linear =
|
||||
(dram_bd->surface_format == PVA_SURF_FMT_BLOCK_LINEAR);
|
||||
uint32_t line_pitch = get_slot_line_pitch(descs, static_relocs, slot);
|
||||
uint8_t log2_block_height = dram_bd->log2_block_height;
|
||||
struct pva_kmd_dram_resource *dram_res =
|
||||
&pva_kmd_peek_resource(dma_aux->res_table, dram_bd->resource_id)
|
||||
->dram;
|
||||
uint64_t slot_offset_pl = dram_bd->slot_offset;
|
||||
uint64_t surface_base_addr =
|
||||
sat_add64(dram_bd->surface_base_offset, dram_res->mem->iova);
|
||||
/* When binding a buffer, we add the binding->surface_base_offset to the
|
||||
* buffer base address. Therefore, the effective buffer size is
|
||||
* reduced by the offset. */
|
||||
uint64_t max_surface_size =
|
||||
sat_sub64(dram_res->mem->size, dram_bd->surface_base_offset);
|
||||
uint64_t sector_pack_format = 0;
|
||||
int64_t slot_access_start_addr = 0LL;
|
||||
int64_t slot_access_end_addr = 0LL;
|
||||
uint64_t slot_surface_combined_offset = 0ULL;
|
||||
pva_math_error math_error = MATH_OP_SUCCESS;
|
||||
|
||||
if ((slot->flags & PVA_FW_DMA_SLOT_FLAG_DRAM) == 0) {
|
||||
pva_kmd_log_err("Binding DRAM buffer to incompatible slot");
|
||||
err = PVA_INVALID_BINDING;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (is_block_linear) {
|
||||
if (slot->flags & PVA_FW_DMA_SLOT_FLAG_CB) {
|
||||
pva_kmd_log_err(
|
||||
"Block linear surface is not compatible with circular buffer");
|
||||
err = PVA_INVALID_BINDING;
|
||||
goto out;
|
||||
}
|
||||
max_surface_size =
|
||||
pva_max_bl_surface_size(max_surface_size,
|
||||
log2_block_height, line_pitch,
|
||||
&math_error);
|
||||
if (math_error != MATH_OP_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"bind_static_dram_slot pva_max_bl_surface_size triggered a math error");
|
||||
err = PVA_ERR_MATH_OP;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!pva_is_512B_aligned(surface_base_addr)) {
|
||||
pva_kmd_log_err(
|
||||
"BL surface base address is not 512B aligned");
|
||||
err = PVA_BAD_SURFACE_BASE_ALIGNMENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = set_channel_block_height(dma_config, slot->ch_use_mask,
|
||||
dram_bd->log2_block_height);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto out;
|
||||
}
|
||||
sector_pack_format =
|
||||
dma_aux->res_table->pva->bl_sector_pack_format;
|
||||
}
|
||||
|
||||
slot_surface_combined_offset = addu64(
|
||||
slot_offset_pl, dram_bd->surface_base_offset, &math_error);
|
||||
|
||||
if (slot_surface_combined_offset >= (uint64_t)MAX_INT64) {
|
||||
pva_kmd_log_err("Slot surface offset too large");
|
||||
return PVA_ERR_CMD_DRAM_BUF_OUT_OF_RANGE;
|
||||
}
|
||||
|
||||
slot_access_start_addr =
|
||||
adds64(slot->start_addr, (int64_t)slot_surface_combined_offset,
|
||||
&math_error);
|
||||
|
||||
slot_access_end_addr =
|
||||
adds64(slot->end_addr, (int64_t)slot_surface_combined_offset,
|
||||
&math_error);
|
||||
|
||||
max_surface_size = addu64(max_surface_size,
|
||||
dram_bd->surface_base_offset, &math_error);
|
||||
|
||||
if (max_surface_size >= (uint64_t)MAX_INT64) {
|
||||
pva_kmd_log_err("DRAM buffer too large for slot binding");
|
||||
return PVA_ERR_CMD_DRAM_BUF_OUT_OF_RANGE;
|
||||
}
|
||||
|
||||
if (math_error != MATH_OP_SUCCESS) {
|
||||
pva_kmd_log_err("Math error during slot binding");
|
||||
return PVA_ERR_MATH_OP;
|
||||
}
|
||||
|
||||
if (slot_access_start_addr < 0LL) {
|
||||
pva_kmd_log_err(
|
||||
"DRAM buffer offset underflows for slot binding");
|
||||
return PVA_ERR_CMD_DRAM_BUF_OUT_OF_RANGE;
|
||||
}
|
||||
|
||||
if (slot_access_end_addr > (int64_t)max_surface_size) {
|
||||
pva_kmd_log_err("DRAM buffer too small for slot binding");
|
||||
return PVA_ERR_CMD_DRAM_BUF_OUT_OF_RANGE;
|
||||
}
|
||||
|
||||
relocs = &static_relocs[slot->reloc_start_idx];
|
||||
for (uint32_t i = 0; i < slot->reloc_count; i++) {
|
||||
struct pva_fw_dma_reloc const *reloc = &relocs[i];
|
||||
struct pva_fw_dma_descriptor *desc = &descs[reloc->desc_index];
|
||||
uint8_t *addr_hi_ptr;
|
||||
uint32_t *addr_lo_ptr;
|
||||
uint32_t format_field_shift = 0;
|
||||
uint64_t addr;
|
||||
uint64_t desc_offset_pl;
|
||||
uint64_t offset;
|
||||
|
||||
if (reloc->field == PVA_FW_DMA_RELOC_FIELD_SRC) {
|
||||
addr_hi_ptr = &desc->src_adr1;
|
||||
addr_lo_ptr = &desc->src_adr0;
|
||||
format_field_shift = 3; //SRC_TF in TRANSFER_CONTROL0
|
||||
} else if (reloc->field == PVA_FW_DMA_RELOC_FIELD_DST) {
|
||||
addr_hi_ptr = &desc->dst_adr1;
|
||||
addr_lo_ptr = &desc->dst_adr0;
|
||||
format_field_shift = 7; //DST_TF in TRANSFER_CONTROL0
|
||||
} else { /* PVA_FW_DMA_RELOC_FIELD_DST2 */
|
||||
pva_kmd_log_err("Binding DRAM buffer to DST2 slot");
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
desc_offset_pl = assemble_addr(*addr_hi_ptr, *addr_lo_ptr);
|
||||
offset = sat_add64(slot_offset_pl, desc_offset_pl);
|
||||
desc->transfer_control0 &= ~(1 << format_field_shift);
|
||||
if (is_block_linear) {
|
||||
/* We need to insert bits surface_base_addr[13:9] into
|
||||
* transfer_control2[7:3] as specified by DMA IAS. This helps the
|
||||
* HW identify the starting GOB index inside a block. */
|
||||
desc->transfer_control2 &= ~PVA_MASK(7, 3);
|
||||
desc->transfer_control2 |=
|
||||
PVA_INSERT8(PVA_EXTRACT64(surface_base_addr, 13,
|
||||
9, uint8_t),
|
||||
7, 3);
|
||||
desc->transfer_control0 |= 1 << format_field_shift;
|
||||
|
||||
offset = pva_pl_to_bl_offset(offset, line_pitch,
|
||||
log2_block_height,
|
||||
&math_error);
|
||||
if (math_error != MATH_OP_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"pva_fw_do_cmd_bind_dram_slot pva_pl_to_bl_offset triggered a math error");
|
||||
err = PVA_ERR_MATH_OP;
|
||||
goto out;
|
||||
}
|
||||
if (!pva_is_64B_aligned(offset)) {
|
||||
pva_kmd_log_err(
|
||||
"Descriptor starting address is not aligned to 64 bytes");
|
||||
err = PVA_BAD_DESC_ADDR_ALIGNMENT;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
addr = sat_add64(surface_base_addr, offset);
|
||||
addr |= (sector_pack_format << PVA_BL_SECTOR_PACK_BIT_SHIFT);
|
||||
*addr_hi_ptr = iova_hi(addr);
|
||||
*addr_lo_ptr = iova_lo(addr);
|
||||
}
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
bind_static_vmem_slot(struct pva_dma_config_resource *dma_config,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux,
|
||||
struct pva_fw_dma_slot const *slot,
|
||||
struct pva_fw_dma_reloc const *static_relocs,
|
||||
struct pva_dma_vmem_binding const *vmem_bd)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_fw_dma_descriptor *descs =
|
||||
pva_dma_config_get_descriptors(dma_config);
|
||||
struct pva_kmd_vpu_bin_resource *vpu_bin;
|
||||
struct pva_symbol_info *sym;
|
||||
uint32_t buffer_size, buffer_addr;
|
||||
struct pva_fw_dma_reloc const *relocs;
|
||||
enum pva_symbol_type needed_sym_type;
|
||||
|
||||
if (slot->flags & PVA_FW_DMA_SLOT_FLAG_VMEM_DATA) {
|
||||
needed_sym_type = PVA_SYM_TYPE_DATA;
|
||||
} else if (slot->flags & PVA_FW_DMA_SLOT_FLAG_VMEM_VPUC_TABLE) {
|
||||
needed_sym_type = PVA_SYM_TYPE_VPUC_TABLE;
|
||||
} else {
|
||||
pva_kmd_log_err("Unexpected VMEM slot flags");
|
||||
err = PVA_INTERNAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
#if defined(WAR_PVAAS16267)
|
||||
needed_sym_type = PVA_SYM_TYPE_DATA;
|
||||
#endif
|
||||
|
||||
vpu_bin = &pva_kmd_peek_resource(dma_aux->res_table,
|
||||
dma_aux->vpu_bin_res_id)
|
||||
->vpu_bin;
|
||||
sym = pva_kmd_get_symbol_with_type(&vpu_bin->symbol_table,
|
||||
vmem_bd->addr.symbol_id,
|
||||
needed_sym_type);
|
||||
if (sym == NULL) {
|
||||
err = PVA_INVALID_SYMBOL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
buffer_size = sat_sub32(sym->size, vmem_bd->addr.offset);
|
||||
buffer_addr = sat_add32(sym->vmem_addr, vmem_bd->addr.offset);
|
||||
|
||||
if (buffer_size < get_slot_size(slot)) {
|
||||
pva_kmd_log_err("VMEM buffer too small for slot binding");
|
||||
err = PVA_RES_OUT_OF_RANGE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
relocs = &static_relocs[slot->reloc_start_idx];
|
||||
for (uint32_t i = 0; i < slot->reloc_count; i++) {
|
||||
struct pva_fw_dma_reloc const *reloc = &relocs[i];
|
||||
struct pva_fw_dma_descriptor *desc = &descs[reloc->desc_index];
|
||||
|
||||
if (reloc->field == PVA_FW_DMA_RELOC_FIELD_SRC) {
|
||||
desc->src_adr0 = sat_add32(buffer_addr, desc->src_adr0);
|
||||
} else if (reloc->field == PVA_FW_DMA_RELOC_FIELD_DST) {
|
||||
desc->dst_adr0 = sat_add32(buffer_addr, desc->dst_adr0);
|
||||
} else {
|
||||
if (!pva_is_64B_aligned(buffer_addr)) {
|
||||
pva_kmd_log_err(
|
||||
"VMEM replication address not aligned to 64 bytes");
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
desc->frda =
|
||||
((uint16_t)(buffer_addr >> 6U) + desc->frda) &
|
||||
0x3FFF;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_bind_static_buffers(
|
||||
struct pva_dma_config_resource *fw_dma_cfg_hdr,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux,
|
||||
struct pva_fw_dma_slot const *static_slots, uint16_t num_static_slots,
|
||||
struct pva_fw_dma_reloc const *static_relocs,
|
||||
struct pva_dma_static_binding const *static_bindings,
|
||||
uint32_t num_static_bindings)
|
||||
{
|
||||
uint32_t slot_id;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
if (num_static_bindings != num_static_slots) {
|
||||
pva_kmd_log_err("Invalid number of static bindings");
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Reset BL status for each channel
|
||||
fw_dma_cfg_hdr->ch_block_height_fixed_mask = 0U;
|
||||
|
||||
for (slot_id = 0U; slot_id < num_static_slots; slot_id++) {
|
||||
struct pva_fw_dma_slot const *st_slot = &static_slots[slot_id];
|
||||
struct pva_dma_static_binding const *binding =
|
||||
&static_bindings[slot_id];
|
||||
|
||||
if (binding->type == PVA_DMA_STATIC_BINDING_DRAM) {
|
||||
err = bind_static_dram_slot(fw_dma_cfg_hdr, dma_aux,
|
||||
st_slot, static_relocs,
|
||||
&binding->dram);
|
||||
|
||||
} else { // PVA_FW_DMA_SLOT_FLAG_VMEM
|
||||
err = bind_static_vmem_slot(fw_dma_cfg_hdr, dma_aux,
|
||||
st_slot, static_relocs,
|
||||
&binding->vmem);
|
||||
}
|
||||
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
@@ -0,0 +1,821 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_hwseq_validate.h"
|
||||
#include "pva_api.h"
|
||||
#include "pva_kmd_dma_cfg.h"
|
||||
#include "pva_api_dma.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
struct pva_fw_dma_reloc_slot_info {
|
||||
struct pva_fw_dma_slot *slots;
|
||||
struct pva_fw_dma_reloc *relocs;
|
||||
uint16_t num_slots;
|
||||
uint8_t *reloc_off;
|
||||
};
|
||||
struct pva_fw_dma_reloc_slots {
|
||||
struct pva_fw_dma_reloc_slot_info dyn_slot;
|
||||
struct pva_fw_dma_reloc_slot_info static_slot;
|
||||
};
|
||||
|
||||
static enum pva_error
|
||||
validate_channel_mapping(struct pva_dma_config const *out_cfg,
|
||||
struct pva_kmd_hw_constants const *hw_consts)
|
||||
{
|
||||
struct pva_dma_channel *channel;
|
||||
struct pva_dma_config_header const *cfg_hdr = &out_cfg->header;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
|
||||
for (uint8_t i = 0U; i < cfg_hdr->num_channels; i++) {
|
||||
channel = &out_cfg->channels[i];
|
||||
if ((channel->desc_index >= out_cfg->header.num_descriptors) ||
|
||||
(pva_is_reserved_desc(channel->desc_index))) {
|
||||
pva_kmd_log_err(
|
||||
"ERR: Invalid Channel Descriptor Index");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
if (addu8(channel->vdb_count, channel->vdb_offset, &math_err) >
|
||||
PVA_NUM_DYNAMIC_VDB_BUFFS) {
|
||||
pva_kmd_log_err("ERR: Invalid Channel control data");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
if (addu16(channel->adb_count, channel->adb_offset, &math_err) >
|
||||
hw_consts->n_dynamic_adb_buffs) {
|
||||
pva_kmd_log_err("ERR: Invalid ADB Buff Size or Offset");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
}
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
pva_kmd_log_err("validate_channel_mapping math error");
|
||||
return PVA_ERR_MATH_OP;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static enum pva_error validate_padding(struct pva_dma_descriptor *desc)
|
||||
{
|
||||
if ((desc->px != 0U) && (desc->px >= desc->tx)) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if ((desc->py != 0U) && (desc->py >= desc->ty)) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static bool is_valid_vpu_trigger_mode(struct pva_dma_descriptor *desc)
|
||||
{
|
||||
bool valid = true;
|
||||
if (desc->trig_event_mode != 0U) {
|
||||
switch (desc->trig_vpu_events) {
|
||||
case PVA_DMA_NO_TRIG:
|
||||
//HW Sequencer check
|
||||
break;
|
||||
case PVA_DMA_TRIG_VPU_CFG:
|
||||
if (desc->src.transfer_mode !=
|
||||
PVA_DMA_TRANS_MODE_VPUCFG) {
|
||||
valid = false;
|
||||
}
|
||||
break;
|
||||
case PVA_DMA_TRIG_READ0:
|
||||
case PVA_DMA_TRIG_READ1:
|
||||
case PVA_DMA_TRIG_READ2:
|
||||
case PVA_DMA_TRIG_READ3:
|
||||
case PVA_DMA_TRIG_READ4:
|
||||
case PVA_DMA_TRIG_READ5:
|
||||
case PVA_DMA_TRIG_READ6:
|
||||
if ((desc->src.transfer_mode !=
|
||||
(uint8_t)PVA_DMA_TRANS_MODE_VPUCFG) &&
|
||||
(desc->dst.transfer_mode !=
|
||||
(uint8_t)PVA_DMA_TRANS_MODE_VMEM)) {
|
||||
valid = false;
|
||||
}
|
||||
break;
|
||||
case PVA_DMA_TRIG_WRITE0:
|
||||
case PVA_DMA_TRIG_WRITE1:
|
||||
case PVA_DMA_TRIG_WRITE2:
|
||||
case PVA_DMA_TRIG_WRITE3:
|
||||
case PVA_DMA_TRIG_WRITE4:
|
||||
case PVA_DMA_TRIG_WRITE5:
|
||||
case PVA_DMA_TRIG_WRITE6:
|
||||
if ((desc->src.transfer_mode !=
|
||||
(uint8_t)PVA_DMA_TRANS_MODE_VPUCFG) &&
|
||||
(desc->src.transfer_mode !=
|
||||
(uint8_t)PVA_DMA_TRANS_MODE_VMEM)) {
|
||||
valid = false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return valid;
|
||||
}
|
||||
|
||||
static bool validate_src_dst_adv_val(struct pva_dma_descriptor *desc,
|
||||
bool relax_dim3_check)
|
||||
{
|
||||
uint8_t is_any_rpt_zero = 0U;
|
||||
|
||||
is_any_rpt_zero = desc->src.rpt1 & desc->src.rpt2 & desc->dst.rpt1 &
|
||||
desc->dst.rpt2;
|
||||
|
||||
if ((desc->trig_event_mode == (uint8_t)PVA_DMA_TRIG_MODE_4TH_DIM) &&
|
||||
(is_any_rpt_zero == 0U)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (desc->trig_event_mode == ((uint8_t)PVA_DMA_TRIG_MODE_3RD_DIM)) {
|
||||
if (false == relax_dim3_check) {
|
||||
if (((desc->src.rpt1 == 0U) &&
|
||||
(desc->dst.rpt1 == 0U))) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (((desc->dst.rpt1 == 0U) ||
|
||||
(desc->src.rpt1 > desc->dst.rpt1))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
validate_dma_desc_trans_cntl2(struct pva_dma_descriptor *desc)
|
||||
{
|
||||
if ((desc->prefetch_enable != 0U) &&
|
||||
((desc->tx == 0U) || (desc->ty == 0U) ||
|
||||
(desc->src.transfer_mode != (uint32_t)PVA_DMA_TRANS_MODE_DRAM) ||
|
||||
(desc->dst.transfer_mode != (uint32_t)PVA_DMA_TRANS_MODE_VMEM))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
validate_descriptor(struct pva_dma_descriptor *desc,
|
||||
struct pva_dma_config_header const *cfg_hdr)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
err = validate_padding(desc);
|
||||
if ((desc->dst.transfer_mode == PVA_DMA_TRANS_MODE_VMEM) &&
|
||||
(err != PVA_SUCCESS)) {
|
||||
return err;
|
||||
}
|
||||
|
||||
if (!(is_valid_vpu_trigger_mode(desc))) {
|
||||
pva_kmd_log_err("Bad trigger");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
/** Check src/dstADV values with respect to ECET bits */
|
||||
if (false == validate_src_dst_adv_val(desc, false)) {
|
||||
pva_kmd_log_err(
|
||||
"Invalid src/dst ADV values with respect to ECET");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
/* DMA_DESC_TRANS CNTL2 */
|
||||
if (PVA_SUCCESS != validate_dma_desc_trans_cntl2(desc)) {
|
||||
pva_kmd_log_err("Bad trans cntl 2");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
/* DMA_DESC_LDID */
|
||||
if ((desc->link_desc_id > cfg_hdr->num_descriptors) ||
|
||||
((desc->link_desc_id != 0) &&
|
||||
pva_is_reserved_desc(desc->link_desc_id - PVA_DMA_DESC0))) {
|
||||
pva_kmd_log_err("ERR: Invalid linker Desc ID");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_dma_config_header_valid(struct pva_dma_config_header const *cfg_hdr,
|
||||
struct pva_kmd_hw_constants const *hw_consts)
|
||||
{
|
||||
if (((cfg_hdr->base_descriptor + cfg_hdr->num_descriptors) >
|
||||
hw_consts->n_dma_descriptors) ||
|
||||
((cfg_hdr->base_channel + cfg_hdr->num_channels) >
|
||||
(hw_consts->n_user_dma_channels + 1U)) ||
|
||||
((cfg_hdr->base_hwseq_word + cfg_hdr->num_hwseq_words) >
|
||||
hw_consts->n_hwseq_words) ||
|
||||
(cfg_hdr->num_static_slots > PVA_KMD_MAX_NUM_DMA_SLOTS) ||
|
||||
(cfg_hdr->num_dynamic_slots > PVA_KMD_MAX_NUM_DMA_RELOCS) ||
|
||||
(cfg_hdr->base_channel == 0U)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_parse_dma_config(void *dma_config, uint32_t dma_config_size,
|
||||
struct pva_dma_config *out_cfg,
|
||||
struct pva_kmd_hw_constants const *hw_consts)
|
||||
{
|
||||
struct pva_dma_config_header const *cfg_hdr = dma_config;
|
||||
uintptr_t offset = 0;
|
||||
|
||||
if (dma_config_size < sizeof(*cfg_hdr)) {
|
||||
pva_kmd_log_err("DMA configuration too small");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
out_cfg->header = *cfg_hdr;
|
||||
if (!(is_dma_config_header_valid(cfg_hdr, hw_consts))) {
|
||||
pva_kmd_log_err("Invalid PVA DMA Configuration Header");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
offset += PVA_ALIGN8(sizeof(*cfg_hdr));
|
||||
|
||||
out_cfg->hwseq_words = pva_offset_pointer(dma_config, offset);
|
||||
offset += PVA_ALIGN8(cfg_hdr->num_hwseq_words *
|
||||
sizeof(*out_cfg->hwseq_words));
|
||||
|
||||
out_cfg->channels = pva_offset_pointer(dma_config, offset);
|
||||
offset +=
|
||||
PVA_ALIGN8(cfg_hdr->num_channels * sizeof(*out_cfg->channels));
|
||||
|
||||
out_cfg->descriptors = pva_offset_pointer(dma_config, offset);
|
||||
offset += PVA_ALIGN8(cfg_hdr->num_descriptors *
|
||||
sizeof(*out_cfg->descriptors));
|
||||
|
||||
out_cfg->static_bindings = pva_offset_pointer(dma_config, offset);
|
||||
offset += PVA_ALIGN8(cfg_hdr->num_static_slots *
|
||||
sizeof(*out_cfg->static_bindings));
|
||||
|
||||
if (offset > dma_config_size) {
|
||||
pva_kmd_log_err("DMA configuration is smaller than expected");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
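The parser above implies a packed, 8-byte-aligned section layout (header, hwseq words, channels, descriptors, static bindings). A hedged caller-side size estimate, assuming 32-bit hwseq words since their exact type is not visible here:

/* Sketch only; mirrors the offsets computed by pva_kmd_parse_dma_config(). */
static uint32_t example_dma_config_size(struct pva_dma_config_header const *hdr)
{
	uint32_t size = PVA_ALIGN8(sizeof(*hdr));

	size += PVA_ALIGN8(hdr->num_hwseq_words * sizeof(uint32_t)); /* assumed word width */
	size += PVA_ALIGN8(hdr->num_channels * sizeof(struct pva_dma_channel));
	size += PVA_ALIGN8(hdr->num_descriptors * sizeof(struct pva_dma_descriptor));
	size += PVA_ALIGN8(hdr->num_static_slots * sizeof(struct pva_dma_static_binding));
	return size;
}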
|
||||
|
||||
static enum pva_error
|
||||
validate_descriptors(struct pva_dma_config const *dma_config)
|
||||
{
|
||||
uint32_t i = 0U;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_dma_config_header const *cfg_hdr = &dma_config->header;
|
||||
struct pva_dma_descriptor *desc;
|
||||
|
||||
for (i = 0; i < cfg_hdr->num_descriptors; i++) {
|
||||
if (pva_is_reserved_desc(i)) {
|
||||
// skip over the reserved descriptor range
|
||||
i = PVA_RESERVED_DESCRIPTORS_END;
|
||||
continue;
|
||||
}
|
||||
|
||||
desc = &dma_config->descriptors[i];
|
||||
err = validate_descriptor(desc, cfg_hdr);
|
||||
if (err != PVA_SUCCESS) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_validate_dma_config(struct pva_dma_config const *dma_config,
|
||||
struct pva_kmd_hw_constants const *hw_consts,
|
||||
struct pva_kmd_dma_access *access_sizes,
|
||||
uint64_t *hw_dma_descs_mask)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
err = validate_channel_mapping(dma_config, hw_consts);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Bad Channels");
|
||||
return err;
|
||||
}
|
||||
|
||||
err = validate_descriptors(dma_config);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Bad Descriptors");
|
||||
return err;
|
||||
}
|
||||
|
||||
if (dma_config->header.num_hwseq_words != 0U) {
|
||||
err = validate_hwseq(dma_config, hw_consts, access_sizes,
|
||||
hw_dma_descs_mask);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Bad HW Sequencer Blob");
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_dma_use_resources(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_resource_aux *dma_aux)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_vpu_bin_resource *vpu_bin = NULL;
|
||||
uint32_t i;
|
||||
|
||||
/* Increment reference count for VPU bin */
|
||||
if (dma_cfg->header.vpu_exec_resource_id != PVA_RESOURCE_ID_INVALID) {
|
||||
struct pva_kmd_resource_record *vpu_bin_rec;
|
||||
|
||||
vpu_bin_rec = pva_kmd_use_resource(
|
||||
dma_aux->res_table,
|
||||
dma_cfg->header.vpu_exec_resource_id);
|
||||
if (vpu_bin_rec == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"VPU exec resource id used by DMA config does not exist");
|
||||
err = PVA_INVAL;
|
||||
goto err_out;
|
||||
}
|
||||
if (vpu_bin_rec->type != PVA_RESOURCE_TYPE_EXEC_BIN) {
|
||||
pva_kmd_log_err(
|
||||
"Invalid VPU exec resource id used by DMA config");
|
||||
err = PVA_INVAL;
|
||||
goto drop_vpu_bin;
|
||||
}
|
||||
vpu_bin = &vpu_bin_rec->vpu_bin;
|
||||
}
|
||||
|
||||
dma_aux->vpu_bin_res_id = dma_cfg->header.vpu_exec_resource_id;
|
||||
|
||||
dma_aux->dram_res_count = 0;
|
||||
/* Increment reference count for all static DRAM buffers; For static
|
||||
* VMEM buffers, check that symbol ID is valid. */
|
||||
for (i = 0; i < dma_cfg->header.num_static_slots; i++) {
|
||||
struct pva_dma_static_binding const *slot_buf =
|
||||
&dma_cfg->static_bindings[i];
|
||||
|
||||
if (slot_buf->type == PVA_DMA_STATIC_BINDING_DRAM) {
|
||||
struct pva_kmd_resource_record *rec;
|
||||
|
||||
rec = pva_kmd_use_resource(dma_aux->res_table,
|
||||
slot_buf->dram.resource_id);
|
||||
if (rec == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"DRAM buffers used by DMA config do not exist");
|
||||
err = PVA_INVAL;
|
||||
goto drop_dram;
|
||||
}
|
||||
|
||||
dma_aux->static_dram_res_ids[dma_aux->dram_res_count] =
|
||||
slot_buf->dram.resource_id;
|
||||
dma_aux->dram_res_count += 1;
|
||||
|
||||
if (rec->type != PVA_RESOURCE_TYPE_DRAM) {
|
||||
pva_kmd_log_err(
|
||||
"Invalid DRAM resource id used DMA config");
|
||||
err = PVA_INVAL;
|
||||
goto drop_dram;
|
||||
}
|
||||
} else if (slot_buf->type == PVA_DMA_STATIC_BINDING_VMEM) {
|
||||
if (vpu_bin == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"VPU bin resource not found for static VMEM buffer");
|
||||
err = PVA_INVAL;
|
||||
goto drop_dram;
|
||||
}
|
||||
|
||||
if (pva_kmd_get_symbol(&vpu_bin->symbol_table,
|
||||
slot_buf->vmem.addr.symbol_id) ==
|
||||
NULL) {
|
||||
pva_kmd_log_err("Invalid VMEM symbol ID");
|
||||
err = PVA_INVAL;
|
||||
goto drop_dram;
|
||||
}
|
||||
} else {
|
||||
pva_kmd_log_err("Invalid slot buffer type");
|
||||
err = PVA_INVAL;
|
||||
goto drop_dram;
|
||||
}
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
drop_dram:
|
||||
for (i = 0; i < dma_aux->dram_res_count; i++) {
|
||||
pva_kmd_drop_resource(dma_aux->res_table,
|
||||
dma_aux->static_dram_res_ids[i]);
|
||||
}
|
||||
drop_vpu_bin:
|
||||
if (dma_aux->vpu_bin_res_id != PVA_RESOURCE_ID_INVALID) {
|
||||
pva_kmd_drop_resource(dma_aux->res_table,
|
||||
dma_aux->vpu_bin_res_id);
|
||||
}
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static uint16_t get_slot_id(uint16_t slot)
|
||||
{
|
||||
return slot & PVA_DMA_SLOT_ID_MASK;
|
||||
}
|
||||
|
||||
static uint8_t get_slot_flag(uint8_t transfer_mode, bool cb_enable)
|
||||
{
|
||||
uint8_t flags = 0;
|
||||
if (transfer_mode == PVA_DMA_TRANS_MODE_VMEM) {
|
||||
flags |= PVA_FW_DMA_SLOT_FLAG_VMEM_DATA;
|
||||
} else if (transfer_mode == PVA_DMA_TRANS_MODE_L2SRAM) {
|
||||
flags |= PVA_FW_DMA_SLOT_FLAG_L2SRAM;
|
||||
} else if (transfer_mode == PVA_DMA_TRANS_MODE_DRAM) {
|
||||
flags |= PVA_FW_DMA_SLOT_FLAG_DRAM;
|
||||
} else if (transfer_mode == PVA_DMA_TRANS_MODE_VPUCFG) {
|
||||
flags |= PVA_FW_DMA_SLOT_FLAG_VMEM_VPUC_TABLE;
|
||||
}
|
||||
|
||||
if (cb_enable) {
|
||||
flags |= PVA_FW_DMA_SLOT_FLAG_CB;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
static void update_reloc_count(uint16_t slot, uint8_t transfer_mode,
|
||||
bool cb_enable,
|
||||
struct pva_fw_dma_slot *out_static_slots,
|
||||
uint16_t num_static_slots,
|
||||
struct pva_fw_dma_slot *out_dyn_slots,
|
||||
uint16_t num_dyn_slots)
|
||||
{
|
||||
uint8_t slot_id = get_slot_id(slot);
|
||||
|
||||
if (slot & PVA_DMA_DYNAMIC_SLOT) {
|
||||
out_dyn_slots[slot_id].reloc_count =
|
||||
safe_addu16(out_dyn_slots[slot_id].reloc_count, 1U);
|
||||
out_dyn_slots[slot_id].flags |=
|
||||
get_slot_flag(transfer_mode, cb_enable);
|
||||
} else if (slot & PVA_DMA_STATIC_SLOT) {
|
||||
out_static_slots[slot_id].reloc_count =
|
||||
safe_addu16(out_static_slots[slot_id].reloc_count, 1U);
|
||||
out_static_slots[slot_id].flags |=
|
||||
get_slot_flag(transfer_mode, cb_enable);
|
||||
}
|
||||
}
|
||||
|
||||
static void count_relocs(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_fw_dma_slot *out_static_slots,
|
||||
uint16_t num_static_slots,
|
||||
struct pva_fw_dma_slot *out_dyn_slots,
|
||||
uint16_t num_dyn_slots)
|
||||
{
|
||||
uint8_t i;
|
||||
struct pva_dma_descriptor *desc;
|
||||
|
||||
for (i = 0U; i < dma_cfg->header.num_descriptors; i++) {
|
||||
if (pva_is_reserved_desc(i)) {
|
||||
// skip over the reserved descriptor range
|
||||
i = PVA_RESERVED_DESCRIPTORS_END;
|
||||
continue;
|
||||
}
|
||||
desc = &dma_cfg->descriptors[i];
|
||||
|
||||
update_reloc_count(desc->src.slot, desc->src.transfer_mode,
|
||||
desc->src.cb_enable, out_static_slots,
|
||||
num_static_slots, out_dyn_slots,
|
||||
num_dyn_slots);
|
||||
|
||||
update_reloc_count(desc->dst.slot, desc->dst.transfer_mode,
|
||||
desc->dst.cb_enable, out_static_slots,
|
||||
num_static_slots, out_dyn_slots,
|
||||
num_dyn_slots);
|
||||
|
||||
update_reloc_count(desc->dst2_slot, desc->dst.transfer_mode,
|
||||
desc->dst.cb_enable, out_static_slots,
|
||||
num_static_slots, out_dyn_slots,
|
||||
num_dyn_slots);
|
||||
}
|
||||
}
|
||||
|
||||
static void write_one_reloc(uint8_t ch_index, uint32_t desc_index,
|
||||
uint16_t slot, uint8_t transfer_mode,
|
||||
uint8_t reloc_field,
|
||||
struct pva_fw_dma_reloc_slot_info *info,
|
||||
struct pva_kmd_dma_access_entry const *access_entry)
|
||||
{
|
||||
uint16_t slot_id = get_slot_id(slot);
|
||||
uint16_t reloc_id = safe_addu16(info->slots[slot_id].reloc_start_idx,
|
||||
info->reloc_off[slot_id]);
|
||||
|
||||
int64_t old_start_addr = info->slots[slot_id].start_addr;
|
||||
int64_t old_end_addr = info->slots[slot_id].end_addr;
|
||||
|
||||
info->slots[slot_id].start_addr =
|
||||
mins64(access_entry->start_addr, old_start_addr);
|
||||
info->slots[slot_id].end_addr =
|
||||
maxs64(access_entry->end_addr, old_end_addr);
|
||||
|
||||
info->slots[slot_id].ch_use_mask |= (1U << (ch_index & 0x1FU));
|
||||
|
||||
info->relocs[reloc_id].desc_index = desc_index;
|
||||
info->relocs[reloc_id].field = reloc_field;
|
||||
|
||||
info->reloc_off[slot_id] = safe_addu8(info->reloc_off[slot_id], 1U);
|
||||
}
|
||||
|
||||
static void handle_reloc(uint16_t slot, uint8_t transfer_mode,
|
||||
struct pva_kmd_dma_access_entry const *access_entry,
|
||||
struct pva_fw_dma_reloc_slots *rel_info,
|
||||
uint8_t reloc_field, uint8_t ch_index,
|
||||
uint8_t desc_index)
|
||||
{
|
||||
if (slot & PVA_DMA_DYNAMIC_SLOT) {
|
||||
write_one_reloc(ch_index, desc_index, slot, transfer_mode,
|
||||
reloc_field, &rel_info->dyn_slot, access_entry);
|
||||
} else if (slot & PVA_DMA_STATIC_SLOT) {
|
||||
write_one_reloc(ch_index, desc_index, slot, transfer_mode,
|
||||
reloc_field, &rel_info->static_slot,
|
||||
access_entry);
|
||||
}
|
||||
}
|
||||
|
||||
static void write_relocs(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_access const *access_sizes,
|
||||
struct pva_fw_dma_reloc_slots *rel_info,
|
||||
uint8_t const *desc_to_ch)
|
||||
{
|
||||
uint32_t i;
|
||||
uint16_t start_idx = 0U;
|
||||
struct pva_dma_descriptor *desc = NULL;
|
||||
uint8_t ch_index = 0U;
|
||||
|
||||
for (i = 0U; i < rel_info->dyn_slot.num_slots; i++) {
|
||||
rel_info->dyn_slot.slots[i].reloc_start_idx = start_idx;
|
||||
start_idx = safe_addu16(
|
||||
start_idx, rel_info->dyn_slot.slots[i].reloc_count);
|
||||
}
|
||||
|
||||
for (i = 0U; i < rel_info->static_slot.num_slots; i++) {
|
||||
rel_info->static_slot.slots[i].reloc_start_idx = start_idx;
|
||||
start_idx = safe_addu16(
|
||||
start_idx, rel_info->static_slot.slots[i].reloc_count);
|
||||
}
|
||||
|
||||
for (i = 0U; i < dma_cfg->header.num_descriptors; i++) {
|
||||
if (pva_is_reserved_desc(i)) {
|
||||
// skip over the reserved descriptor range
|
||||
i = PVA_RESERVED_DESCRIPTORS_END;
|
||||
continue;
|
||||
}
|
||||
desc = &dma_cfg->descriptors[i];
|
||||
ch_index = desc_to_ch[i];
|
||||
|
||||
handle_reloc(desc->src.slot, desc->src.transfer_mode,
|
||||
&access_sizes[i].src, rel_info,
|
||||
PVA_FW_DMA_RELOC_FIELD_SRC, ch_index, i);
|
||||
handle_reloc(desc->dst.slot, desc->dst.transfer_mode,
|
||||
&access_sizes[i].dst, rel_info,
|
||||
PVA_FW_DMA_RELOC_FIELD_DST, ch_index, i);
|
||||
handle_reloc(desc->dst2_slot, desc->dst.transfer_mode,
|
||||
&access_sizes[i].dst2, rel_info,
|
||||
PVA_FW_DMA_RELOC_FIELD_DST2, ch_index, i);
|
||||
}
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
validate_descriptor_tile_and_padding(struct pva_dma_descriptor *desc,
|
||||
bool is_dst)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
if (desc->ty == 0U) {
|
||||
err = PVA_INVALID_DMA_CONFIG;
|
||||
return err;
|
||||
}
|
||||
|
||||
if (!is_dst) {
|
||||
if ((desc->tx <= desc->px) || (desc->ty <= desc->py)) {
|
||||
// invalid tile size/padding config
|
||||
err = PVA_INVALID_DMA_CONFIG;
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static enum pva_error get_access_size(struct pva_dma_descriptor *desc,
|
||||
struct pva_kmd_dma_access_entry *entry,
|
||||
bool is_dst,
|
||||
struct pva_kmd_dma_access_entry *dst2)
|
||||
|
||||
{
|
||||
struct pva_dma_transfer_attr *attr = NULL;
|
||||
uint32_t tx = 0U;
|
||||
uint32_t ty = 0U;
|
||||
uint64_t tile_size = 0U;
|
||||
int64_t start = 0;
|
||||
int64_t end = 0;
|
||||
int32_t dim_offset = 0;
|
||||
uint32_t dim_offset_U = 0U;
|
||||
uint32_t num_bytes = 0U;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
|
||||
// early out for empty tiles
|
||||
if (desc->tx == 0U) {
|
||||
return err;
|
||||
}
|
||||
|
||||
err = validate_descriptor_tile_and_padding(desc, is_dst);
|
||||
if (err != PVA_SUCCESS) {
|
||||
return err;
|
||||
}
|
||||
|
||||
if (is_dst) {
|
||||
attr = &desc->dst;
|
||||
tx = desc->tx;
|
||||
ty = desc->ty;
|
||||
} else {
|
||||
attr = &desc->src;
|
||||
tx = subu32((uint32_t)desc->tx, (uint32_t)desc->px, &math_err);
|
||||
ty = subu32((uint32_t)desc->ty, (uint32_t)desc->py, &math_err);
|
||||
}
|
||||
|
||||
if (attr->offset > (uint64_t)(MAX_INT64)) {
|
||||
err = PVA_INVALID_DMA_CONFIG;
|
||||
pva_kmd_log_err("Offset is too large");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
dim_offset_U = mulu32((uint32_t)(attr->line_pitch),
|
||||
subu32(ty, 1U, &math_err), &math_err);
|
||||
|
||||
if (attr->cb_enable != 0U) {
|
||||
tile_size = addu32(dim_offset_U, tx, &math_err);
|
||||
tile_size = tile_size
|
||||
<< (desc->log2_pixel_size & MAX_BYTES_PER_PIXEL);
|
||||
|
||||
if (tile_size > attr->cb_size) {
|
||||
pva_kmd_log_err(
|
||||
"Tile size is bigger than circular buffer size");
|
||||
err = PVA_INVALID_DMA_CONFIG;
|
||||
}
|
||||
start = 0LL;
|
||||
end = (int64_t)attr->cb_size;
|
||||
goto end;
|
||||
}
|
||||
|
||||
end += adds64((int64_t)dim_offset_U, (int64_t)tx, &math_err);
|
||||
|
||||
// 3rd dim
|
||||
dim_offset = muls32((attr->adv1), (int32_t)(attr->rpt1), &math_err);
|
||||
start += mins32(dim_offset, 0);
|
||||
end += maxs32(dim_offset, 0);
|
||||
// 4th dim
|
||||
dim_offset = muls32((attr->adv2), (int32_t)(attr->rpt2), &math_err);
|
||||
start += mins32(dim_offset, 0);
|
||||
end += maxs32(dim_offset, 0);
|
||||
// 5th dim
|
||||
dim_offset = muls32((attr->adv3), (int32_t)(attr->rpt3), &math_err);
|
||||
start += mins32(dim_offset, 0);
|
||||
end += maxs32(dim_offset, 0);
|
||||
// convert to byte range
|
||||
num_bytes =
|
||||
((uint32_t)1U << (desc->log2_pixel_size & MAX_BYTES_PER_PIXEL));
|
||||
start *= (int64_t)num_bytes;
|
||||
end *= (int64_t)num_bytes;
|
||||
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
err = PVA_ERR_MATH_OP;
|
||||
pva_kmd_log_err("get_access_size math error");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
end:
|
||||
entry->start_addr =
|
||||
adds64(mins64(start, end), convert_to_signed_s64(attr->offset),
|
||||
&math_err);
|
||||
entry->end_addr =
|
||||
adds64(maxs64(start, end), convert_to_signed_s64(attr->offset),
|
||||
&math_err);
|
||||
|
||||
if (is_dst) {
|
||||
dst2->start_addr =
|
||||
adds64(mins64(start, end), (int64_t)desc->dst2_offset,
|
||||
&math_err);
|
||||
|
||||
dst2->end_addr = adds64(maxs64(start, end),
|
||||
(int64_t)desc->dst2_offset, &math_err);
|
||||
}
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
err = PVA_ERR_MATH_OP;
|
||||
pva_kmd_log_err("get_access_size math error");
|
||||
}
|
||||
err_out:
|
||||
return err;
|
||||
}
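For intuition, the range computed above for a plain pitch-linear transfer works out as follows (all values assumed for illustration, not taken from the driver):

/* Example: tx = 64, ty = 16, px = py = 0, line_pitch = 128, 1 byte/pixel
 * (log2_pixel_size = 0), no circular buffer, rpt1 = 3, adv1 = 2048,
 * rpt2 = rpt3 = 0:
 *
 *   end  = line_pitch * (ty - 1) + tx = 128 * 15 + 64 = 1984
 *   dim3 = adv1 * rpt1                = 2048 * 3      = 6144  (positive -> end)
 *   end  = 1984 + 6144                = 8128 bytes, start = 0
 *
 * so the descriptor accesses offset .. offset + 8128 within the bound buffer.
 */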
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_compute_dma_access(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_access *access_sizes,
|
||||
uint64_t *hw_dma_descs_mask)
|
||||
{
|
||||
uint32_t i;
|
||||
struct pva_dma_descriptor *desc = NULL;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
bool skip_swseq_size_compute = false;
|
||||
|
||||
for (i = 0; i < dma_cfg->header.num_descriptors; i++) {
|
||||
/**
|
||||
* Check if DMA descriptor has been used in HW Sequencer.
|
||||
* If used, skip_swseq_size_compute = true
|
||||
* else skip_swseq_size_compute = false
|
||||
*
|
||||
* If skip_swseq_size_compute == true then set access_sizes to 0
|
||||
* else go ahead with the access_sizes calculation.
|
||||
*/
|
||||
skip_swseq_size_compute = ((hw_dma_descs_mask[i / 64ULL] &
|
||||
(1ULL << (i & 0x3FU))) != 0ULL);
|
||||
if (pva_is_reserved_desc(i)) {
|
||||
// skip over the reserved descriptor range
|
||||
i = PVA_RESERVED_DESCRIPTORS_END;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (skip_swseq_size_compute == true) {
|
||||
continue;
|
||||
}
|
||||
|
||||
desc = &dma_cfg->descriptors[i];
|
||||
|
||||
//Calculate src_size
|
||||
err = get_access_size(desc, &access_sizes[i].src, false,
|
||||
&access_sizes[i].dst2);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
//Calculate dst_size
|
||||
err = get_access_size(desc, &access_sizes[i].dst, true,
|
||||
&access_sizes[i].dst2);
|
||||
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_collect_relocs(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_kmd_dma_access const *access_sizes,
|
||||
struct pva_fw_dma_slot *out_static_slots,
|
||||
uint16_t num_static_slots,
|
||||
struct pva_fw_dma_reloc *out_static_relocs,
|
||||
struct pva_fw_dma_slot *out_dyn_slots,
|
||||
uint16_t num_dyn_slots,
|
||||
struct pva_fw_dma_reloc *out_dyn_relocs,
|
||||
uint8_t const *desc_to_ch)
|
||||
{
|
||||
struct pva_fw_dma_reloc_slots rel_info = { 0 };
|
||||
uint8_t static_reloc_off[PVA_MAX_NUM_DMA_DESC * 3];
|
||||
uint8_t dyn_reloc_off[PVA_MAX_NUM_DMA_DESC * 3];
|
||||
|
||||
memset(out_static_slots, 0,
|
||||
num_static_slots * sizeof(*out_static_slots));
|
||||
memset(out_dyn_slots, 0, num_dyn_slots * sizeof(*out_dyn_slots));
|
||||
|
||||
/* First pass: count the number of relocates for each slot */
|
||||
count_relocs(dma_cfg, out_static_slots, num_static_slots, out_dyn_slots,
|
||||
num_dyn_slots);
|
||||
|
||||
memset(static_reloc_off, 0U, sizeof(static_reloc_off));
|
||||
memset(dyn_reloc_off, 0U, sizeof(dyn_reloc_off));
|
||||
|
||||
rel_info.dyn_slot.slots = out_dyn_slots;
|
||||
rel_info.dyn_slot.relocs = out_dyn_relocs;
|
||||
rel_info.dyn_slot.num_slots = num_dyn_slots;
|
||||
rel_info.dyn_slot.reloc_off = dyn_reloc_off;
|
||||
|
||||
rel_info.static_slot.slots = out_static_slots;
|
||||
rel_info.static_slot.relocs = out_static_relocs;
|
||||
rel_info.static_slot.num_slots = num_static_slots;
|
||||
rel_info.static_slot.reloc_off = static_reloc_off;
|
||||
|
||||
/* Second pass: write reloc info */
|
||||
write_relocs(dma_cfg, access_sizes, &rel_info, desc_to_ch);
|
||||
}
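The two passes above amount to a prefix sum over per-slot reloc counts followed by a scatter of descriptor references. A standalone sketch of the first step, with illustrative names only:

/* Sketch: give each slot a contiguous [start_idx, start_idx + count) range
 * inside one shared relocation array. */
static void example_assign_reloc_ranges(uint16_t const *reloc_count,
					uint16_t *reloc_start_idx,
					uint16_t num_slots)
{
	uint16_t next = 0U;

	for (uint16_t i = 0U; i < num_slots; i++) {
		reloc_start_idx[i] = next;
		next = safe_addu16(next, reloc_count[i]);
	}
}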
|
||||
@@ -0,0 +1,294 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_api.h"
|
||||
#include "pva_api_types.h"
|
||||
#include "pva_kmd_dma_cfg.h"
|
||||
#include "pva_resource.h"
|
||||
#include "pva_kmd_hwseq_validate.h"
|
||||
|
||||
static void write_dma_channel(struct pva_dma_channel const *ch,
|
||||
uint8_t base_desc_index,
|
||||
struct pva_fw_dma_channel *fw_ch,
|
||||
struct pva_dma_resource_map *dma_resource_map,
|
||||
bool support_hwseq_frame_linking)
|
||||
{
|
||||
/* DMA_CHANNEL_CNTL0_CHSDID: DMA_CHANNEL_CNTL0[0] = descIndex + 1;*/
|
||||
fw_ch->cntl0 =
|
||||
(((ch->desc_index + base_desc_index + 1U) & 0xFFU) << 0U);
|
||||
|
||||
/* DMA_CHANNEL_CNTL0_CHVMEMOREQ */
|
||||
fw_ch->cntl0 |= ((ch->vdb_count & 0xFFU) << 8U);
|
||||
|
||||
/* DMA_CHANNEL_CNTL0_CHBH */
|
||||
fw_ch->cntl0 |= ((ch->adb_count & 0x1FFU) << 16U);
|
||||
|
||||
/* DMA_CHANNEL_CNTL0_CHPREF */
|
||||
fw_ch->cntl0 |= ((ch->prefetch_enable & 1U) << 30U);
|
||||
|
||||
/* DMA_CHANNEL_CNTL1_CHPWT */
|
||||
fw_ch->cntl1 = ((ch->req_per_grant & 0x7U) << 2U);
|
||||
|
||||
/* DMA_CHANNEL_CNTL1_CHVDBSTART */
|
||||
fw_ch->cntl1 |= ((ch->vdb_offset & 0x7FU) << 16U);
|
||||
|
||||
/* DMA_CHANNEL_CNTL1_CHADBSTART */
|
||||
fw_ch->cntl1 |= ((ch->adb_offset & 0x1FFU) << 23U);
|
||||
|
||||
fw_ch->boundary_pad = ch->pad_value;
|
||||
|
||||
fw_ch->cntl1 |= ((ch->ch_rep_factor & 0x7U) << 8U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQCNTL_CHHWSEQSTART */
|
||||
fw_ch->hwseqcntl = ((ch->hwseq_start & 0x1FFU) << 0U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQCNTL_CHHWSEQEND */
|
||||
fw_ch->hwseqcntl |= ((ch->hwseq_end & 0x1FFU) << 12U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQCNTL_CHHWSEQTD */
|
||||
fw_ch->hwseqcntl |= ((ch->hwseq_trigger_done & 0x3U) << 24U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQCNTL_CHHWSEQTS */
|
||||
fw_ch->hwseqcntl |= ((ch->hwseq_tx_select & 0x1U) << 27U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQCNTL_CHHWSEQTO */
|
||||
fw_ch->hwseqcntl |= ((ch->hwseq_traversal_order & 0x1U) << 30U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQCNTL_CHHWSEQEN */
|
||||
fw_ch->hwseqcntl |= ((ch->hwseq_enable & 0x1U) << 31U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQFSCNTL_CHHWSEQFCNT*/
|
||||
fw_ch->hwseqfscntl |=
|
||||
(((uint32_t)ch->hwseq_con_frame_seq & 0x1U) << 0U);
|
||||
|
||||
/* DMA_CHANNEL_HWSEQFSCNTL_CHHWSEQCFS*/
|
||||
fw_ch->hwseqfscntl |=
|
||||
(((uint32_t)ch->hwseq_frame_count & 0x3FU) << 16U);
|
||||
|
||||
pva_dma_resource_map_add_adbs(dma_resource_map, ch->adb_offset,
|
||||
ch->adb_count);
|
||||
}
|
||||
|
||||
static uint32_t assemble_rpt_cntl(uint8_t rpt, uint32_t adv)
|
||||
{
|
||||
return PVA_INSERT(rpt, 31, 24) | PVA_INSERT(adv, 23, 0);
|
||||
}
|
||||
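For reference, here is a minimal standalone sketch of the packing performed by assemble_rpt_cntl() above, assuming PVA_INSERT(v, msb, lsb) places v into bits msb..lsb (DEMO_INSERT below is a hypothetical stand-in, not the driver's macro):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for PVA_INSERT(v, msb, lsb): place v into bits msb..lsb. */
#define DEMO_INSERT(v, msb, lsb) \
	((((uint32_t)(v)) & ((1U << ((msb) - (lsb) + 1U)) - 1U)) << (lsb))

int main(void)
{
	/* rpt = 3 into bits 31:24, adv = 0x100 into bits 23:0 -> 0x03000100 */
	uint32_t cntl = DEMO_INSERT(3U, 31, 24) | DEMO_INSERT(0x100U, 23, 0);

	printf("0x%08X\n", cntl);
	return 0;
}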
|
||||
static void write_dma_descriptor(struct pva_dma_descriptor const *desc,
|
||||
struct pva_fw_dma_descriptor *fw_desc)
|
||||
{
|
||||
fw_desc->src_adr0 = iova_lo(desc->src.offset);
|
||||
fw_desc->src_adr1 = iova_hi(desc->src.offset);
|
||||
|
||||
fw_desc->dst_adr0 = iova_lo(desc->dst.offset);
|
||||
fw_desc->dst_adr1 = iova_hi(desc->dst.offset);
|
||||
|
||||
/* DMA_DESC_TRANS CNTL0 */
|
||||
fw_desc->transfer_control0 = PVA_INSERT(desc->src.transfer_mode, 2, 0) |
|
||||
PVA_INSERT(desc->dst.transfer_mode, 6, 4);
|
||||
/* DMA_DESC_TRANS CNTL1 */
|
||||
fw_desc->transfer_control1 =
|
||||
PVA_INSERT(desc->log2_pixel_size, 1, 0) |
|
||||
PVA_INSERT(desc->px_direction, 2, 2) |
|
||||
PVA_INSERT(desc->py_direction, 3, 3) |
|
||||
PVA_INSERT(desc->boundary_pixel_extension, 4, 4) |
|
||||
PVA_INSERT(desc->tts, 5, 5) |
|
||||
PVA_INSERT(desc->trans_true_completion, 7, 7);
|
||||
/* DMA_DESC_TRANS CNTL2 */
|
||||
fw_desc->transfer_control2 = PVA_INSERT(desc->prefetch_enable, 0, 0) |
|
||||
PVA_INSERT(desc->dst.cb_enable, 1, 1) |
|
||||
PVA_INSERT(desc->src.cb_enable, 2, 2);
|
||||
|
||||
fw_desc->link_did = desc->link_desc_id;
|
||||
|
||||
/* DMA_DESC_TX */
|
||||
fw_desc->tx = desc->tx;
|
||||
/* DMA_DESC_TY */
|
||||
fw_desc->ty = desc->ty;
|
||||
/* DMA_DESC_DLP_ADV */
|
||||
fw_desc->dlp_adv = desc->dst.line_pitch;
|
||||
/* DMA_DESC_SLP_ADV */
|
||||
fw_desc->slp_adv = desc->src.line_pitch;
|
||||
/* DMA_DESC_DB_START */
|
||||
fw_desc->db_start = desc->dst.cb_start;
|
||||
/* DMA_DESC_DB_SIZE */
|
||||
fw_desc->db_size = desc->dst.cb_size;
|
||||
/* DMA_DESC_SB_START */
|
||||
fw_desc->sb_start = desc->src.cb_start;
|
||||
/* DMA_DESC_SB_SIZE */
|
||||
fw_desc->sb_size = desc->src.cb_size;
|
||||
/* DMA_DESC_TRIG_CH */
|
||||
/* Channel events are not supported */
|
||||
fw_desc->trig_ch_events = 0U;
|
||||
/* DMA_DESC_HW_SW_TRIG */
|
||||
fw_desc->hw_sw_trig_events =
|
||||
PVA_INSERT(desc->trig_event_mode, 1, 0) |
|
||||
PVA_INSERT(desc->trig_vpu_events, 5, 2) |
|
||||
PVA_INSERT(desc->desc_reload_enable, 12, 12);
|
||||
/* DMA_DESC_PX */
|
||||
fw_desc->px = desc->px;
|
||||
/* DMA_DESC_PY */
|
||||
fw_desc->py = desc->py;
|
||||
/* DMA_DESC_FRDA */
|
||||
fw_desc->frda = ((desc->dst2_offset >> 6U) & 0x3FFF);
|
||||
|
||||
/* DMA_DESC_NDTM_CNTL0 */
|
||||
fw_desc->cb_ext = (((desc->src.cb_start >> 16) & 0x1) << 0) |
|
||||
(((desc->dst.cb_start >> 16) & 0x1) << 2) |
|
||||
(((desc->src.cb_size >> 16) & 0x1) << 4) |
|
||||
(((desc->dst.cb_size >> 16) & 0x1) << 6);
|
||||
|
||||
/* DMA_DESC_NS1_ADV & DMA_DESC_ST1_ADV */
|
||||
fw_desc->srcpt1_cntl =
|
||||
assemble_rpt_cntl(desc->src.rpt1, desc->src.adv1);
|
||||
fw_desc->srcpt2_cntl =
|
||||
assemble_rpt_cntl(desc->src.rpt2, desc->src.adv2);
|
||||
fw_desc->srcpt3_cntl =
|
||||
assemble_rpt_cntl(desc->src.rpt3, desc->src.adv3);
|
||||
fw_desc->dstpt1_cntl =
|
||||
assemble_rpt_cntl(desc->dst.rpt1, desc->dst.adv1);
|
||||
fw_desc->dstpt2_cntl =
|
||||
assemble_rpt_cntl(desc->dst.rpt2, desc->dst.adv2);
|
||||
fw_desc->dstpt3_cntl =
|
||||
assemble_rpt_cntl(desc->dst.rpt3, desc->dst.adv3);
|
||||
}
|
||||
|
||||
static void write_triggers(struct pva_dma_config const *dma_cfg,
|
||||
struct pva_dma_config_resource *fw_cfg,
|
||||
struct pva_dma_resource_map *dma_resource_map)
|
||||
{
|
||||
uint32_t i, j;
|
||||
bool trigger_required = false;
|
||||
|
||||
memset(fw_cfg->output_enable, 0, sizeof(fw_cfg->output_enable));
|
||||
|
||||
for (i = 0; i < dma_cfg->header.num_channels; i++) {
|
||||
struct pva_dma_channel const *ch = &dma_cfg->channels[i];
|
||||
uint8_t ch_num = i + dma_cfg->header.base_channel;
|
||||
uint32_t mask;
|
||||
|
||||
mask = ch->output_enable_mask;
|
||||
/* READ/STORE triggers */
|
||||
for (j = 0; j < 7; j++) {
|
||||
fw_cfg->output_enable[j] |=
|
||||
(((mask >> 2 * j) & 1U) << ch_num);
|
||||
fw_cfg->output_enable[j] |=
|
||||
(((mask >> (2 * j + 1)) & 1U)
|
||||
<< (ch_num + 16U));
|
||||
}
|
||||
|
||||
/* VPU config trigger */
|
||||
fw_cfg->output_enable[7] |= (((mask >> 14) & 1U) << ch_num);
|
||||
/* HWSEQ trigger */
|
||||
fw_cfg->output_enable[8] |= (((mask >> 15) & 1U) << ch_num);
|
||||
fw_cfg->output_enable[8] |=
|
||||
(((mask >> 16) & 1U) << (ch_num + 16U));
|
||||
|
||||
if (mask != 0) {
|
||||
trigger_required = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (trigger_required) {
|
||||
pva_dma_resource_map_add_triggers(dma_resource_map);
|
||||
}
|
||||
}
|
||||
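As a worked reading of the loop above (illustrative only): for a channel with ch_num = 2 whose output_enable_mask has bits 0 and 1 set, iteration j = 0 sets bit 2 of output_enable[0] (the read trigger) and bit 18 (2 + 16) of output_enable[0] (the store trigger); mask bit 14 would instead land in output_enable[7] (the VPU config trigger) and mask bit 15 in output_enable[8] (the HWSEQ trigger).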
|
||||
void pva_kmd_write_fw_dma_config(struct pva_dma_config const *dma_cfg,
|
||||
void *fw_dma_config,
|
||||
uint32_t *out_fw_fetch_size,
|
||||
bool support_hwseq_frame_linking)
|
||||
{
|
||||
struct pva_dma_config_resource *hdr;
|
||||
struct pva_fw_dma_channel *fw_channels;
|
||||
struct pva_fw_dma_descriptor *fw_descs;
|
||||
struct pva_fw_dma_slot *fw_slots, *last_slot;
|
||||
struct pva_dma_resource_map *dma_resource_map;
|
||||
uint32_t *hwseq_words;
|
||||
uintptr_t offset;
|
||||
uint32_t i;
|
||||
|
||||
hdr = fw_dma_config;
|
||||
hdr->base_channel = dma_cfg->header.base_channel;
|
||||
hdr->base_descriptor = dma_cfg->header.base_descriptor;
|
||||
hdr->base_hwseq_word = dma_cfg->header.base_hwseq_word;
|
||||
hdr->num_channels = dma_cfg->header.num_channels;
|
||||
hdr->num_descriptors = dma_cfg->header.num_descriptors;
|
||||
hdr->num_hwseq_words = dma_cfg->header.num_hwseq_words;
|
||||
hdr->vpu_exec_resource_id = dma_cfg->header.vpu_exec_resource_id;
|
||||
hdr->num_dynamic_slots = dma_cfg->header.num_dynamic_slots;
|
||||
|
||||
dma_resource_map = &hdr->dma_resource_map;
|
||||
pva_dma_resource_map_reset(dma_resource_map);
|
||||
pva_dma_resource_map_add_channels(dma_resource_map,
|
||||
dma_cfg->header.base_channel,
|
||||
dma_cfg->header.num_channels);
|
||||
pva_dma_resource_map_add_descriptors(dma_resource_map,
|
||||
dma_cfg->header.base_descriptor,
|
||||
dma_cfg->header.num_descriptors);
|
||||
pva_dma_resource_map_add_hwseq_words(dma_resource_map,
|
||||
dma_cfg->header.base_hwseq_word,
|
||||
dma_cfg->header.num_hwseq_words);
|
||||
|
||||
offset = sizeof(*hdr);
|
||||
fw_slots = pva_offset_pointer(fw_dma_config, offset);
|
||||
|
||||
if (hdr->num_dynamic_slots > 0) {
|
||||
last_slot = &fw_slots[hdr->num_dynamic_slots - 1];
|
||||
|
||||
hdr->num_relocs = safe_addu16(last_slot->reloc_start_idx,
|
||||
last_slot->reloc_count);
|
||||
/* Round up the number of relocs to satisfy the alignment requirement */
|
||||
hdr->num_relocs = safe_pow2_roundup_u16(hdr->num_relocs, 2U);
|
||||
|
||||
offset += sizeof(struct pva_fw_dma_slot) *
|
||||
hdr->num_dynamic_slots +
|
||||
sizeof(struct pva_fw_dma_reloc) * hdr->num_relocs;
|
||||
} else {
|
||||
hdr->num_relocs = 0;
|
||||
}
|
||||
|
||||
fw_channels = pva_offset_pointer(fw_dma_config, offset);
|
||||
offset += sizeof(*fw_channels) * hdr->num_channels;
|
||||
|
||||
fw_descs = pva_offset_pointer(fw_dma_config, offset);
|
||||
offset += sizeof(*fw_descs) * hdr->num_descriptors;
|
||||
|
||||
/* Do not include fields beyond descriptors as they are not fetched to
|
||||
* TCM */
|
||||
*out_fw_fetch_size = offset;
|
||||
|
||||
for (i = 0; i < hdr->num_channels; i++) {
|
||||
write_dma_channel(&dma_cfg->channels[i],
|
||||
dma_cfg->header.base_descriptor,
|
||||
&fw_channels[i], dma_resource_map,
|
||||
support_hwseq_frame_linking);
|
||||
}
|
||||
|
||||
for (i = 0; i < dma_cfg->header.num_descriptors; i++) {
|
||||
if (pva_is_reserved_desc(i)) {
|
||||
// skip over the reserved descriptor range
|
||||
i = PVA_RESERVED_DESCRIPTORS_END;
|
||||
continue;
|
||||
}
|
||||
write_dma_descriptor(&dma_cfg->descriptors[i], &fw_descs[i]);
|
||||
}
|
||||
|
||||
write_triggers(dma_cfg, fw_dma_config, dma_resource_map);
|
||||
|
||||
hwseq_words = pva_offset_pointer(fw_dma_config, offset);
|
||||
|
||||
memcpy(hwseq_words, dma_cfg->hwseq_words,
|
||||
sizeof(*hwseq_words) * hdr->num_hwseq_words);
|
||||
|
||||
/*TODO: write hdr->common_config for hwseq and MISR*/
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_EXECUTABLE_H
|
||||
#define PVA_KMD_EXECUTABLE_H
|
||||
#include "pva_kmd.h"
|
||||
#include "pva_resource.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
|
||||
struct pva_kmd_device;
|
||||
struct pva_kmd_device_memory;
|
||||
|
||||
struct pva_kmd_exec_symbol_table {
|
||||
uint32_t n_symbols;
|
||||
struct pva_symbol_info *symbols;
|
||||
};
|
||||
|
||||
static inline struct pva_symbol_info *
|
||||
pva_kmd_get_symbol(struct pva_kmd_exec_symbol_table *symbol_table,
|
||||
uint32_t symbol_id)
|
||||
{
|
||||
struct pva_symbol_info *symbol = NULL;
|
||||
uint32_t idx = symbol_id - PVA_SYMBOL_ID_BASE;
|
||||
|
||||
if (idx >= symbol_table->n_symbols) {
|
||||
pva_kmd_log_err("Symbol ID out of range\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
symbol = &symbol_table->symbols[idx];
|
||||
return symbol;
|
||||
}
|
||||
|
||||
static inline struct pva_symbol_info *
|
||||
pva_kmd_get_symbol_with_type(struct pva_kmd_exec_symbol_table *symbol_table,
|
||||
uint32_t symbol_id,
|
||||
enum pva_symbol_type symbol_type)
|
||||
{
|
||||
struct pva_symbol_info *symbol = NULL;
|
||||
|
||||
symbol = pva_kmd_get_symbol(symbol_table, symbol_id);
|
||||
if (!symbol) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if !defined(PVA_SKIP_SYMBOL_TYPE_CHECK)
|
||||
if (symbol->symbol_type != symbol_type) {
|
||||
pva_kmd_log_err("Unexpected symbol type\n");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
return symbol;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_load_executable(void *executable_data, uint32_t executable_size,
|
||||
struct pva_kmd_device *pva, uint8_t dma_smmu_id,
|
||||
struct pva_kmd_exec_symbol_table *out_symbol_table,
|
||||
struct pva_kmd_device_memory **out_metainfo,
|
||||
struct pva_kmd_device_memory **out_sections);
|
||||
|
||||
void pva_kmd_unload_executable(struct pva_kmd_exec_symbol_table *symbol_table,
|
||||
struct pva_kmd_device_memory *metainfo,
|
||||
struct pva_kmd_device_memory *sections);
|
||||
|
||||
#endif // PVA_KMD_EXECUTABLE_H
|
||||
@@ -0,0 +1,52 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_fw_debug.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_api.h"
|
||||
|
||||
void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer)
|
||||
{
|
||||
uint32_t tail = print_buffer->buffer_info->tail;
|
||||
|
||||
if (tail > print_buffer->size) {
|
||||
pva_kmd_log_err(
|
||||
"Firmware print tail is out of bounds! Refusing to print\n");
|
||||
pva_dbg_printf("Tail %u vs size %u\n", tail,
|
||||
print_buffer->size);
|
||||
return;
|
||||
}
|
||||
|
||||
while (print_buffer->head < tail) {
|
||||
uint32_t max_len = tail - print_buffer->head;
|
||||
const char *str = print_buffer->content + print_buffer->head;
|
||||
uint32_t print_size;
|
||||
|
||||
/* It must be null terminated */
|
||||
if (print_buffer->content[tail - 1] != '\0') {
|
||||
pva_kmd_log_err(
|
||||
"Firmware print is not null terminated! Refusing to print");
|
||||
}
|
||||
print_size = strnlen(str, max_len);
|
||||
pva_kmd_print_str(str);
|
||||
|
||||
/* +1 for null terminator */
|
||||
print_buffer->head += print_size + 1;
|
||||
}
|
||||
|
||||
if (print_buffer->buffer_info->flags & PVA_FW_PRINT_BUFFER_OVERFLOWED) {
|
||||
pva_kmd_log_err("Firmware print buffer overflowed!");
|
||||
}
|
||||
|
||||
if (print_buffer->buffer_info->flags & PVA_FW_PRINT_FAILURE) {
|
||||
pva_kmd_log_err("Firmware print failed!");
|
||||
}
|
||||
}
|
||||
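To illustrate the loop above with hypothetical buffer contents (not taken from the driver): if content holds the bytes "boot ok\0vpu idle\0" and buffer_info->tail is 17, the first iteration prints "boot ok" and advances head from 0 to 8 (7 characters plus the NUL), the second prints "vpu idle" and advances head to 17, at which point head equals tail and draining stops.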
@@ -0,0 +1,26 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_FW_DEBUG_H
|
||||
#define PVA_KMD_FW_DEBUG_H
|
||||
#include "pva_api.h"
|
||||
#include "pva_fw.h"
|
||||
|
||||
struct pva_kmd_fw_print_buffer {
|
||||
struct pva_fw_print_buffer_header *buffer_info;
|
||||
char const *content;
|
||||
uint32_t size;
|
||||
uint32_t head;
|
||||
};
|
||||
|
||||
void pva_kmd_drain_fw_print(struct pva_kmd_fw_print_buffer *print_buffer);
|
||||
|
||||
#endif // PVA_KMD_FW_DEBUG_H
|
||||
@@ -0,0 +1,338 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_api_cmdbuf.h"
|
||||
#include "pva_api_types.h"
|
||||
#include "pva_bit.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd_cmdbuf.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_kmd_fw_profiler.h"
|
||||
|
||||
// TODO: This is here temporarily just for testing. Should be moved to a common header
|
||||
#define CMD_ID(x) PVA_EXTRACT(x, 6, 0, uint8_t)
|
||||
#define CMD(name) [CMD_ID(PVA_CMD_OPCODE_##name)] = #name
|
||||
|
||||
static const char *cmd_names[PVA_CMD_OPCODE_COUNT] = {
|
||||
CMD(LINK_CHUNK),
|
||||
CMD(BARRIER),
|
||||
CMD(ACQUIRE_ENGINE),
|
||||
CMD(RELEASE_ENGINE),
|
||||
CMD(SET_CURRENT_ENGINE),
|
||||
CMD(CLEAR_VMEM),
|
||||
CMD(BIND_L2SRAM),
|
||||
CMD(RELEASE_L2SRAM),
|
||||
CMD(INVALIDATE_L2SRAM),
|
||||
CMD(FLUSH_L2SRAM),
|
||||
CMD(PATCH_L2SRAM_OFFSET),
|
||||
CMD(SET_VPU_EXECUTABLE),
|
||||
CMD(INIT_VPU_EXECUTABLE),
|
||||
CMD(PREFETCH_VPU_CODE),
|
||||
CMD(SET_VPU_PARAMETER),
|
||||
CMD(SET_VPU_PARAMETER_WITH_ADDRESS),
|
||||
CMD(SET_VPU_INSTANCE_PARAMETER),
|
||||
CMD(SET_VPU_PARAMETER_WITH_BUFFER),
|
||||
CMD(RUN_VPU),
|
||||
CMD(SET_PPE_EXECUTABLE),
|
||||
CMD(INIT_PPE_EXECUTABLE),
|
||||
CMD(PREFETCH_PPE_CODE),
|
||||
CMD(RUN_PPE),
|
||||
CMD(FETCH_DMA_CONFIGURATION),
|
||||
CMD(SETUP_DMA),
|
||||
CMD(RUN_DMA),
|
||||
CMD(BIND_DRAM_SLOT),
|
||||
CMD(BIND_VMEM_SLOT),
|
||||
CMD(UNREGISTER_RESOURCE),
|
||||
CMD(WRITE_DRAM),
|
||||
CMD(CAPTURE_TIMESTAMP),
|
||||
CMD(RUN_UNIT_TESTS)
|
||||
};
|
||||
|
||||
static const char *priv_cmd_names[PVA_CMD_PRIV_OPCODE_COUNT] = {
|
||||
CMD(INIT_RESOURCE_TABLE),
|
||||
CMD(DEINIT_RESOURCE_TABLE),
|
||||
CMD(UPDATE_RESOURCE_TABLE),
|
||||
CMD(INIT_QUEUE),
|
||||
CMD(DEINIT_QUEUE),
|
||||
CMD(ENABLE_FW_PROFILING),
|
||||
CMD(DISABLE_FW_PROFILING),
|
||||
CMD(SUSPEND_FW),
|
||||
CMD(RESUME_FW)
|
||||
};
|
||||
|
||||
static inline const char *pva_fw_get_cmd_name(uint32_t opcode)
|
||||
{
|
||||
uint32_t cmd_id;
|
||||
const char *name;
|
||||
|
||||
cmd_id = CMD_ID(opcode);
|
||||
|
||||
if (opcode & PVA_CMD_PRIV_OPCODE_FLAG) {
|
||||
if (cmd_id >= PVA_CMD_PRIV_OPCODE_COUNT) {
|
||||
return "INVALID";
|
||||
}
|
||||
name = priv_cmd_names[cmd_id];
|
||||
} else {
|
||||
if (cmd_id >= PVA_CMD_OPCODE_COUNT) {
|
||||
return "INVALID";
|
||||
}
|
||||
name = cmd_names[cmd_id];
|
||||
}
|
||||
|
||||
if (name == NULL) {
|
||||
return "UNKNOWN";
|
||||
} else {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_device_init_profiler(struct pva_kmd_device *pva)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
const uint32_t profiling_buffer_size = PVA_KMD_FW_PROFILING_BUFFER_SIZE;
|
||||
|
||||
struct pva_kmd_fw_profiling_buffer *fw_profiling_buffer =
|
||||
&pva->fw_profiling_buffer;
|
||||
|
||||
// Event message should be 32-bit to keep logging latency low
|
||||
ASSERT(sizeof(struct pva_fw_event_message) == sizeof(uint32_t));
|
||||
|
||||
pva->fw_profiling_buffer_memory =
|
||||
pva_kmd_device_memory_alloc_map(profiling_buffer_size, pva,
|
||||
PVA_ACCESS_RW,
|
||||
PVA_R5_SMMU_CONTEXT_ID);
|
||||
ASSERT(pva->fw_profiling_buffer_memory != NULL);
|
||||
|
||||
/* Add profiling memory to resource table */
|
||||
err = pva_kmd_add_dram_buffer_resource(
|
||||
&pva->dev_resource_table, pva->fw_profiling_buffer_memory,
|
||||
&pva->fw_profiling_buffer_resource_id);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_update_fw_resource_table(&pva->dev_resource_table);
|
||||
|
||||
fw_profiling_buffer->buffer_info =
|
||||
(struct pva_fw_profiling_buffer_header *)
|
||||
pva->fw_profiling_buffer_memory->va;
|
||||
fw_profiling_buffer->content =
|
||||
pva_offset_pointer(pva->fw_profiling_buffer_memory->va,
|
||||
sizeof(*fw_profiling_buffer->buffer_info));
|
||||
fw_profiling_buffer->size = pva->fw_profiling_buffer_memory->size;
|
||||
fw_profiling_buffer->head = 0U;
|
||||
fw_profiling_buffer->buffer_info->flags = 0U;
|
||||
fw_profiling_buffer->buffer_info->tail = 0U;
|
||||
|
||||
pva->debugfs_context.g_fw_profiling_config.enabled = false;
|
||||
pva->debugfs_context.g_fw_profiling_config.filter = 0x0;
|
||||
}
|
||||
|
||||
void pva_kmd_device_deinit_profiler(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_drop_resource(&pva->dev_resource_table,
|
||||
pva->fw_profiling_buffer_resource_id);
|
||||
pva->debugfs_context.g_fw_profiling_config.enabled = false;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
|
||||
struct pva_cmd_enable_fw_profiling *cmd;
|
||||
uint64_t buffer_offset = 0U;
|
||||
uint32_t filter = 0U;
|
||||
uint8_t timestamp_type = TIMESTAMP_TYPE_CYCLE_COUNT;
|
||||
uint32_t fence_val;
|
||||
enum pva_error err;
|
||||
|
||||
// filter |= PVA_FW_EVENT_DO_CMD;
|
||||
filter |= PVA_FW_EVENT_RUN_VPU;
|
||||
|
||||
if (pva->debugfs_context.g_fw_profiling_config.enabled) {
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
pva->fw_profiling_buffer.head = 0U;
|
||||
pva->fw_profiling_buffer.buffer_info->flags = 0U;
|
||||
pva->fw_profiling_buffer.buffer_info->tail = 0U;
|
||||
|
||||
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
|
||||
ASSERT(cmd != NULL);
|
||||
pva_kmd_set_cmd_enable_fw_profiling(
|
||||
cmd, pva->fw_profiling_buffer_resource_id,
|
||||
pva->fw_profiling_buffer.size, buffer_offset, filter,
|
||||
timestamp_type);
|
||||
|
||||
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"Waiting for FW timed out when initializing context");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
pva->debugfs_context.g_fw_profiling_config.enabled = true;
|
||||
pva->debugfs_context.g_fw_profiling_config.filter = filter;
|
||||
pva->debugfs_context.g_fw_profiling_config.timestamp_type =
|
||||
timestamp_type;
|
||||
pva->debugfs_context.g_fw_profiling_config.timestamp_size =
|
||||
(pva->debugfs_context.g_fw_profiling_config.timestamp_type ==
|
||||
TIMESTAMP_TYPE_TSE) ?
|
||||
8 :
|
||||
4;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
|
||||
struct pva_cmd_disable_fw_profiling *cmd;
|
||||
uint32_t fence_val;
|
||||
enum pva_error err;
|
||||
|
||||
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
|
||||
ASSERT(cmd != NULL);
|
||||
pva_kmd_set_cmd_disable_fw_profiling(cmd);
|
||||
|
||||
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"Waiting for FW timed out when initializing context");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
pva->debugfs_context.g_fw_profiling_config.enabled = false;
|
||||
pva->debugfs_context.g_fw_profiling_config.filter = 0x0;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void decode_and_print_event(unsigned long walltime,
|
||||
unsigned long relative_time,
|
||||
struct pva_fw_event_message message,
|
||||
char *msg_string)
|
||||
{
|
||||
switch (PVA_BIT(message.event)) {
|
||||
case PVA_FW_EVENT_DO_CMD: {
|
||||
sprintf(msg_string,
|
||||
"pva_fw@%lu: [%8lu] event=%-12s type=%-7s slot=%u idx=%-5u opcode=%s",
|
||||
walltime, relative_time, "DO_CMD",
|
||||
event_type_to_string(message.type), message.arg2,
|
||||
message.arg3, pva_fw_get_cmd_name(message.arg1));
|
||||
} break;
|
||||
case PVA_FW_EVENT_SCAN_QUEUES: {
|
||||
sprintf(msg_string,
|
||||
"pva_fw@%lu: [%8lu] event=%-12s type=%-7s found=%u ccq_id=%-5u queue_id=%u",
|
||||
walltime, relative_time, "SCAN_QUEUES",
|
||||
event_type_to_string(message.type), message.arg1,
|
||||
message.arg2, message.arg3);
|
||||
} break;
|
||||
case PVA_FW_EVENT_SCAN_SLOTS: {
|
||||
sprintf(msg_string,
|
||||
"pva_fw@%lu: [%8lu] event=%-12s type=%-7s state=%u slot=%u",
|
||||
walltime, relative_time, "SCAN_SLOTS",
|
||||
event_type_to_string(message.type), message.arg1,
|
||||
message.arg2);
|
||||
} break;
|
||||
case PVA_FW_EVENT_RUN_VPU: {
|
||||
sprintf(msg_string,
|
||||
"pva_fw@%lu: [%8lu] event=%-12s type=%-7s slot=%u idx=%-5u opcode=%s",
|
||||
walltime, relative_time, "RUN_VPU",
|
||||
event_type_to_string(message.type), message.arg2,
|
||||
message.arg3, pva_fw_get_cmd_name(message.arg1));
|
||||
} break;
|
||||
default:
|
||||
pva_dbg_printf("Unknown event type\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_drain_fw_profiling_buffer(
|
||||
struct pva_kmd_device *pva,
|
||||
struct pva_kmd_fw_profiling_buffer *profiling_buffer)
|
||||
{
|
||||
char msg_string[200] = { '\0' };
|
||||
struct pva_fw_event_message message;
|
||||
uint64_t prev_walltime = 0U;
|
||||
uint64_t timestamp = 0U;
|
||||
uint64_t relative_time = 0U;
|
||||
uint32_t buffer_space;
|
||||
|
||||
// TODO: R5 frequency is hard-coded for now. Get this at runtime.
|
||||
static const uint32_t r5_freq = 716800000U;
|
||||
static const unsigned long r5_cycle_duration = 1000000000000 / r5_freq;
|
||||
unsigned long walltime = 0U; // in nanoseconds
|
||||
uint64_t walltime_diff;
|
||||
|
||||
const uint32_t message_size =
|
||||
sizeof(message) +
|
||||
pva->debugfs_context.g_fw_profiling_config.timestamp_size;
|
||||
uint32_t *profiling_buffer_head = &profiling_buffer->head;
|
||||
uint32_t profiling_buffer_tail = profiling_buffer->buffer_info->tail;
|
||||
while (*profiling_buffer_head < profiling_buffer_tail) {
|
||||
buffer_space = safe_addu32(*profiling_buffer_head,
|
||||
safe_subu32(message_size, 1U));
|
||||
ASSERT(buffer_space <= profiling_buffer_tail);
|
||||
memcpy(&message,
|
||||
&profiling_buffer->content[*profiling_buffer_head],
|
||||
sizeof(message));
|
||||
memcpy(×tamp,
|
||||
&profiling_buffer->content[*profiling_buffer_head +
|
||||
sizeof(message)],
|
||||
pva->debugfs_context.g_fw_profiling_config
|
||||
.timestamp_size);
|
||||
|
||||
if (pva->debugfs_context.g_fw_profiling_config.timestamp_type ==
|
||||
TIMESTAMP_TYPE_TSE) {
|
||||
walltime = (timestamp << 5);
|
||||
} else if (pva->debugfs_context.g_fw_profiling_config
|
||||
.timestamp_type ==
|
||||
TIMESTAMP_TYPE_CYCLE_COUNT) {
|
||||
timestamp = PVA_LOW32(timestamp);
|
||||
walltime = (r5_cycle_duration * timestamp) / 1000U;
|
||||
}
|
||||
walltime_diff = safe_subu64((uint64_t)walltime, prev_walltime);
|
||||
relative_time = (prev_walltime == 0U) ? 0U : walltime_diff;
|
||||
decode_and_print_event(walltime, relative_time, message,
|
||||
&msg_string[0]);
|
||||
pva_kmd_print_str(msg_string);
|
||||
*profiling_buffer_head = *profiling_buffer_head + message_size;
|
||||
prev_walltime = walltime;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
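As a rough worked example of the cycle-count conversion above (illustrative, using the hard-coded 716.8 MHz value): r5_cycle_duration = 10^12 / 716800000 = 1395 ps per cycle after integer division, so a 32-bit cycle count of 1,000,000 yields walltime = (1395 * 1000000) / 1000 = 1,395,000 ns, i.e. about 1.4 ms.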
@@ -0,0 +1,41 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_FW_PROFILER_H
|
||||
#define PVA_KMD_FW_PROFILER_H
|
||||
#include "pva_kmd_device.h"
|
||||
|
||||
struct pva_kmd_fw_profiling_buffer {
|
||||
#define PVA_KMD_FW_PROFILING_BUFFER_SIZE (512 * 1024)
|
||||
struct pva_fw_profiling_buffer_header *buffer_info;
|
||||
char const *content;
|
||||
uint32_t size;
|
||||
uint32_t head;
|
||||
};
|
||||
|
||||
struct pva_kmd_fw_profiling_config {
|
||||
uint32_t filter;
|
||||
enum pva_fw_timestamp_t timestamp_type;
|
||||
uint8_t timestamp_size;
|
||||
uint8_t enabled;
|
||||
};
|
||||
|
||||
void pva_kmd_device_init_profiler(struct pva_kmd_device *pva);
|
||||
|
||||
void pva_kmd_device_deinit_profiler(struct pva_kmd_device *pva);
|
||||
|
||||
void pva_kmd_drain_fw_profiling_buffer(
|
||||
struct pva_kmd_device *pva,
|
||||
struct pva_kmd_fw_profiling_buffer *profiling_buffer);
|
||||
|
||||
enum pva_error pva_kmd_notify_fw_enable_profiling(struct pva_kmd_device *pva);
|
||||
|
||||
enum pva_error pva_kmd_notify_fw_disable_profiling(struct pva_kmd_device *pva);
|
||||
#endif
|
||||
1608 drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_hwseq_validate.c (new file; diff suppressed because it is too large)
@@ -0,0 +1,336 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_HWSEQ_VALIDATE_H
|
||||
#define PVA_KMD_HWSEQ_VALIDATE_H
|
||||
|
||||
#include "pva_api_dma.h"
|
||||
#include "pva_kmd_device.h"
|
||||
|
||||
#define PVA_HWSEQ_RRA_MAX_NOCR 31U
|
||||
#define PVA_HWSEQ_RRA_MAX_FRAME_COUNT 63U
|
||||
|
||||
/**
|
||||
* List of valid Addressing Modes in HW Sequencer Header
|
||||
*/
|
||||
enum pva_dma_hwseq_fid {
|
||||
PVA_DMA_HWSEQ_RRA_MODE = 0xC0DA, /*!< RRA addressing */
|
||||
PVA_DMA_HWSEQ_FRAME_MODE = 0xC0DE, /*!< frame addressing */
|
||||
PVA_DMA_HWSEQ_DESC_MODE = 0xDEAD /*!< descriptor addressing */
|
||||
};
|
||||
|
||||
/**
|
||||
* Combine three headers common in HW Sequencer
|
||||
*
|
||||
* ----------------------------------------------------------------------------
|
||||
* | | byte 3 | byte 2 | byte 1 | byte 0 |
|
||||
* |--------|---------------|--------------|-----------------|----------------|
|
||||
* | Head 1 | NOCR | FR | FID1 | FID0 |
|
||||
* | Head 2 | FO in LP 15:8 | FO in LP 7:0 | TO in P/LP 15:8 | TO in P/LP 7:0 |
|
||||
* | Head 3 | padB | padL | padT | padR |
|
||||
* ----------------------------------------------------------------------------
|
||||
**/
|
||||
struct pva_dma_hwseq_hdr {
|
||||
//hdr_1
|
||||
uint16_t fid; /*!< addressing type: frame or descriptor */
|
||||
uint8_t fr; /*!< frame repetition factor */
|
||||
uint8_t nocr; /*!< number of descriptor column/row */
|
||||
//hdr_2
|
||||
int16_t to; /*!< tile offset in pixel/Line Pitch */
|
||||
int16_t fo; /*!< frame offset in Line Pitch */
|
||||
//hdr_3
|
||||
uint8_t padr; /*!< pad right */
|
||||
uint8_t padt; /*!< pad top */
|
||||
uint8_t padl; /*!< pad left */
|
||||
uint8_t padb; /*!< pad bottom */
|
||||
};
|
||||
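A minimal standalone sketch of how the three header words described above could be unpacked into the fields of this struct, assuming each word is read as a little-endian 32-bit value so that byte 0 is the least significant byte; this is illustrative only, not the driver's parser:

#include <stdint.h>

struct demo_hwseq_hdr {
	uint16_t fid;
	uint8_t fr;
	uint8_t nocr;
	int16_t to;
	int16_t fo;
	uint8_t padr, padt, padl, padb;
};

/* w1..w3 are the three 32-bit header words from the table above. */
static void demo_unpack_hwseq_hdr(uint32_t w1, uint32_t w2, uint32_t w3,
				  struct demo_hwseq_hdr *h)
{
	h->fid  = (uint16_t)(w1 & 0xFFFFU);        /* bytes 0-1: FID0, FID1 */
	h->fr   = (uint8_t)((w1 >> 16) & 0xFFU);   /* byte 2: frame repetition */
	h->nocr = (uint8_t)((w1 >> 24) & 0xFFU);   /* byte 3: number of col/row */
	h->to   = (int16_t)(w2 & 0xFFFFU);         /* tile offset in pixel/line pitch */
	h->fo   = (int16_t)((w2 >> 16) & 0xFFFFU); /* frame offset in line pitch */
	h->padr = (uint8_t)(w3 & 0xFFU);
	h->padt = (uint8_t)((w3 >> 8) & 0xFFU);
	h->padl = (uint8_t)((w3 >> 16) & 0xFFU);
	h->padb = (uint8_t)((w3 >> 24) & 0xFFU);
}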
|
||||
/**
|
||||
* A struct which represents Column/Row Header in HW Sequencer
|
||||
*/
|
||||
struct pva_dma_hwseq_colrow_hdr {
|
||||
uint8_t dec; /*!< descriptor entry count */
|
||||
uint8_t crr; /*!< col/row repetition factor */
|
||||
int16_t cro; /*!< col/row ofst in pixel/line pitch */
|
||||
};
|
||||
|
||||
/**
|
||||
* A struct which represents a DMA Descriptor Header in HW Sequencer
|
||||
*/
|
||||
struct pva_dma_hwseq_desc_entry {
|
||||
uint8_t did; /*!< desc id */
|
||||
uint8_t dr; /*!< desc repetition */
|
||||
};
|
||||
|
||||
/**
|
||||
* A struct which represents a Column/Row Header Entry in HW Sequencer
|
||||
*/
|
||||
struct pva_dma_hwseq_colrow_entry_hdr {
|
||||
struct pva_dma_hwseq_colrow_hdr hdr; /*!< Col/Row Header */
|
||||
};
|
||||
|
||||
/**
|
||||
* A struct representing Grid Information
|
||||
*/
|
||||
struct pva_hwseq_grid_info {
|
||||
/**
|
||||
* tile co-ordinates
|
||||
* In Raster Mode:
|
||||
* - tile_x[0] = Tile width of the first tile in HW Seq DMA Transfer
|
||||
* - tile_x[1] = Tile width of the last tile in HW Seq DMA Transfer
|
||||
* In Vertical Mining Mode:
|
||||
* - tile_x[0] = Tile height of the first tile in HW Seq DMA Transfer
|
||||
* - tile_x[1] = Tile height of the last tile in HW Seq DMA Transfer
|
||||
*/
|
||||
int32_t tile_x[2];
|
||||
/**
|
||||
* tile co-ordinates
|
||||
* In Raster Mode:
|
||||
* - tile_y[0] = Tile height of the first tile in HW Seq DMA Transfer
|
||||
* - tile_y[1] = Tile height of the last tile in HW Seq DMA Transfer
|
||||
* In Vertical Mining Mode:
|
||||
* - tile_y[0] = Tile width of the first tile in HW Seq DMA Transfer
|
||||
* - tile_y[1] = Tile width of the last tile in HW Seq DMA Transfer
|
||||
*/
|
||||
int32_t tile_y[2];
|
||||
/**
|
||||
* tile co-ordinates
|
||||
* In Tensor Data Flow Mode:
|
||||
*/
|
||||
int32_t tile_z;
|
||||
/**
|
||||
* Padding values
|
||||
* In Raster Mode:
|
||||
* - pad_x[0] = Left Padding
|
||||
* - pad_x[1] = Right Padding
|
||||
* In Vertical Mining Mode:
|
||||
* - pad_x[0] = Top Padding
|
||||
* - pad_x[1] = Bottom Padding
|
||||
*/
|
||||
int32_t pad_x[2];
|
||||
/**
|
||||
* Padding values
|
||||
* In Raster Mode:
|
||||
* - pad_y[0] = Top Padding
|
||||
* - pad_y[1] = Bottom Padding
|
||||
* In Vertical Mining Mode:
|
||||
* - pad_y[0] = Left Padding
|
||||
* - pad_y[1] = Right Padding
|
||||
*/
|
||||
int32_t pad_y[2];
|
||||
/**
|
||||
* Tiles per packet. Grid size in X dimension
|
||||
*/
|
||||
uint32_t grid_size_x;
|
||||
/**
|
||||
* Repeat Count
|
||||
*/
|
||||
uint32_t grid_size_y;
|
||||
/**
|
||||
* Grid Size in Z dimension for Tensor Data Flow
|
||||
*/
|
||||
uint32_t grid_size_z;
|
||||
/**
|
||||
* Tile Offset as specified in the HW Sequencer Header
|
||||
*/
|
||||
int32_t grid_step_x;
|
||||
/**
|
||||
* Col/Row Offset as specified in the HW Sequencer Col/Row Header
|
||||
*/
|
||||
int32_t grid_step_y;
|
||||
/**
|
||||
* Repetition factor for Head Descriptor in HW Sequencer Blob
|
||||
*/
|
||||
uint32_t head_tile_count;
|
||||
/**
|
||||
* Boolean value to indicate if HW Sequencer has split padding
|
||||
*/
|
||||
bool is_split_padding;
|
||||
};
|
||||
|
||||
/**
|
||||
* A struct representing a valid Frame Information
|
||||
*/
|
||||
struct pva_hwseq_frame_info {
|
||||
/**
|
||||
* X co-ordinate of start of Frame
|
||||
*/
|
||||
int64_t start_x;
|
||||
/**
|
||||
* Y co-ordinate of start of Frame
|
||||
*/
|
||||
int64_t start_y;
|
||||
/**
|
||||
* Z co-ordinate of start of Frame
|
||||
*/
|
||||
int64_t start_z;
|
||||
/**
|
||||
* X co-ordinate of end of Frame
|
||||
*/
|
||||
int64_t end_x;
|
||||
/**
|
||||
* Y co-ordinate of end of Frame
|
||||
*/
|
||||
int64_t end_y;
|
||||
/**
|
||||
* Z co-ordinate of end of Frame
|
||||
*/
|
||||
int64_t end_z;
|
||||
};
|
||||
|
||||
/**
|
||||
* Struct which holds the HW Sequencer Buffer as received from User Space
|
||||
*/
|
||||
struct pva_hwseq_buffer {
|
||||
/**
|
||||
* Pointer to HW Sequencer Blob in Buffer
|
||||
*/
|
||||
const uint8_t *data;
|
||||
/**
|
||||
* Number of bytes left to be read from the data buffer
|
||||
*/
|
||||
uint32_t bytes_left;
|
||||
};
|
||||
|
||||
/**
|
||||
* @struct hw_seq_blob_entry
|
||||
* @brief Structure to hold information about a hardware sequence blob entry.
|
||||
*
|
||||
* This structure is used to store the details of a DMA channel and the range of hardware sequencer
|
||||
* associated with it, along with the number of frames involved.
|
||||
*/
|
||||
struct hw_seq_blob_entry {
|
||||
/**
|
||||
* Pointer to a const \ref pva_dma_channel which holds the current DMA Channel Information
|
||||
* in which current HW Sequencer Blob is present
|
||||
*/
|
||||
struct pva_dma_channel const *ch;
|
||||
/**
|
||||
* The starting index of the hardware sequencer.
|
||||
*/
|
||||
uint16_t hwseq_start;
|
||||
/**
|
||||
* The ending index of the hardware sequencer.
|
||||
*/
|
||||
uint16_t hwseq_end;
|
||||
/**
|
||||
* The number of frames associated with the hardware sequencer.
|
||||
*/
|
||||
uint32_t num_frames;
|
||||
};
|
||||
|
||||
/**
|
||||
* TODO: Separate out pva_hwseq_priv to be more modular
|
||||
*
|
||||
* Items in pva_hwseq_main
|
||||
* - dma_config
|
||||
* - hw_gen
|
||||
* - blob
|
||||
* - num_hwseq_words
|
||||
* Items per segment of main i.e. pva_hwseq_segment
|
||||
* - hwseq_start, hwseq_end
|
||||
* - channel id
|
||||
* - hwseq_header,
|
||||
* - desc_count
|
||||
* - num_frames
|
||||
* - head_desc, tail_desc
|
||||
* - is_split_padding
|
||||
* - is_raster_scan
|
||||
*/
|
||||
|
||||
/**
|
||||
* A struct holding private data to HW Sequencer Blob being parsed
|
||||
*/
|
||||
struct pva_hwseq_priv {
|
||||
/**
|
||||
* Number of descriptors in the HW Sequencer Blob
|
||||
*/
|
||||
uint32_t desc_count;
|
||||
/**
|
||||
* Number of tiles in the packet
|
||||
* This is the sum total of descriptor repetition factors
|
||||
* present in the HW Sequencer Blob
|
||||
*/
|
||||
uint32_t tiles_per_packet;
|
||||
int32_t max_tx;
|
||||
int32_t max_ty;
|
||||
|
||||
/**
|
||||
* Struct that holds the entry info of HW Sequencer Blob
|
||||
*/
|
||||
struct hw_seq_blob_entry entry;
|
||||
|
||||
/**
|
||||
* Struct that holds HW Sequencer Blob to be read
|
||||
*/
|
||||
struct pva_hwseq_buffer blob;
|
||||
|
||||
/**
|
||||
* Boolean to indicate if split padding is present in the HW Sequencer Blob
|
||||
*/
|
||||
bool is_split_padding;
|
||||
/**
|
||||
* Bool to indicate if HW Sequencer uses raster scan or Vertical mining
|
||||
* TRUE: Raster Scan
|
||||
* FALSE: Vertical Mining
|
||||
*/
|
||||
bool is_raster_scan;
|
||||
|
||||
/**
|
||||
* @brief Indicates the generation of PVA HW.
|
||||
* Allowed values: 0 (GEN 1), 1 (GEN 2), 2 (GEN 3)
|
||||
*/
|
||||
enum pva_hw_gen hw_gen;
|
||||
|
||||
/**
|
||||
* @brief Pointer to the DMA configuration header.
|
||||
*/
|
||||
const struct pva_dma_config *dma_config;
|
||||
|
||||
/**
|
||||
* Pointer to \ref pva_dma_hwseq_hdr_t which holds the HW Sequencer Header
|
||||
*/
|
||||
const struct pva_dma_hwseq_hdr *hdr;
|
||||
/**
|
||||
* Pointer to \ref pva_dma_hwseq_colrow_hdr_t which holds the Header of the
|
||||
* Col/Row inside HW Sequencer
|
||||
*/
|
||||
const struct pva_dma_hwseq_colrow_hdr *colrow;
|
||||
|
||||
/**
|
||||
* Pointer to the Head Descriptor of type \ref pva_dma_descriptor in the HW Sequencer
|
||||
*/
|
||||
struct pva_dma_descriptor *head_desc;
|
||||
/**
|
||||
* Pointer to the Tail Descriptor of type \ref pva_dma_descriptor in the HW Sequencer
|
||||
*/
|
||||
struct pva_dma_descriptor *tail_desc;
|
||||
/**
|
||||
* DMA Descriptor information obtained from HW Sequencer Blob of type
|
||||
* \ref pva_dma_hwseq_desc_entry_t
|
||||
*/
|
||||
struct pva_dma_hwseq_desc_entry dma_descs[2];
|
||||
/**
|
||||
* Access Sizes are calculated and stored here from HW Sequencer Blob
|
||||
*/
|
||||
struct pva_kmd_dma_access *access_sizes;
|
||||
};
|
||||
|
||||
struct pva_hwseq_per_frame_info {
|
||||
uint32_t seq_tile_count;
|
||||
uint32_t vmem_tiles_per_frame;
|
||||
};
|
||||
|
||||
enum pva_error validate_hwseq(struct pva_dma_config const *dma_config,
|
||||
struct pva_kmd_hw_constants const *hw_consts,
|
||||
struct pva_kmd_dma_access *access_sizes,
|
||||
uint64_t *hw_dma_descs_mask);
|
||||
|
||||
#endif
|
||||
98 drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_msg.c (new file)
@@ -0,0 +1,98 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_msg.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_kmd_thread_sema.h"
|
||||
#include "pva_kmd_fw_debug.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_context.h"
|
||||
|
||||
static uint8_t get_msg_type(uint32_t hdr)
|
||||
{
|
||||
return PVA_EXTRACT(hdr, PVA_FW_MSG_TYPE_MSB, PVA_FW_MSG_TYPE_LSB,
|
||||
uint32_t);
|
||||
}
|
||||
|
||||
void pva_kmd_handle_hyp_msg(void *pva_dev, uint32_t const *data, uint8_t len)
|
||||
{
|
||||
struct pva_kmd_device *pva = pva_dev;
|
||||
uint8_t type = get_msg_type(data[0]);
|
||||
uint8_t updated_len = safe_subu8(len, 1U);
|
||||
uint8_t size = safe_mulu8((uint8_t)sizeof(uint32_t), updated_len);
|
||||
|
||||
switch (type) {
|
||||
case PVA_FW_MSG_TYPE_BOOT_DONE: {
|
||||
uint64_t r5_start_time =
|
||||
pack64(data[PVA_FW_MSG_R5_START_TIME_HI_IDX],
|
||||
data[PVA_FW_MSG_R5_START_TIME_LO_IDX]);
|
||||
uint64_t r5_ready_time =
|
||||
pack64(data[PVA_FW_MSG_R5_READY_TIME_HI_IDX],
|
||||
data[PVA_FW_MSG_R5_READY_TIME_LO_IDX]);
|
||||
|
||||
pva_kmd_log_err("Firmware boot completes");
|
||||
pva_kmd_log_err_u64("R5 start time (us)",
|
||||
tsc_to_us(r5_start_time));
|
||||
pva_kmd_log_err_u64("R5 ready time (us)",
|
||||
tsc_to_us(r5_ready_time));
|
||||
|
||||
pva_kmd_sema_post(&pva->fw_boot_sema);
|
||||
} break;
|
||||
case PVA_FW_MSG_TYPE_ABORT: {
|
||||
char abort_msg[PVA_FW_MSG_ABORT_STR_MAX_LEN + 1];
|
||||
|
||||
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
|
||||
|
||||
pva_kmd_log_err("Firmware aborted! The abort message is: ");
|
||||
abort_msg[0] = PVA_EXTRACT(data[0], 7, 0, uint32_t);
|
||||
abort_msg[1] = PVA_EXTRACT(data[0], 15, 8, uint32_t);
|
||||
memcpy(abort_msg + 2, &data[1], size);
|
||||
abort_msg[PVA_FW_MSG_ABORT_STR_MAX_LEN] = '\0';
|
||||
pva_kmd_log_err(abort_msg);
|
||||
} break;
|
||||
case PVA_FW_MSG_TYPE_FLUSH_PRINT:
|
||||
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
|
||||
break;
|
||||
|
||||
default:
|
||||
FAULT("Unknown message type from firmware");
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_handle_msg(void *pva_dev, uint32_t const *data, uint8_t len)
|
||||
{
|
||||
struct pva_kmd_device *pva = pva_dev;
|
||||
|
||||
uint8_t type = get_msg_type(data[0]);
|
||||
switch (type) {
|
||||
case PVA_FW_MSG_TYPE_RESOURCE_UNREGISTER: {
|
||||
uint8_t table_id =
|
||||
PVA_EXTRACT(data[0], PVA_FW_MSG_RESOURCE_TABLE_ID_MSB,
|
||||
PVA_FW_MSG_RESOURCE_TABLE_ID_LSB, uint8_t);
|
||||
/* Resource table ID equals context id */
|
||||
struct pva_kmd_context *ctx =
|
||||
pva_kmd_get_context(pva, table_id);
|
||||
uint32_t i;
|
||||
|
||||
pva_kmd_mutex_lock(&ctx->resource_table_lock);
|
||||
for (i = 1; i < len; i++) {
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table,
|
||||
data[i]);
|
||||
}
|
||||
pva_kmd_mutex_unlock(&ctx->resource_table_lock);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
FAULT("Unexpected CCQ msg type from FW");
|
||||
break;
|
||||
}
|
||||
}
|
||||
26 drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_msg.h (new file)
@@ -0,0 +1,26 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_api.h"
|
||||
|
||||
/**
|
||||
* @brief Handle messages from FW to hypervisor.
|
||||
*
|
||||
* This is just a provision for future hypervisor support. For now, this just
|
||||
* handles all messages from mailboxes.
|
||||
*/
|
||||
void pva_kmd_handle_hyp_msg(void *pva_dev, uint32_t const *data, uint8_t len);
|
||||
|
||||
/**
|
||||
* @brief Handle messages from FW to KMD.
|
||||
*
|
||||
* These messages come from the CCQ0 status registers.
|
||||
*/
|
||||
void pva_kmd_handle_msg(void *pva_dev, uint32_t const *data, uint8_t len);
|
||||
35 drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_mutex.h (new file)
@@ -0,0 +1,35 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_MUTEX_H
|
||||
#define PVA_KMD_MUTEX_H
|
||||
|
||||
#include "pva_api.h"
|
||||
|
||||
#if defined(__KERNEL__) /* For Linux */
|
||||
|
||||
#include <linux/mutex.h>
|
||||
typedef struct mutex pva_kmd_mutex_t;
|
||||
|
||||
#else /* For user space code, including QNX KMD */
|
||||
|
||||
#include <pthread.h>
|
||||
/* Mutex */
|
||||
typedef pthread_mutex_t pva_kmd_mutex_t;
|
||||
|
||||
#endif
|
||||
|
||||
enum pva_error pva_kmd_mutex_init(pva_kmd_mutex_t *m);
|
||||
void pva_kmd_mutex_lock(pva_kmd_mutex_t *m);
|
||||
void pva_kmd_mutex_unlock(pva_kmd_mutex_t *m);
|
||||
void pva_kmd_mutex_deinit(pva_kmd_mutex_t *m);
|
||||
|
||||
#endif // PVA_KMD_MUTEX_H
|
||||
814 drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_op_handler.c (new file)
@@ -0,0 +1,814 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_op_handler.h"
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_cmdbuf.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_cmdbuf.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd_vpu_app_auth.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
struct pva_kmd_buffer {
|
||||
void const *base;
|
||||
uint32_t offset;
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
/* Offset will always be a multiple of 8 bytes */
|
||||
static void incr_offset(struct pva_kmd_buffer *buf, uint32_t incr)
|
||||
{
|
||||
buf->offset = safe_addu32(buf->offset, incr);
|
||||
buf->offset =
|
||||
safe_pow2_roundup_u32(buf->offset, (uint32_t)sizeof(uint64_t));
|
||||
}
|
||||
|
||||
static bool access_ok(struct pva_kmd_buffer const *buf, uint32_t size)
|
||||
{
|
||||
return safe_addu32(buf->offset, size) <= buf->size;
|
||||
}
|
||||
|
||||
static void *read_data(struct pva_kmd_buffer *buf, uint32_t size)
|
||||
{
|
||||
void *data = (void *)((uint8_t *)buf->base + buf->offset);
|
||||
incr_offset(buf, size);
|
||||
return data;
|
||||
}
|
||||
|
||||
static void write_data(struct pva_kmd_buffer *buf, void const *data,
|
||||
uint32_t size)
|
||||
{
|
||||
memcpy((uint8_t *)buf->base + buf->offset, data, size);
|
||||
incr_offset(buf, size);
|
||||
}
|
||||
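For clarity, a short standalone sketch of the 8-byte rounding that incr_offset() applies to the buffer cursor (illustrative; a plain round-up expression stands in for safe_pow2_roundup_u32):

#include <stdint.h>
#include <stdio.h>

/* Round x up to the next multiple of the power-of-two 'align'. */
static uint32_t demo_roundup_pow2(uint32_t x, uint32_t align)
{
	return (x + (align - 1U)) & ~(align - 1U);
}

int main(void)
{
	uint32_t offset = 0U;

	/* Reading a 12-byte argument struct advances the cursor to 16,
	 * so the next op header stays 8-byte aligned. */
	offset = demo_roundup_pow2(offset + 12U, (uint32_t)sizeof(uint64_t));
	printf("%u\n", offset); /* prints 16 */
	return 0;
}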
|
||||
static enum pva_error
|
||||
pva_kmd_op_memory_register_async(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_memory_register_in_args *args;
|
||||
struct pva_kmd_register_out_args out_args = { 0 };
|
||||
struct pva_kmd_device_memory *dev_mem;
|
||||
struct pva_cmd_update_resource_table *update_cmd;
|
||||
struct pva_resource_entry entry = { 0 };
|
||||
uint8_t smmu_ctx_id;
|
||||
|
||||
uint32_t resource_id = 0;
|
||||
|
||||
if (!access_ok(out_buffer, sizeof(struct pva_kmd_register_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(in_buffer,
|
||||
sizeof(struct pva_kmd_memory_register_in_args))) {
|
||||
err = PVA_INVAL;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
args = read_data(in_buffer,
|
||||
sizeof(struct pva_kmd_memory_register_in_args));
|
||||
|
||||
dev_mem = pva_kmd_device_memory_acquire(args->memory_handle,
|
||||
args->offset, args->size, ctx);
|
||||
if (dev_mem == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (args->segment == PVA_MEMORY_SEGMENT_R5) {
|
||||
smmu_ctx_id = PVA_R5_SMMU_CONTEXT_ID;
|
||||
} else {
|
||||
smmu_ctx_id = ctx->smmu_ctx_id;
|
||||
}
|
||||
|
||||
err = pva_kmd_device_memory_iova_map(dev_mem, ctx->pva,
|
||||
args->access_flags, smmu_ctx_id);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto release;
|
||||
}
|
||||
|
||||
if ((smmu_ctx_id == PVA_R5_SMMU_CONTEXT_ID) &&
|
||||
(dev_mem->iova < FW_SHARED_MEMORY_START)) {
|
||||
pva_kmd_log_err(
|
||||
"Not able to map memory in the R5 shared region");
|
||||
err = PVA_NOMEM;
|
||||
goto unmap;
|
||||
}
|
||||
|
||||
pva_kmd_mutex_lock(&ctx->resource_table_lock);
|
||||
err = pva_kmd_add_dram_buffer_resource(&ctx->ctx_resource_table,
|
||||
dev_mem, &resource_id);
|
||||
pva_kmd_mutex_unlock(&ctx->resource_table_lock);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto unmap;
|
||||
}
|
||||
|
||||
update_cmd =
|
||||
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
|
||||
if (update_cmd == NULL) {
|
||||
pva_kmd_log_err("Unable to reserve command buffer space");
|
||||
err = PVA_NOMEM;
|
||||
goto free_dram_buffer_resource;
|
||||
}
|
||||
|
||||
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
|
||||
&entry);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_cmdbuf;
|
||||
}
|
||||
|
||||
pva_kmd_set_cmd_update_resource_table(
|
||||
update_cmd, ctx->resource_table_id, resource_id, &entry);
|
||||
|
||||
out_args.error = PVA_SUCCESS;
|
||||
out_args.resource_id = resource_id;
|
||||
write_data(out_buffer, &out_args, sizeof(out_args));
|
||||
return err;
|
||||
free_cmdbuf:
|
||||
pva_kmd_cmdbuf_builder_cancel(cmdbuf_builder);
|
||||
free_dram_buffer_resource:
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table, resource_id);
|
||||
unmap:
|
||||
pva_kmd_device_memory_iova_unmap(dev_mem);
|
||||
release:
|
||||
pva_kmd_device_memory_free(dev_mem);
|
||||
err_out:
|
||||
out_args.error = err;
|
||||
write_data(out_buffer, &out_args, sizeof(out_args));
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error pva_kmd_op_executable_register_async(
|
||||
struct pva_kmd_context *ctx, struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_executable_register_in_args *args;
|
||||
struct pva_kmd_exec_register_out_args out_args = { 0 };
|
||||
struct pva_cmd_update_resource_table *update_cmd;
|
||||
struct pva_resource_entry entry = { 0 };
|
||||
struct pva_kmd_resource_record *rec;
|
||||
uint32_t num_symbols = 0;
|
||||
void *exec_data;
|
||||
|
||||
uint32_t resource_id = 0;
|
||||
|
||||
if (!access_ok(out_buffer,
|
||||
sizeof(struct pva_kmd_exec_register_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(in_buffer,
|
||||
sizeof(struct pva_kmd_executable_register_in_args))) {
|
||||
err = PVA_INVAL;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
args = read_data(in_buffer,
|
||||
sizeof(struct pva_kmd_executable_register_in_args));
|
||||
|
||||
if (!access_ok(in_buffer, args->size)) {
|
||||
err = PVA_INVAL;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
exec_data = read_data(in_buffer, args->size);
|
||||
|
||||
err = pva_kmd_verify_exectuable_hash(ctx->pva, (uint8_t *)exec_data,
|
||||
args->size);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
pva_kmd_mutex_lock(&ctx->resource_table_lock);
|
||||
err = pva_kmd_add_vpu_bin_resource(&ctx->ctx_resource_table, exec_data,
|
||||
args->size, &resource_id);
|
||||
if (err == PVA_SUCCESS) {
|
||||
rec = pva_kmd_use_resource(&ctx->ctx_resource_table,
|
||||
resource_id);
|
||||
ASSERT(rec != NULL);
|
||||
num_symbols = rec->vpu_bin.symbol_table.n_symbols;
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table, resource_id);
|
||||
}
|
||||
pva_kmd_mutex_unlock(&ctx->resource_table_lock);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
update_cmd =
|
||||
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
|
||||
if (update_cmd == NULL) {
|
||||
pva_kmd_log_err("Unable to reserve memory in command buffer");
|
||||
err = PVA_NOMEM;
|
||||
goto drop_resource;
|
||||
}
|
||||
ASSERT(update_cmd != NULL);
|
||||
|
||||
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
|
||||
&entry);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
pva_kmd_set_cmd_update_resource_table(
|
||||
update_cmd, ctx->resource_table_id, resource_id, &entry);
|
||||
|
||||
out_args.error = PVA_SUCCESS;
|
||||
out_args.resource_id = resource_id;
|
||||
out_args.num_symbols = num_symbols;
|
||||
write_data(out_buffer, &out_args, sizeof(out_args));
|
||||
return err;
|
||||
drop_resource:
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table, resource_id);
|
||||
err_out:
|
||||
out_args.error = err;
|
||||
write_data(out_buffer, &out_args, sizeof(out_args));
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
pva_kmd_op_dma_register_async(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_dma_config_register_in_args *args;
|
||||
struct pva_kmd_register_out_args out_args = { 0 };
|
||||
struct pva_cmd_update_resource_table *update_cmd;
|
||||
struct pva_resource_entry entry = { 0 };
|
||||
void *dma_cfg_data;
|
||||
uint32_t dma_cfg_payload_size;
|
||||
uint32_t resource_id = 0;
|
||||
uint32_t dma_config_size = 0;
|
||||
|
||||
if (!access_ok(out_buffer, sizeof(struct pva_kmd_register_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(in_buffer,
|
||||
sizeof(struct pva_kmd_dma_config_register_in_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
args = read_data(in_buffer,
|
||||
sizeof(struct pva_kmd_dma_config_register_in_args));
|
||||
|
||||
dma_cfg_data = &args->dma_config_header;
|
||||
dma_cfg_payload_size = in_buffer->size - in_buffer->offset;
|
||||
// Discard the data we are about to pass to pva_kmd_add_dma_config_resource
|
||||
read_data(in_buffer, dma_cfg_payload_size);
|
||||
|
||||
pva_kmd_mutex_lock(&ctx->resource_table_lock);
|
||||
dma_config_size =
|
||||
safe_addu32(dma_cfg_payload_size,
|
||||
(uint32_t)sizeof(args->dma_config_header));
|
||||
err = pva_kmd_add_dma_config_resource(&ctx->ctx_resource_table,
|
||||
dma_cfg_data, dma_config_size,
|
||||
&resource_id);
|
||||
pva_kmd_mutex_unlock(&ctx->resource_table_lock);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
update_cmd =
|
||||
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
|
||||
if (update_cmd == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto drop_dma_config;
|
||||
}
|
||||
|
||||
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
|
||||
&entry);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
pva_kmd_set_cmd_update_resource_table(
|
||||
update_cmd, ctx->resource_table_id, resource_id, &entry);
|
||||
|
||||
out_args.error = PVA_SUCCESS;
|
||||
out_args.resource_id = resource_id;
|
||||
write_data(out_buffer, &out_args, sizeof(out_args));
|
||||
|
||||
return PVA_SUCCESS;
|
||||
drop_dma_config:
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table, resource_id);
|
||||
err_out:
|
||||
out_args.error = err;
|
||||
write_data(out_buffer, &out_args, sizeof(out_args));
|
||||
/* Error is reported in the output buffer. So we return success here. */
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
pva_kmd_op_unregister_async(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_unregister_in_args *args;
|
||||
struct pva_cmd_unregister_resource *unreg_cmd;
|
||||
|
||||
if (!access_ok(in_buffer, sizeof(struct pva_kmd_unregister_in_args))) {
|
||||
err = PVA_INVAL;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
args = read_data(in_buffer, sizeof(struct pva_kmd_unregister_in_args));
|
||||
|
||||
unreg_cmd =
|
||||
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*unreg_cmd));
|
||||
if (unreg_cmd == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"Unable to reserve memory for unregister command");
|
||||
err = PVA_NOMEM;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
pva_kmd_set_cmd_unregister_resource(unreg_cmd, args->resource_id);
|
||||
|
||||
return PVA_SUCCESS;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error pva_kmd_async_ops_handler(
|
||||
struct pva_kmd_context *ctx, struct pva_fw_postfence *post_fence,
|
||||
struct pva_kmd_buffer *in_arg, struct pva_kmd_buffer *out_arg)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder cmdbuf_builder;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
uint32_t wait_time = 0;
|
||||
|
||||
/* First check whether there is space in the queue */
|
||||
while (pva_kmd_queue_space(&ctx->ctx_queue) == 0) {
|
||||
pva_kmd_sleep_us(PVA_KMD_WAIT_FW_POLL_INTERVAL_US);
|
||||
wait_time += PVA_KMD_WAIT_FW_POLL_INTERVAL_US;
|
||||
if (wait_time > PVA_KMD_WAIT_FW_TIMEOUT_US) {
|
||||
err = PVA_TIMEDOUT;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_prepare(&ctx->submitter, &cmdbuf_builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
while (access_ok(in_arg, sizeof(struct pva_kmd_op_header))) {
|
||||
struct pva_kmd_op_header *header =
|
||||
read_data(in_arg, sizeof(struct pva_kmd_op_header));
|
||||
|
||||
if (header->op_type >= PVA_KMD_OP_MAX) {
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
switch (header->op_type) {
|
||||
case PVA_KMD_OP_MEMORY_REGISTER:
|
||||
err = pva_kmd_op_memory_register_async(
|
||||
ctx, in_arg, out_arg, &cmdbuf_builder);
|
||||
break;
|
||||
|
||||
case PVA_KMD_OP_EXECUTABLE_REGISTER:
|
||||
err = pva_kmd_op_executable_register_async(
|
||||
ctx, in_arg, out_arg, &cmdbuf_builder);
|
||||
break;
|
||||
|
||||
case PVA_KMD_OP_DMA_CONFIG_REGISTER:
|
||||
err = pva_kmd_op_dma_register_async(
|
||||
ctx, in_arg, out_arg, &cmdbuf_builder);
|
||||
break;
|
||||
case PVA_KMD_OP_UNREGISTER:
|
||||
err = pva_kmd_op_unregister_async(ctx, in_arg, out_arg,
|
||||
&cmdbuf_builder);
|
||||
break;
|
||||
|
||||
default:
|
||||
err = PVA_INVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (err != PVA_SUCCESS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* This fence comes from user, so set the flag to inform FW */
|
||||
post_fence->flags |= PVA_FW_POSTFENCE_FLAGS_USER_FENCE;
|
||||
err = pva_kmd_submitter_submit_with_fence(&ctx->submitter,
|
||||
&cmdbuf_builder, post_fence);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
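/*
 * Illustrative sketch, not part of this change: one way a caller could pack
 * the async ops buffer that the loop above consumes.  Each entry is a
 * struct pva_kmd_op_header followed immediately by that op's "in" arguments;
 * the handler keeps reading headers until access_ok() fails on the remaining
 * bytes.  The helper name and the use of memcpy() are assumptions, not part
 * of the driver interface.
 */
static inline size_t example_pack_unregister_op(uint8_t *buf,
						uint32_t resource_id)
{
	struct pva_kmd_op_header hdr = { 0 };
	struct pva_kmd_unregister_in_args args = { 0 };

	hdr.op_type = PVA_KMD_OP_UNREGISTER;
	args.resource_id = resource_id;

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), &args, sizeof(args));
	return sizeof(hdr) + sizeof(args);
}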
|
||||
|
||||
static enum pva_error pva_kmd_op_context_init(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer)
|
||||
{
|
||||
struct pva_kmd_context_init_in_args *ctx_init_args;
|
||||
struct pva_kmd_context_init_out_args ctx_init_out = { 0 };
|
||||
enum pva_error err;
|
||||
|
||||
if (!access_ok(in_buffer,
|
||||
sizeof(struct pva_kmd_context_init_in_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(out_buffer,
|
||||
sizeof(struct pva_kmd_context_init_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
ctx_init_args = read_data(in_buffer,
|
||||
sizeof(struct pva_kmd_context_init_in_args));
|
||||
|
||||
err = pva_kmd_context_init(ctx, ctx_init_args->resource_table_capacity);
|
||||
ctx_init_out.error = err;
|
||||
ctx_init_out.ccq_shm_hdl = (uint64_t)ctx->ccq_shm_handle;
|
||||
|
||||
write_data(out_buffer, &ctx_init_out, sizeof(ctx_init_out));
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
pva_kmd_op_syncpt_register_async(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
struct pva_kmd_cmdbuf_builder *cmdbuf_builder)
|
||||
{
|
||||
enum pva_error err;
|
||||
struct pva_syncpt_rw_info *syncpts;
|
||||
struct pva_kmd_device_memory dev_mem;
|
||||
uint32_t resource_id = 0;
|
||||
struct pva_cmd_update_resource_table *update_cmd;
|
||||
struct pva_resource_entry entry = { 0 };
|
||||
struct pva_kmd_syncpt_register_out_args syncpt_register_out = { 0 };
|
||||
|
||||
/* Register RO syncpts */
|
||||
dev_mem.iova = ctx->pva->syncpt_ro_iova;
|
||||
dev_mem.va = 0;
|
||||
dev_mem.size = ctx->pva->syncpt_offset * ctx->pva->num_syncpts;
|
||||
dev_mem.pva = ctx->pva;
|
||||
dev_mem.smmu_ctx_idx = PVA_R5_SMMU_CONTEXT_ID;
|
||||
pva_kmd_mutex_lock(&ctx->resource_table_lock);
|
||||
err = pva_kmd_add_syncpt_resource(&ctx->ctx_resource_table, &dev_mem,
|
||||
&resource_id);
|
||||
pva_kmd_mutex_unlock(&ctx->resource_table_lock);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
syncpt_register_out.syncpt_ro_res_id = resource_id;
|
||||
syncpt_register_out.num_ro_syncpoints = ctx->pva->num_syncpts;
|
||||
update_cmd =
|
||||
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
|
||||
ASSERT(update_cmd != NULL);
|
||||
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
|
||||
&entry);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_set_cmd_update_resource_table(
|
||||
update_cmd, ctx->resource_table_id, resource_id, &entry);
|
||||
|
||||
/* Register RW syncpts */
|
||||
syncpts = (struct pva_syncpt_rw_info *)pva_kmd_get_block(
|
||||
&ctx->pva->syncpt_allocator, ctx->syncpt_block_index);
|
||||
ASSERT(syncpts != NULL);
|
||||
|
||||
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS_PER_CONTEXT; i++) {
|
||||
ctx->syncpt_ids[i] = syncpts[i].syncpt_id;
|
||||
syncpt_register_out.synpt_ids[i] = syncpts[i].syncpt_id;
|
||||
}
|
||||
|
||||
dev_mem.iova = syncpts[0].syncpt_iova;
|
||||
dev_mem.va = 0;
|
||||
dev_mem.size = ctx->pva->syncpt_offset * PVA_NUM_RW_SYNCPTS_PER_CONTEXT;
|
||||
dev_mem.pva = ctx->pva;
|
||||
dev_mem.smmu_ctx_idx = PVA_R5_SMMU_CONTEXT_ID;
|
||||
pva_kmd_mutex_lock(&ctx->resource_table_lock);
|
||||
err = pva_kmd_add_syncpt_resource(&ctx->ctx_resource_table, &dev_mem,
|
||||
&resource_id);
|
||||
pva_kmd_mutex_unlock(&ctx->resource_table_lock);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
syncpt_register_out.syncpt_rw_res_id = resource_id;
|
||||
syncpt_register_out.synpt_size = ctx->pva->syncpt_offset;
|
||||
ctx->ctx_resource_table.syncpt_allocator = &ctx->pva->syncpt_allocator;
|
||||
update_cmd =
|
||||
pva_kmd_reserve_cmd_space(cmdbuf_builder, sizeof(*update_cmd));
|
||||
ASSERT(update_cmd != NULL);
|
||||
err = pva_kmd_make_resource_entry(&ctx->ctx_resource_table, resource_id,
|
||||
&entry);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
pva_kmd_set_cmd_update_resource_table(
|
||||
update_cmd, ctx->resource_table_id, resource_id, &entry);
|
||||
|
||||
err_out:
|
||||
syncpt_register_out.error = err;
|
||||
write_data(out_buffer, &syncpt_register_out,
|
||||
sizeof(syncpt_register_out));
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error pva_kmd_op_queue_create(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_arg,
|
||||
struct pva_kmd_buffer *out_arg)
|
||||
{
|
||||
struct pva_kmd_queue_create_in_args *queue_create_args;
|
||||
struct pva_kmd_queue_create_out_args queue_out_args = { 0 };
|
||||
uint32_t queue_id = PVA_INVALID_QUEUE_ID;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
if (!access_ok(in_arg, sizeof(struct pva_kmd_queue_create_in_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(out_arg, sizeof(struct pva_kmd_queue_create_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
queue_create_args =
|
||||
read_data(in_arg, sizeof(struct pva_kmd_queue_create_in_args));
|
||||
queue_out_args.error =
|
||||
pva_kmd_queue_create(ctx, queue_create_args, &queue_id);
|
||||
if (queue_out_args.error == PVA_SUCCESS) {
|
||||
queue_out_args.queue_id = queue_id;
|
||||
}
|
||||
|
||||
if (queue_id >= PVA_MAX_NUM_QUEUES_PER_CONTEXT) {
|
||||
pva_kmd_log_err("pva_kmd_op_queue_create invalid queue id");
|
||||
err = PVA_INVAL;
|
||||
goto err_out;
|
||||
}
|
||||
pva_kmd_read_syncpt_val(ctx->pva, ctx->syncpt_ids[queue_id],
|
||||
&queue_out_args.syncpt_fence_counter);
|
||||
|
||||
write_data(out_arg, &queue_out_args,
|
||||
sizeof(struct pva_kmd_queue_create_out_args));
|
||||
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error pva_kmd_op_queue_destroy(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_arg,
|
||||
struct pva_kmd_buffer *out_arg)
|
||||
{
|
||||
struct pva_kmd_queue_destroy_in_args *queue_destroy_args;
|
||||
struct pva_kmd_queue_destroy_out_args queue_out_args = { 0 };
|
||||
|
||||
if (!access_ok(in_arg, sizeof(struct pva_kmd_queue_destroy_in_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(out_arg,
|
||||
sizeof(struct pva_kmd_queue_destroy_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
queue_destroy_args =
|
||||
read_data(in_arg, sizeof(struct pva_kmd_queue_destroy_in_args));
|
||||
queue_out_args.error = pva_kmd_queue_destroy(ctx, queue_destroy_args);
|
||||
|
||||
write_data(out_arg, &queue_out_args,
|
||||
sizeof(struct pva_kmd_queue_destroy_out_args));
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static enum pva_error
|
||||
pva_kmd_op_executable_get_symbols(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_arg,
|
||||
struct pva_kmd_buffer *out_arg)
|
||||
{
|
||||
struct pva_kmd_executable_get_symbols_in_args *sym_in_args;
|
||||
struct pva_kmd_executable_get_symbols_out_args sym_out_args = { 0 };
|
||||
struct pva_kmd_resource_record *rec;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
uint32_t table_size = 0;
|
||||
uint32_t size = 0;
|
||||
|
||||
if (!access_ok(in_arg,
|
||||
sizeof(struct pva_kmd_executable_get_symbols_in_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
if (!access_ok(out_arg,
|
||||
sizeof(struct pva_kmd_executable_get_symbols_out_args))) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
sym_in_args = read_data(
|
||||
in_arg, sizeof(struct pva_kmd_executable_get_symbols_in_args));
|
||||
|
||||
rec = pva_kmd_use_resource(&ctx->ctx_resource_table,
|
||||
sym_in_args->exec_resource_id);
|
||||
if (rec == NULL) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("pva_kmd_use_resource failed");
|
||||
goto err_out;
|
||||
}
|
||||
if (rec->type != PVA_RESOURCE_TYPE_EXEC_BIN) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("Not an executable resource");
|
||||
goto err_drop;
|
||||
}
|
||||
|
||||
table_size = safe_mulu32(rec->vpu_bin.symbol_table.n_symbols,
|
||||
sizeof(struct pva_symbol_info));
|
||||
size = safe_addu32(
|
||||
table_size,
|
||||
sizeof(struct pva_kmd_executable_get_symbols_out_args));
|
||||
if (!access_ok(out_arg, size)) {
|
||||
err = PVA_INVAL;
|
||||
goto err_drop;
|
||||
}
|
||||
|
||||
sym_out_args.error = err;
|
||||
sym_out_args.num_symbols = rec->vpu_bin.symbol_table.n_symbols;
|
||||
write_data(out_arg, &sym_out_args, sizeof(sym_out_args));
|
||||
write_data(out_arg, rec->vpu_bin.symbol_table.symbols, table_size);
|
||||
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table,
|
||||
sym_in_args->exec_resource_id);
|
||||
|
||||
return PVA_SUCCESS;
|
||||
|
||||
err_drop:
|
||||
pva_kmd_drop_resource(&ctx->ctx_resource_table,
|
||||
sym_in_args->exec_resource_id);
|
||||
|
||||
err_out:
|
||||
sym_out_args.error = err;
|
||||
write_data(out_arg, &sym_out_args, sizeof(sym_out_args));
|
||||
return err;
|
||||
}
|
||||
|
||||
typedef enum pva_error (*pva_kmd_async_op_func_t)(
|
||||
struct pva_kmd_context *ctx, struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
struct pva_kmd_cmdbuf_builder *cmdbuf_builder);
|
||||
|
||||
static enum pva_error
|
||||
pva_kmd_op_synced_submit(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_buffer,
|
||||
struct pva_kmd_buffer *out_buffer,
|
||||
pva_kmd_async_op_func_t async_op_func)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_cmdbuf_builder cmdbuf_builder;
|
||||
uint32_t fence_val;
|
||||
|
||||
err = pva_kmd_submitter_prepare(&ctx->submitter, &cmdbuf_builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = async_op_func(ctx, in_buffer, out_buffer, &cmdbuf_builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto cancel_submit;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_submit(&ctx->submitter, &cmdbuf_builder,
|
||||
&fence_val);
|
||||
/* TODO: handle this error */
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
/* TODO: handle this error when FW reboot is supported */
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
return PVA_SUCCESS;
|
||||
cancel_submit:
|
||||
pva_kmd_cmdbuf_builder_cancel(&cmdbuf_builder);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum pva_error pva_kmd_sync_ops_handler(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_buffer *in_arg,
|
||||
struct pva_kmd_buffer *out_arg)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_op_header *header;
|
||||
|
||||
if (!access_ok(in_arg, sizeof(struct pva_kmd_op_header))) {
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
header = read_data(in_arg, sizeof(struct pva_kmd_op_header));
|
||||
switch (header->op_type) {
|
||||
case PVA_KMD_OP_CONTEXT_INIT:
|
||||
err = pva_kmd_op_context_init(ctx, in_arg, out_arg);
|
||||
break;
|
||||
case PVA_KMD_OP_QUEUE_CREATE:
|
||||
err = pva_kmd_op_queue_create(ctx, in_arg, out_arg);
|
||||
break;
|
||||
case PVA_KMD_OP_QUEUE_DESTROY:
|
||||
err = pva_kmd_op_queue_destroy(ctx, in_arg, out_arg);
|
||||
break;
|
||||
case PVA_KMD_OP_EXECUTABLE_GET_SYMBOLS:
|
||||
err = pva_kmd_op_executable_get_symbols(ctx, in_arg, out_arg);
|
||||
break;
|
||||
case PVA_KMD_OP_MEMORY_REGISTER:
|
||||
err = pva_kmd_op_synced_submit(
|
||||
ctx, in_arg, out_arg, pva_kmd_op_memory_register_async);
|
||||
break;
|
||||
case PVA_KMD_OP_SYNPT_REGISTER:
|
||||
err = pva_kmd_op_synced_submit(
|
||||
ctx, in_arg, out_arg, pva_kmd_op_syncpt_register_async);
|
||||
break;
|
||||
case PVA_KMD_OP_EXECUTABLE_REGISTER:
|
||||
err = pva_kmd_op_synced_submit(
|
||||
ctx, in_arg, out_arg,
|
||||
pva_kmd_op_executable_register_async);
|
||||
break;
|
||||
case PVA_KMD_OP_DMA_CONFIG_REGISTER:
|
||||
err = pva_kmd_op_synced_submit(ctx, in_arg, out_arg,
|
||||
pva_kmd_op_dma_register_async);
|
||||
break;
|
||||
case PVA_KMD_OP_UNREGISTER:
|
||||
err = pva_kmd_op_synced_submit(ctx, in_arg, out_arg,
|
||||
pva_kmd_op_unregister_async);
|
||||
break;
|
||||
default:
|
||||
err = PVA_INVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_ops_handler(struct pva_kmd_context *ctx,
|
||||
void const *ops_buffer, uint32_t ops_size,
|
||||
void *response,
|
||||
uint32_t response_buffer_size,
|
||||
uint32_t *out_response_size)
|
||||
{
|
||||
struct pva_kmd_operations *ops;
|
||||
struct pva_kmd_buffer in_buffer = { 0 }, out_buffer = { 0 };
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_response_header *resp_hdr;
|
||||
|
||||
in_buffer.base = ops_buffer;
|
||||
in_buffer.size = ops_size;
|
||||
|
||||
out_buffer.base = response;
|
||||
out_buffer.size = response_buffer_size;
|
||||
|
||||
if (!access_ok(&in_buffer, sizeof(struct pva_kmd_operations))) {
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!access_ok(&out_buffer, sizeof(struct pva_kmd_response_header))) {
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
resp_hdr =
|
||||
read_data(&out_buffer, sizeof(struct pva_kmd_response_header));
|
||||
|
||||
ops = read_data(&in_buffer, sizeof(struct pva_kmd_operations));
|
||||
|
||||
if (ops->mode == PVA_KMD_OPS_MODE_SYNC) {
|
||||
/* Process one sync operation */
|
||||
err = pva_kmd_sync_ops_handler(ctx, &in_buffer, &out_buffer);
|
||||
|
||||
} else {
|
||||
/* Process async operations:
|
||||
* - memory register
|
||||
* - executable register
|
||||
* - DMA configuration registration
|
||||
* - unregister
|
||||
*/
|
||||
err = pva_kmd_async_ops_handler(ctx, &ops->postfence,
|
||||
&in_buffer, &out_buffer);
|
||||
}
|
||||
//Update the size of the responses in the response header.
|
||||
// This size also includes the header size.
|
||||
resp_hdr->rep_size = out_buffer.offset;
|
||||
out:
|
||||
*out_response_size = out_buffer.offset;
|
||||
return err;
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_OP_HANDLER_H
|
||||
#define PVA_KMD_OP_HANDLER_H
|
||||
|
||||
#include "pva_kmd_context.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd.h"
|
||||
|
||||
/** @brief Handler for PVA KMD operations.
|
||||
*
|
||||
* This function implements the only runtime interface with UMD. Shim layers
|
||||
* receive the input data from UMD and call this function to execute the
|
||||
* operations. Then, shim layers send the response back to UMD.
|
||||
*
|
||||
* @param ctx The KMD context.
|
||||
* @param ops Pointer to the input buffer containing the operations to be
|
||||
* executed. The common layer assumes that this buffer is private to
|
||||
* KMD and will dereference it directly without making a copy.
|
||||
* Specifically on Linux, this parameter should point to a private
|
||||
* kernel space buffer instead of the user space buffer.
|
||||
* @param ops_size Size of the input buffer.
|
||||
* @param response Pointer to the buffer where the response will be written.
|
||||
* @param response_buffer_size Size of the response buffer.
|
||||
* @param out_response_size Pointer to a variable where the actual size of the
|
||||
* response will be written.
|
||||
*
|
||||
* @return pva_error indicating the success or failure of the operation.
|
||||
*/
|
||||
enum pva_error pva_kmd_ops_handler(struct pva_kmd_context *ctx, void const *ops,
|
||||
uint32_t ops_size, void *response,
|
||||
uint32_t response_buffer_size,
|
||||
uint32_t *out_response_size);
|
||||
|
||||
#endif // PVA_KMD_OP_HANDLER_H
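/*
 * Illustrative sketch, not part of this change: a minimal Linux shim calling
 * pva_kmd_ops_handler().  As documented above, the ops buffer must be a
 * private kernel copy rather than the raw user pointer.  The function name
 * and the ioctl argument layout are hypothetical examples.
 */
static long example_submit_ops(struct pva_kmd_context *ctx,
			       void __user *user_ops, uint32_t ops_size,
			       void __user *user_resp, uint32_t resp_size)
{
	void *ops = kmalloc(ops_size, GFP_KERNEL);
	void *resp = kzalloc(resp_size, GFP_KERNEL);
	uint32_t out_size = 0;
	long ret = 0;

	if (ops == NULL || resp == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	if (copy_from_user(ops, user_ops, ops_size) != 0) {
		ret = -EFAULT;
		goto out;
	}

	if (pva_kmd_ops_handler(ctx, ops, ops_size, resp, resp_size,
				&out_size) != PVA_SUCCESS)
		ret = -EINVAL;
	else if (copy_to_user(user_resp, resp, out_size) != 0)
		ret = -EFAULT;
out:
	kfree(ops);
	kfree(resp);
	return ret;
}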
|
||||
210
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_pm.c
Normal file
@@ -0,0 +1,210 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_context.h"
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_kmd_pm.h"
|
||||
|
||||
enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_cmd_suspend_fw *fw_suspend;
|
||||
uint32_t fence_val;
|
||||
|
||||
pva_kmd_mutex_lock(&pva->powercycle_lock);
|
||||
if (pva->refcount == 0u) {
|
||||
pva_dbg_printf("PVA: Nothing to prepare for suspend");
|
||||
err = PVA_SUCCESS;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Prepare submitter for FW suspend command failed\n");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
//Build args
|
||||
fw_suspend = pva_kmd_reserve_cmd_space(&builder, sizeof(*fw_suspend));
|
||||
if (fw_suspend == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Memory alloc for FW suspend command failed\n");
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_submit;
|
||||
}
|
||||
|
||||
pva_kmd_set_cmd_suspend_fw(fw_suspend);
|
||||
|
||||
//Submit
|
||||
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Submission for FW suspend command failed\n");
|
||||
goto cancel_submit;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Waiting for FW timed out when preparing for suspend state\n");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
cancel_submit:
|
||||
pva_kmd_cmdbuf_builder_cancel(&builder);
|
||||
|
||||
err_out:
|
||||
pva_kmd_mutex_unlock(&pva->powercycle_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_kmd_submitter *dev_submitter = &pva->submitter;
|
||||
struct pva_cmd_init_resource_table *res_cmd;
|
||||
struct pva_cmd_init_queue *queue_cmd;
|
||||
struct pva_cmd_resume_fw *fw_resume;
|
||||
enum pva_error err;
|
||||
uint32_t fence_val;
|
||||
struct pva_kmd_queue *queue;
|
||||
|
||||
pva_kmd_mutex_lock(&pva->powercycle_lock);
|
||||
if (pva->refcount == 0u) {
|
||||
pva_dbg_printf(
|
||||
"PVA : Nothing to check for completion in resume");
|
||||
err = PVA_SUCCESS;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
pva_kmd_send_resource_table_info_by_ccq(pva, &pva->dev_resource_table);
|
||||
pva_kmd_send_queue_info_by_ccq(pva, &pva->dev_queue);
|
||||
|
||||
err = pva_kmd_submitter_prepare(dev_submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Prepare submitter for FW resume command failed\n");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
fw_resume = pva_kmd_reserve_cmd_space(&builder, sizeof(*fw_resume));
|
||||
if (fw_resume == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Memory alloc for FW resume command failed\n");
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
pva_kmd_set_cmd_resume_fw(fw_resume);
|
||||
|
||||
for (uint8_t i = 0; i < pva->max_n_contexts; i++) {
|
||||
struct pva_kmd_context *ctx = pva_kmd_get_context(
|
||||
pva, sat_add8(i, PVA_KMD_USER_CONTEXT_ID_BASE));
|
||||
if (ctx != NULL) {
|
||||
/* Initialize resource table */
|
||||
res_cmd = pva_kmd_reserve_cmd_space(&builder,
|
||||
sizeof(*res_cmd));
|
||||
if (res_cmd == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Memory alloc for context registration in FW resume command failed\n");
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
pva_dbg_printf(
|
||||
"PVA: Resume init resource table for context %d\n",
|
||||
ctx->ccq_id);
|
||||
pva_kmd_set_cmd_init_resource_table(
|
||||
res_cmd, ctx->resource_table_id,
|
||||
ctx->ctx_resource_table.table_mem->iova,
|
||||
ctx->ctx_resource_table.n_entries);
|
||||
|
||||
queue_cmd = pva_kmd_reserve_cmd_space(
|
||||
&builder, sizeof(*queue_cmd));
|
||||
if (queue_cmd == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Memory alloc for queue registration in FW resume command failed\n");
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
pva_dbg_printf(
|
||||
"PVA: Resume priv queue for context %d\n",
|
||||
ctx->ccq_id);
|
||||
pva_kmd_set_cmd_init_queue(
|
||||
queue_cmd, PVA_PRIV_CCQ_ID,
|
||||
ctx->ccq_id, /* For privileged queues, queue ID == user CCQ ID*/
|
||||
ctx->ctx_queue.queue_memory->iova,
|
||||
ctx->ctx_queue.max_num_submit);
|
||||
|
||||
/* Re-register this context's user queues */
|
||||
for (uint32_t j = 0; j < ctx->max_n_queues; j++) {
|
||||
queue = pva_kmd_get_block(&ctx->queue_allocator,
|
||||
j);
|
||||
if (queue != NULL) {
|
||||
pva_dbg_printf(
|
||||
"PVA: Resume queue for context %d, queue %d\n",
|
||||
queue->ccq_id, queue->queue_id);
|
||||
queue_cmd = pva_kmd_reserve_cmd_space(
|
||||
&builder, sizeof(*queue_cmd));
|
||||
if (queue_cmd == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"PVA: Memory alloc for queue registration in FW resume command failed\n");
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
pva_kmd_set_cmd_init_queue(
|
||||
queue_cmd, queue->ccq_id,
|
||||
queue->queue_id,
|
||||
queue->queue_memory->iova,
|
||||
queue->max_num_submit);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
// Error is either QUEUE_FULL or TIMEDOUT
|
||||
pva_kmd_log_err(
|
||||
"PVA: Submission for FW resume command failed\n");
|
||||
goto cancel_builder;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(dev_submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"Waiting for FW timed out when resuming from suspend state");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
cancel_builder:
|
||||
pva_kmd_cmdbuf_builder_cancel(&builder);
|
||||
|
||||
err_out:
|
||||
pva_kmd_mutex_unlock(&pva->powercycle_lock);
|
||||
return err;
|
||||
}
|
||||
19
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_pm.h
Normal file
@@ -0,0 +1,19 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_PM_H
|
||||
#define PVA_KMD_PM_H
|
||||
|
||||
struct pva_kmd_device;
|
||||
enum pva_error pva_kmd_prepare_suspend(struct pva_kmd_device *pva);
|
||||
enum pva_error pva_kmd_complete_resume(struct pva_kmd_device *pva);
|
||||
|
||||
#endif
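/*
 * Illustrative sketch, not part of this change: how Linux platform glue might
 * wire these helpers into dev_pm_ops.  The dev_get_drvdata() lookup and the
 * function names are assumptions about the platform driver, not code from
 * this series.
 */
static int example_pva_suspend(struct device *dev)
{
	struct pva_kmd_device *pva = dev_get_drvdata(dev);

	return (pva_kmd_prepare_suspend(pva) == PVA_SUCCESS) ? 0 : -EBUSY;
}

static int example_pva_resume(struct device *dev)
{
	struct pva_kmd_device *pva = dev_get_drvdata(dev);

	return (pva_kmd_complete_resume(pva) == PVA_SUCCESS) ? 0 : -EIO;
}

static const struct dev_pm_ops example_pva_pm_ops = {
	SET_SYSTEM_SLEEP_PM_OPS(example_pva_suspend, example_pva_resume)
};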
|
||||
252
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_queue.c
Normal file
@@ -0,0 +1,252 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
#include "pva_kmd_context.h"
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_utils.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
|
||||
void pva_kmd_queue_init(struct pva_kmd_queue *queue, struct pva_kmd_device *pva,
|
||||
uint8_t ccq_id, uint8_t queue_id,
|
||||
pva_kmd_mutex_t *ccq_lock,
|
||||
struct pva_kmd_device_memory *queue_memory,
|
||||
uint32_t max_num_submit)
|
||||
{
|
||||
queue->pva = pva;
|
||||
queue->queue_memory = queue_memory;
|
||||
queue->ccq_id = ccq_id;
|
||||
queue->queue_id = queue_id;
|
||||
queue->max_num_submit = max_num_submit;
|
||||
queue->queue_header = queue_memory->va;
|
||||
queue->ccq_lock = ccq_lock;
|
||||
}
|
||||
|
||||
uint32_t pva_kmd_queue_space(struct pva_kmd_queue *queue)
|
||||
{
|
||||
uint32_t head = queue->queue_header->cb_head;
|
||||
uint32_t tail = queue->queue_header->cb_tail;
|
||||
uint32_t size = queue->max_num_submit;
|
||||
return pva_fw_queue_space(head, tail, size);
|
||||
}
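/*
 * Illustrative sketch, not part of this change: one common free-space
 * computation for a circular buffer of `size` slots, keeping one slot unused
 * so that head == tail always means "empty".  The authoritative definition of
 * pva_fw_queue_space() lives in the shared FW headers and may differ from
 * this assumed form.
 */
static inline uint32_t example_queue_space(uint32_t head, uint32_t tail,
					   uint32_t size)
{
	if (tail >= head)
		return (size - 1U) - (tail - head);

	return (head - tail) - 1U;
}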
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_queue_submit(struct pva_kmd_queue *queue,
|
||||
struct pva_fw_cmdbuf_submit_info const *submit_info)
|
||||
{
|
||||
uint32_t head = queue->queue_header->cb_head;
|
||||
uint32_t tail = queue->queue_header->cb_tail;
|
||||
uint32_t size = queue->max_num_submit;
|
||||
uint64_t ccq_entry;
|
||||
enum pva_error err;
|
||||
struct pva_fw_cmdbuf_submit_info *items = pva_offset_pointer(
|
||||
queue->queue_header, sizeof(*queue->queue_header));
|
||||
|
||||
if (pva_fw_queue_space(head, tail, size) == 0) {
|
||||
return PVA_QUEUE_FULL;
|
||||
}
|
||||
|
||||
items[tail] = *submit_info;
|
||||
|
||||
/* Update tail */
|
||||
tail = wrap_add(tail, 1, size);
|
||||
ccq_entry =
|
||||
PVA_INSERT64(PVA_FW_CCQ_OP_UPDATE_TAIL, PVA_FW_CCQ_OPCODE_MSB,
|
||||
PVA_FW_CCQ_OPCODE_LSB) |
|
||||
PVA_INSERT64(queue->queue_id, PVA_FW_CCQ_QUEUE_ID_MSB,
|
||||
PVA_FW_CCQ_QUEUE_ID_LSB) |
|
||||
PVA_INSERT64(tail, PVA_FW_CCQ_TAIL_MSB, PVA_FW_CCQ_TAIL_LSB);
|
||||
|
||||
pva_kmd_mutex_lock(queue->ccq_lock);
|
||||
/* TODO: memory write barrier is needed here */
|
||||
err = pva_kmd_ccq_push_with_timeout(queue->pva, queue->ccq_id,
|
||||
ccq_entry,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err == PVA_SUCCESS) {
|
||||
queue->queue_header->cb_tail = tail;
|
||||
}
|
||||
pva_kmd_mutex_unlock(queue->ccq_lock);
|
||||
|
||||
return err;
|
||||
}
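/*
 * Illustrative sketch, not part of this change: the doorbell word built above
 * packs an opcode, the queue ID and the new tail index into a single 64-bit
 * CCQ entry.  PVA_INSERT64(val, msb, lsb) is assumed to place `val` into bits
 * [msb:lsb]; an equivalent open-coded form would be:
 */
static inline uint64_t example_insert64(uint64_t val, unsigned int msb,
					unsigned int lsb)
{
	uint64_t mask = (msb - lsb == 63U) ?
				~0ULL :
				((1ULL << (msb - lsb + 1U)) - 1ULL);

	return (val & mask) << lsb;
}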
|
||||
|
||||
void pva_kmd_queue_deinit(struct pva_kmd_queue *queue)
|
||||
{
|
||||
queue->queue_memory = NULL;
|
||||
queue->ccq_id = PVA_INVALID_QUEUE_ID;
|
||||
queue->max_num_submit = 0;
|
||||
}
|
||||
|
||||
static enum pva_error notify_fw_queue_deinit(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_queue *queue)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_cmd_deinit_queue *queue_cmd;
|
||||
uint32_t fence_val;
|
||||
|
||||
err = pva_kmd_submitter_prepare(&ctx->submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
|
||||
if (queue_cmd == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_submitter;
|
||||
}
|
||||
pva_kmd_set_cmd_deinit_queue(queue_cmd, queue->ccq_id, queue->queue_id);
|
||||
|
||||
err = pva_kmd_submitter_submit(&ctx->submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto cancel_submitter;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
return PVA_SUCCESS;
|
||||
cancel_submitter:
|
||||
pva_kmd_cmdbuf_builder_cancel(&builder);
|
||||
end:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_queue_create(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_queue_create_in_args *in_args,
|
||||
uint32_t *queue_id)
|
||||
{
|
||||
struct pva_kmd_device_memory *submission_mem_kmd = NULL;
|
||||
struct pva_kmd_queue *queue = NULL;
|
||||
struct pva_kmd_cmdbuf_builder builder;
|
||||
struct pva_cmd_init_queue *queue_cmd;
|
||||
uint32_t fence_val;
|
||||
enum pva_error err, tmperr;
|
||||
|
||||
queue = pva_kmd_zalloc_block(&ctx->queue_allocator, queue_id);
|
||||
if (queue == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/* Get handle from mapped memory */
|
||||
submission_mem_kmd = pva_kmd_device_memory_acquire(
|
||||
in_args->queue_memory_handle, in_args->queue_memory_offset,
|
||||
pva_get_submission_queue_memory_size(
|
||||
in_args->max_submission_count),
|
||||
ctx);
|
||||
if (submission_mem_kmd == NULL) {
|
||||
err = PVA_INVAL;
|
||||
goto err_free_queue;
|
||||
}
|
||||
|
||||
pva_kmd_queue_init(queue, ctx->pva, ctx->ccq_id, *queue_id,
|
||||
&ctx->ccq_lock, submission_mem_kmd,
|
||||
in_args->max_submission_count);
|
||||
|
||||
/* Get device mapped IOVA to share with FW */
|
||||
err = pva_kmd_device_memory_iova_map(submission_mem_kmd, ctx->pva,
|
||||
PVA_ACCESS_RW,
|
||||
PVA_R5_SMMU_CONTEXT_ID);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_free_kmd_memory;
|
||||
}
|
||||
|
||||
if (submission_mem_kmd->iova < FW_SHARED_MEMORY_START) {
|
||||
pva_kmd_log_err(
|
||||
"Not able to map memory in the R5 shared region");
|
||||
err = PVA_NOMEM;
|
||||
goto unmap_iova;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_prepare(&ctx->submitter, &builder);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto unmap_iova;
|
||||
}
|
||||
|
||||
queue_cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*queue_cmd));
|
||||
if (queue_cmd == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto cancel_submitter;
|
||||
}
|
||||
ASSERT(queue_cmd != NULL);
|
||||
pva_kmd_set_cmd_init_queue(queue_cmd, queue->ccq_id, queue->queue_id,
|
||||
queue->queue_memory->iova,
|
||||
queue->max_num_submit);
|
||||
|
||||
err = pva_kmd_submitter_submit(&ctx->submitter, &builder, &fence_val);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto cancel_submitter;
|
||||
}
|
||||
|
||||
err = pva_kmd_submitter_wait(&ctx->submitter, fence_val,
|
||||
PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
|
||||
PVA_KMD_WAIT_FW_TIMEOUT_US);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto cancel_submitter;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
|
||||
cancel_submitter:
|
||||
pva_kmd_cmdbuf_builder_cancel(&builder);
|
||||
unmap_iova:
|
||||
pva_kmd_device_memory_iova_unmap(submission_mem_kmd);
|
||||
err_free_kmd_memory:
|
||||
pva_kmd_device_memory_free(queue->queue_memory);
|
||||
pva_kmd_queue_deinit(queue);
|
||||
err_free_queue:
|
||||
tmperr = pva_kmd_free_block(&ctx->queue_allocator, *queue_id);
|
||||
ASSERT(tmperr == PVA_SUCCESS);
|
||||
|
||||
*queue_id = PVA_INVALID_QUEUE_ID;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_queue_destroy(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_queue_destroy_in_args *in_args)
|
||||
{
|
||||
struct pva_kmd_queue *queue;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
/*
|
||||
* TODO :
|
||||
* Send command to FW to stop queue usage. Wait for ack.
|
||||
* This call needs to be added after syncpoint and ccq functions are ready.
|
||||
*/
|
||||
queue = pva_kmd_get_block(&ctx->queue_allocator, in_args->queue_id);
|
||||
if (queue == NULL) {
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
err = notify_fw_queue_deinit(ctx, queue);
|
||||
if (err != PVA_SUCCESS) {
|
||||
return err;
|
||||
}
|
||||
|
||||
pva_kmd_device_memory_iova_unmap(queue->queue_memory);
|
||||
|
||||
pva_kmd_device_memory_free(queue->queue_memory);
|
||||
|
||||
pva_kmd_queue_deinit(queue);
|
||||
|
||||
err = pva_kmd_free_block(&ctx->queue_allocator, in_args->queue_id);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
48
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_queue.h
Normal file
@@ -0,0 +1,48 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_QUEUE_H
|
||||
#define PVA_KMD_QUEUE_H
|
||||
#include "pva_fw.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_mutex.h"
|
||||
|
||||
struct pva_kmd_queue {
|
||||
struct pva_kmd_device *pva;
|
||||
struct pva_kmd_device_memory *queue_memory;
|
||||
struct pva_fw_submit_queue_header *queue_header;
|
||||
pva_kmd_mutex_t *ccq_lock;
|
||||
uint8_t ccq_id;
|
||||
uint8_t queue_id;
|
||||
uint32_t max_num_submit;
|
||||
};
|
||||
|
||||
void pva_kmd_queue_init(struct pva_kmd_queue *queue, struct pva_kmd_device *pva,
|
||||
uint8_t ccq_id, uint8_t queue_id,
|
||||
pva_kmd_mutex_t *ccq_lock,
|
||||
struct pva_kmd_device_memory *queue_memory,
|
||||
uint32_t max_num_submit);
|
||||
enum pva_error
|
||||
pva_kmd_queue_create(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_queue_create_in_args *in_args,
|
||||
uint32_t *queue_id);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_queue_destroy(struct pva_kmd_context *ctx,
|
||||
struct pva_kmd_queue_destroy_in_args *in_args);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_queue_submit(struct pva_kmd_queue *queue,
|
||||
struct pva_fw_cmdbuf_submit_info const *submit_info);
|
||||
uint32_t pva_kmd_queue_space(struct pva_kmd_queue *queue);
|
||||
void pva_kmd_queue_deinit(struct pva_kmd_queue *queue);
|
||||
|
||||
#endif // PVA_KMD_QUEUE_H
|
||||
135
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_regs.h
Normal file
@@ -0,0 +1,135 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_REGS_H
|
||||
#define PVA_KMD_REGS_H
|
||||
|
||||
#include "pva_api.h"
|
||||
#include "pva_constants.h"
|
||||
|
||||
/* Exception vectors */
|
||||
#define PVA_REG_EVP_RESET_ADDR 0x20
|
||||
#define PVA_REG_EVP_UNDEF_ADDR 0x24
|
||||
#define PVA_REG_EVP_SWI_ADDR 0x28
|
||||
#define PVA_REG_EVP_PREFETCH_ABORT_ADDR 0x2c
|
||||
#define PVA_REG_EVP_DATA_ABORT_ADDR 0x30
|
||||
#define PVA_REG_EVP_RSVD_ADDR 0x34
|
||||
#define PVA_REG_EVP_IRQ_ADDR 0x38
|
||||
#define PVA_REG_EVP_FIQ_ADDR 0x3c
|
||||
|
||||
/* R5 */
|
||||
#define PVA_REG_PROC_CPUHALT_ADDR 0x30000
|
||||
|
||||
/* SCRs */
|
||||
#define PVA_SEC_SCR_SECEXT_INTR_EVENT 0x28804
|
||||
#define PVA_PROC_SCR_PROC 0x30800
|
||||
|
||||
#define PVA_REG_EVP_SCR_ADDR 0x40 //PVA_EVP_SCR_EVP_0
|
||||
#define PVA_CFG_SCR_STATUS_CNTL 0x258000 //PVA_CFG_SCR_STATUS_CNTL_0
|
||||
#define PVA_CFG_SCR_PRIV 0x258008 //PVA_CFG_SCR_PRIV_0
|
||||
#define PVA_CFG_SCR_CCQ_CNTL 0x258010 //PVA_CFG_SCR_CCQ_CNTL_0
|
||||
|
||||
/* HSP */
|
||||
#define PVA_REG_HSP_COMMON_ADDR 0x160000
|
||||
#define PVA_REG_HSP_INT_IE0_ADDR 0x160100
|
||||
#define PVA_REG_HSP_INT_IE1_ADDR 0x160104
|
||||
#define PVA_REG_HSP_INT_IE2_ADDR 0x160108
|
||||
#define PVA_REG_HSP_INT_IE3_ADDR 0x16010c
|
||||
#define PVA_REG_HSP_INT_IE4_ADDR 0x160110
|
||||
#define PVA_REG_HSP_INT_EXTERNAL_ADDR 0x160300
|
||||
#define PVA_REG_HSP_INT_INTERNAL_ADDR 0x160304
|
||||
#define PVA_REG_HSP_SM0_ADDR 0x170000
|
||||
#define PVA_REG_HSP_SM1_ADDR 0x178000
|
||||
#define PVA_REG_HSP_SM2_ADDR 0x180000
|
||||
#define PVA_REG_HSP_SM3_ADDR 0x188000
|
||||
#define PVA_REG_HSP_SM4_ADDR 0x190000
|
||||
#define PVA_REG_HSP_SM5_ADDR 0x198000
|
||||
#define PVA_REG_HSP_SM6_ADDR 0x1a0000
|
||||
#define PVA_REG_HSP_SM7_ADDR 0x1a8000
|
||||
#define PVA_REG_HSP_SS0_STATE_ADDR 0x1b0000
|
||||
#define PVA_REG_HSP_SS0_SET_ADDR 0x1b0004
|
||||
#define PVA_REG_HSP_SS0_CLR_ADDR 0x1b0008
|
||||
#define PVA_REG_HSP_SS1_STATE_ADDR 0x1c0000
|
||||
#define PVA_REG_HSP_SS1_SET_ADDR 0x1c0004
|
||||
#define PVA_REG_HSP_SS1_CLR_ADDR 0x1c0008
|
||||
#define PVA_REG_HSP_SS2_STATE_ADDR 0x1d0000
|
||||
#define PVA_REG_HSP_SS2_SET_ADDR 0x1d0004
|
||||
#define PVA_REG_HSP_SS2_CLR_ADDR 0x1d0008
|
||||
#define PVA_REG_HSP_SS3_STATE_ADDR 0x1e0000
|
||||
#define PVA_REG_HSP_SS3_SET_ADDR 0x1e0004
|
||||
#define PVA_REG_HSP_SS3_CLR_ADDR 0x1e0008
|
||||
|
||||
/* SEC */
|
||||
#define PVA_REG_SEC_ERRSLICE0_MISSIONERR_ENABLE_ADDR 0x20030
|
||||
#define PVA_REG_SEC_ERRSLICE1_MISSIONERR_ENABLE_ADDR 0x20060
|
||||
#define PVA_REG_SEC_ERRSLICE2_MISSIONERR_ENABLE_ADDR 0x20090
|
||||
#define PVA_REG_SEC_ERRSLICE3_MISSIONERR_ENABLE_ADDR 0x200c0
|
||||
#define PVA_REG_SEC_ERRSLICE0_LATENTERR_ENABLE_ADDR 0x20040
|
||||
#define PVA_REG_SEC_ERRSLICE1_LATENTERR_ENABLE_ADDR 0x20070
|
||||
#define PVA_REG_SEC_ERRSLICE2_LATENTERR_ENABLE_ADDR 0x200a0
|
||||
#define PVA_REG_SEC_ERRSLICE3_LATENTERR_ENABLE_ADDR 0x200d0
|
||||
|
||||
/* SEC_LIC_INTR_STATUS */
|
||||
#define PVA_REG_SEC_LIC_INTR_H1X_MSB 7
|
||||
#define PVA_REG_SEC_LIC_INTR_H1X_LSB 5
|
||||
#define PVA_REG_SEC_LIC_INTR_HSP_MSB 4
|
||||
#define PVA_REG_SEC_LIC_INTR_HSP_LSB 1
|
||||
#define PVA_REG_SEC_LIC_INTR_WDT_MSB 0
|
||||
#define PVA_REG_SEC_LIC_INTR_WDT_LSB 0
|
||||
|
||||
/* CCQ status 2 */
|
||||
#define PVA_REG_CCQ_STATUS2_INTR_OVERFLOW_BIT PVA_BIT(28)
|
||||
#define PVA_REG_CCQ_STATUS2_INTR_STATUS8_BIT PVA_BIT(24)
|
||||
#define PVA_REG_CCQ_STATUS2_INTR_STATUS7_BIT PVA_BIT(20)
|
||||
#define PVA_REG_CCQ_STATUS2_INTR_ALL_BITS \
|
||||
(PVA_REG_CCQ_STATUS2_INTR_OVERFLOW_BIT | \
|
||||
PVA_REG_CCQ_STATUS2_INTR_STATUS8_BIT | \
|
||||
PVA_REG_CCQ_STATUS2_INTR_STATUS7_BIT)
|
||||
#define PVA_REG_CCQ_STATUS2_NUM_ENTRIES_MSB 4
|
||||
#define PVA_REG_CCQ_STATUS2_NUM_ENTRIES_LSB 0
|
||||
|
||||
struct pva_kmd_ccq_regspec {
|
||||
uint32_t status_count;
|
||||
uint32_t status[PVA_CFG_CCQ_STATUS_COUNT];
|
||||
uint32_t fifo;
|
||||
};
|
||||
|
||||
struct pva_kmd_regspec {
|
||||
uint32_t sec_lic_intr_enable;
|
||||
uint32_t sec_lic_intr_status;
|
||||
uint32_t cfg_r5user_lsegreg;
|
||||
uint32_t cfg_r5user_usegreg;
|
||||
uint32_t cfg_priv_ar1_lsegreg;
|
||||
uint32_t cfg_priv_ar1_usegreg;
|
||||
uint32_t cfg_priv_ar2_lsegreg;
|
||||
uint32_t cfg_priv_ar2_usegreg;
|
||||
uint32_t cfg_priv_ar1_start;
|
||||
uint32_t cfg_priv_ar1_end;
|
||||
uint32_t cfg_priv_ar2_start;
|
||||
uint32_t cfg_priv_ar2_end;
|
||||
uint32_t cfg_user_sid_base;
|
||||
uint32_t cfg_priv_sid;
|
||||
uint32_t cfg_vps_sid;
|
||||
uint32_t cfg_perf_mon;
|
||||
|
||||
uint32_t cfg_scr_priv_0;
|
||||
|
||||
uint32_t ccq_count;
|
||||
uint32_t vpu_dbg_instr_reg_offset[PVA_NUM_ENGINES];
|
||||
struct pva_kmd_ccq_regspec ccq_regs[PVA_MAX_NUM_CCQ];
|
||||
};
|
||||
|
||||
enum pva_kmd_reg_aperture {
|
||||
PVA_KMD_APERTURE_PVA_CLUSTER = 0,
|
||||
PVA_KMD_APERTURE_VPU_DEBUG,
|
||||
PVA_KMD_APERTURE_COUNT,
|
||||
};
|
||||
|
||||
#endif // PVA_KMD_REGS_H
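/*
 * Illustrative sketch, not part of this change: extracting the HSP field of
 * SEC_LIC_INTR_STATUS using the MSB/LSB definitions above.  The open-coded
 * helper below (valid for fields narrower than 32 bits) is an example only;
 * the driver's ISR may use a shared PVA bit-field macro instead.
 *
 *   hsp = example_extract(status, PVA_REG_SEC_LIC_INTR_HSP_MSB,
 *                         PVA_REG_SEC_LIC_INTR_HSP_LSB);
 */
static inline uint32_t example_extract(uint32_t reg, unsigned int msb,
				       unsigned int lsb)
{
	return (reg >> lsb) & ((1U << (msb - lsb + 1U)) - 1U);
}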
|
||||
@@ -0,0 +1,477 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_resource_table.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
|
||||
static uint32_t get_max_dma_config_size(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint32_t max_num_dyn_slots = PVA_DMA_MAX_NUM_SLOTS;
|
||||
uint32_t max_num_reloc_infos =
|
||||
safe_pow2_roundup_u32(max_num_dyn_slots, 2U);
|
||||
|
||||
uint32_t max_dma_cfg_size =
|
||||
(uint32_t)sizeof(struct pva_dma_config_resource);
|
||||
|
||||
max_dma_cfg_size = safe_addu32(
|
||||
max_dma_cfg_size,
|
||||
safe_mulu32(max_num_dyn_slots,
|
||||
(uint32_t)sizeof(struct pva_fw_dma_slot)));
|
||||
|
||||
max_dma_cfg_size = safe_addu32(
|
||||
max_dma_cfg_size,
|
||||
safe_mulu32(max_num_reloc_infos,
|
||||
(uint32_t)sizeof(struct pva_fw_dma_reloc)));
|
||||
|
||||
max_dma_cfg_size = safe_addu32(
|
||||
max_dma_cfg_size,
|
||||
safe_mulu32(pva->hw_consts.n_user_dma_channels,
|
||||
(uint32_t)sizeof(struct pva_dma_channel)));
|
||||
|
||||
max_dma_cfg_size = safe_addu32(
|
||||
max_dma_cfg_size,
|
||||
safe_mulu32(pva->hw_consts.n_dma_descriptors,
|
||||
(uint32_t)sizeof(struct pva_dma_descriptor)));
|
||||
|
||||
max_dma_cfg_size = safe_addu32(max_dma_cfg_size,
|
||||
safe_mulu32(pva->hw_consts.n_hwseq_words,
|
||||
(uint32_t)sizeof(uint32_t)));
|
||||
|
||||
/* Must be 8-byte aligned so consecutive per-config blocks can be packed as an array */
|
||||
return safe_pow2_roundup_u32(max_dma_cfg_size,
|
||||
(uint32_t)sizeof(uint64_t));
|
||||
}
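/*
 * Illustrative note, not part of this change: the bound computed above is
 *
 *   sizeof(struct pva_dma_config_resource)
 *     + PVA_DMA_MAX_NUM_SLOTS             * sizeof(struct pva_fw_dma_slot)
 *     + roundup(PVA_DMA_MAX_NUM_SLOTS, 2) * sizeof(struct pva_fw_dma_reloc)
 *     + hw_consts.n_user_dma_channels     * sizeof(struct pva_dma_channel)
 *     + hw_consts.n_dma_descriptors       * sizeof(struct pva_dma_descriptor)
 *     + hw_consts.n_hwseq_words           * sizeof(uint32_t)
 *
 * rounded up to a multiple of 8 so that per-config blocks stay aligned when
 * packed back to back in dma_config_mem.  Reading safe_pow2_roundup_u32(x, n)
 * as "round x up to a multiple of n" is an assumption based on its use here.
 */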
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
|
||||
struct pva_kmd_device *pva,
|
||||
uint8_t user_smmu_ctx_id, uint32_t n_entries,
|
||||
uint32_t max_num_dma_configs)
|
||||
{
|
||||
uint32_t max_dma_config_size = get_max_dma_config_size(pva);
|
||||
enum pva_error err;
|
||||
uint64_t size;
|
||||
|
||||
res_table->pva = pva;
|
||||
res_table->n_entries = n_entries;
|
||||
res_table->user_smmu_ctx_id = user_smmu_ctx_id;
|
||||
|
||||
size = (uint64_t)safe_mulu32(
|
||||
n_entries, (uint32_t)sizeof(struct pva_resource_entry));
|
||||
res_table->table_mem = pva_kmd_device_memory_alloc_map(
|
||||
size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
ASSERT(res_table->table_mem != NULL);
|
||||
|
||||
pva_kmd_sema_init(&res_table->resource_semaphore, n_entries);
|
||||
|
||||
size = (uint64_t)safe_mulu32(sizeof(struct pva_kmd_resource_record),
|
||||
n_entries);
|
||||
res_table->records_mem = pva_kmd_zalloc(size);
|
||||
|
||||
ASSERT(res_table->records_mem != NULL);
|
||||
|
||||
err = pva_kmd_block_allocator_init(
|
||||
&res_table->resource_record_allocator, res_table->records_mem,
|
||||
PVA_RESOURCE_ID_BASE, sizeof(struct pva_kmd_resource_record),
|
||||
n_entries);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
size = (uint64_t)safe_mulu32(max_num_dma_configs, max_dma_config_size);
|
||||
res_table->dma_config_mem = pva_kmd_device_memory_alloc_map(
|
||||
size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
ASSERT(res_table->dma_config_mem != NULL);
|
||||
|
||||
err = pva_kmd_block_allocator_init(&res_table->dma_config_allocator,
|
||||
res_table->dma_config_mem->va, 0,
|
||||
max_dma_config_size,
|
||||
max_num_dma_configs);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
res_table->dma_aux = pva_kmd_zalloc(
|
||||
safe_mulu32((uint32_t)sizeof(struct pva_kmd_dma_resource_aux),
|
||||
max_num_dma_configs));
|
||||
ASSERT(res_table->dma_aux != NULL);
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table)
|
||||
{
|
||||
pva_kmd_free(res_table->dma_aux);
|
||||
pva_kmd_block_allocator_deinit(&res_table->dma_config_allocator);
|
||||
pva_kmd_device_memory_free(res_table->dma_config_mem);
|
||||
pva_kmd_block_allocator_deinit(&res_table->resource_record_allocator);
|
||||
pva_kmd_free(res_table->records_mem);
|
||||
pva_kmd_sema_deinit(&res_table->resource_semaphore);
|
||||
pva_kmd_device_memory_free(res_table->table_mem);
|
||||
}
|
||||
|
||||
static struct pva_kmd_resource_record *
|
||||
pva_kmd_alloc_resource(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t *out_resource_id)
|
||||
{
|
||||
enum pva_error err;
|
||||
struct pva_kmd_resource_record *rec = NULL;
|
||||
|
||||
err = pva_kmd_sema_wait_timeout(&resource_table->resource_semaphore,
|
||||
PVA_KMD_TIMEOUT_RESOURCE_SEMA_MS);
|
||||
if (err == PVA_TIMEDOUT) {
|
||||
pva_kmd_log_err("pva_kmd_alloc_resource Timed out");
|
||||
}
|
||||
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Failed to wait for resource IDs");
|
||||
goto out;
|
||||
}
|
||||
|
||||
rec = (struct pva_kmd_resource_record *)pva_kmd_alloc_block(
|
||||
&resource_table->resource_record_allocator, out_resource_id);
|
||||
ASSERT(rec != NULL);
|
||||
|
||||
out:
|
||||
return rec;
|
||||
}
|
||||
|
||||
static void pva_kmd_free_resource(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id)
|
||||
{
|
||||
enum pva_error err;
|
||||
|
||||
err = pva_kmd_free_block(&resource_table->resource_record_allocator,
|
||||
resource_id);
|
||||
ASSERT(err == PVA_SUCCESS);
|
||||
|
||||
pva_kmd_sema_post(&resource_table->resource_semaphore);
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_syncpt_resource(struct pva_kmd_resource_table *resource_table,
|
||||
struct pva_kmd_device_memory *dev_mem,
|
||||
uint32_t *out_resource_id)
|
||||
{
|
||||
struct pva_kmd_resource_record *rec =
|
||||
pva_kmd_alloc_resource(resource_table, out_resource_id);
|
||||
|
||||
if (rec == NULL) {
|
||||
pva_kmd_log_err("No more resource id");
|
||||
return PVA_NO_RESOURCE_ID;
|
||||
}
|
||||
|
||||
if (*out_resource_id > resource_table->curr_max_resource_id) {
|
||||
resource_table->curr_max_resource_id = *out_resource_id;
|
||||
}
|
||||
|
||||
rec->type = PVA_RESOURCE_TYPE_DRAM;
|
||||
rec->dram.mem = dev_mem;
|
||||
rec->dram.syncpt = true;
|
||||
rec->ref_count = 1;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
|
||||
struct pva_kmd_device_memory *dev_mem,
|
||||
uint32_t *out_resource_id)
|
||||
{
|
||||
struct pva_kmd_resource_record *rec =
|
||||
pva_kmd_alloc_resource(resource_table, out_resource_id);
|
||||
|
||||
if (rec == NULL) {
|
||||
pva_kmd_log_err("No more resource id");
|
||||
return PVA_NO_RESOURCE_ID;
|
||||
}
|
||||
|
||||
if (*out_resource_id > resource_table->curr_max_resource_id) {
|
||||
resource_table->curr_max_resource_id = *out_resource_id;
|
||||
}
|
||||
|
||||
rec->type = PVA_RESOURCE_TYPE_DRAM;
|
||||
rec->dram.mem = dev_mem;
|
||||
rec->dram.syncpt = false;
|
||||
rec->ref_count = 1;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
static struct pva_resource_entry *
|
||||
get_fw_resource(struct pva_kmd_resource_table *res_table, uint32_t resource_id)
|
||||
{
|
||||
struct pva_resource_entry *entries = res_table->table_mem->va;
|
||||
uint32_t index;
|
||||
|
||||
ASSERT(resource_id >= PVA_RESOURCE_ID_BASE);
|
||||
index = safe_subu32(resource_id, PVA_RESOURCE_ID_BASE);
|
||||
return &entries[index];
|
||||
}
|
||||
|
||||
void pva_kmd_update_fw_resource_table(struct pva_kmd_resource_table *res_table)
|
||||
{
|
||||
uint32_t id;
|
||||
|
||||
for (id = PVA_RESOURCE_ID_BASE; id <= res_table->curr_max_resource_id;
|
||||
id++) {
|
||||
struct pva_resource_entry *entry =
|
||||
get_fw_resource(res_table, id);
|
||||
struct pva_kmd_resource_record *rec = pva_kmd_get_block(
|
||||
&res_table->resource_record_allocator, id);
|
||||
if (rec == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
entry->type = rec->type;
|
||||
switch (rec->type) {
|
||||
case PVA_RESOURCE_TYPE_DRAM:
|
||||
entry->addr_lo = iova_lo(rec->dram.mem->iova);
|
||||
entry->addr_hi = iova_hi(rec->dram.mem->iova);
|
||||
entry->size_lo = iova_lo(rec->dram.mem->size);
|
||||
entry->size_hi = iova_hi(rec->dram.mem->size);
|
||||
entry->smmu_context_id = rec->dram.mem->smmu_ctx_idx;
|
||||
break;
|
||||
case PVA_RESOURCE_TYPE_INVALID:
|
||||
break;
|
||||
default:
|
||||
pva_kmd_log_err("Unsupported resource type");
|
||||
pva_kmd_fault();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct pva_kmd_resource_record *
|
||||
pva_kmd_use_resource(struct pva_kmd_resource_table *res_table,
|
||||
uint32_t resource_id)
|
||||
{
|
||||
struct pva_kmd_resource_record *rec = pva_kmd_get_block(
|
||||
&res_table->resource_record_allocator, resource_id);
|
||||
|
||||
if (rec == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rec->ref_count = safe_addu32(rec->ref_count, 1U);
|
||||
return rec;
|
||||
}
|
||||
|
||||
struct pva_kmd_resource_record *
|
||||
pva_kmd_peek_resource(struct pva_kmd_resource_table *res_table,
|
||||
uint32_t resource_id)
|
||||
{
|
||||
struct pva_kmd_resource_record *rec = pva_kmd_get_block(
|
||||
&res_table->resource_record_allocator, resource_id);
|
||||
|
||||
return rec;
|
||||
}
|
||||
|
||||
void pva_kmd_drop_resource(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id)
|
||||
{
|
||||
struct pva_kmd_resource_record *rec;
|
||||
|
||||
rec = pva_kmd_get_block(&resource_table->resource_record_allocator,
|
||||
resource_id);
|
||||
|
||||
ASSERT(rec != NULL);
|
||||
|
||||
rec->ref_count = safe_subu32(rec->ref_count, 1U);
|
||||
if (rec->ref_count == 0) {
|
||||
pva_dbg_printf("Dropping resource %u of type %u\n", resource_id,
|
||||
rec->type);
|
||||
switch (rec->type) {
|
||||
case PVA_RESOURCE_TYPE_DRAM:
|
||||
if (rec->dram.syncpt != true) {
|
||||
pva_kmd_device_memory_free(rec->dram.mem);
|
||||
}
|
||||
break;
|
||||
case PVA_RESOURCE_TYPE_EXEC_BIN:
|
||||
pva_kmd_unload_executable(&rec->vpu_bin.symbol_table,
|
||||
rec->vpu_bin.metainfo_mem,
|
||||
rec->vpu_bin.sections_mem);
|
||||
break;
|
||||
case PVA_RESOURCE_TYPE_DMA_CONFIG: {
|
||||
struct pva_kmd_dma_resource_aux *dma_aux;
|
||||
dma_aux =
|
||||
&resource_table
|
||||
->dma_aux[rec->dma_config.block_index];
|
||||
pva_kmd_unload_dma_config(dma_aux);
|
||||
pva_kmd_free_block(
|
||||
&resource_table->dma_config_allocator,
|
||||
rec->dma_config.block_index);
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
pva_kmd_log_err("Unsupported resource type");
|
||||
pva_kmd_fault();
|
||||
}
|
||||
|
||||
pva_kmd_free_resource(resource_table, resource_id);
|
||||
}
|
||||
}
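/*
 * Illustrative sketch, not part of this change: the intended pairing of
 * pva_kmd_use_resource() and pva_kmd_drop_resource().  Every successful
 * use_resource() must be balanced by exactly one drop_resource(); once the
 * reference count reaches zero the record's backing memory is released as in
 * the function above.  The helper name is an example only.
 */
static void example_touch_resource(struct pva_kmd_resource_table *tbl,
				   uint32_t resource_id)
{
	struct pva_kmd_resource_record *rec;

	rec = pva_kmd_use_resource(tbl, resource_id);
	if (rec == NULL)
		return;

	/* ... read fields of *rec while the extra reference is held ... */

	pva_kmd_drop_resource(tbl, resource_id);
}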
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_vpu_bin_resource(struct pva_kmd_resource_table *resource_table,
|
||||
void *executable, uint32_t executable_size,
|
||||
uint32_t *out_resource_id)
|
||||
{
|
||||
uint32_t res_id;
|
||||
struct pva_kmd_resource_record *rec =
|
||||
pva_kmd_alloc_resource(resource_table, &res_id);
|
||||
enum pva_error err;
|
||||
struct pva_kmd_vpu_bin_resource *vpu_bin;
|
||||
|
||||
if (rec == NULL) {
|
||||
err = PVA_NO_RESOURCE_ID;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
vpu_bin = &rec->vpu_bin;
|
||||
err = pva_kmd_load_executable(
|
||||
executable, executable_size, resource_table->pva,
|
||||
resource_table->user_smmu_ctx_id, &vpu_bin->symbol_table,
|
||||
&vpu_bin->metainfo_mem, &vpu_bin->sections_mem);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_block;
|
||||
}
|
||||
|
||||
if (res_id > resource_table->curr_max_resource_id) {
|
||||
resource_table->curr_max_resource_id = res_id;
|
||||
}
|
||||
|
||||
rec->type = PVA_RESOURCE_TYPE_EXEC_BIN;
|
||||
rec->ref_count = 1;
|
||||
*out_resource_id = res_id;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
free_block:
|
||||
pva_kmd_free_resource(resource_table, res_id);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id,
|
||||
struct pva_resource_entry *entry)
|
||||
{
|
||||
struct pva_kmd_resource_record *rec =
|
||||
pva_kmd_use_resource(resource_table, resource_id);
|
||||
if (rec == NULL) {
|
||||
return PVA_NO_RESOURCE_ID;
|
||||
}
|
||||
|
||||
switch (rec->type) {
|
||||
case PVA_RESOURCE_TYPE_DRAM:
|
||||
entry->type = rec->type;
|
||||
entry->addr_lo = iova_lo(rec->dram.mem->iova);
|
||||
entry->addr_hi = iova_hi(rec->dram.mem->iova);
|
||||
entry->size_lo = iova_lo(rec->dram.mem->size);
|
||||
entry->size_hi = iova_hi(rec->dram.mem->size);
|
||||
entry->smmu_context_id = rec->dram.mem->smmu_ctx_idx;
|
||||
break;
|
||||
case PVA_RESOURCE_TYPE_EXEC_BIN:
|
||||
entry->type = rec->type;
|
||||
entry->addr_lo = iova_lo(rec->vpu_bin.metainfo_mem->iova);
|
||||
entry->addr_hi = iova_hi(rec->vpu_bin.metainfo_mem->iova);
|
||||
entry->size_lo = iova_lo(rec->vpu_bin.metainfo_mem->size);
|
||||
entry->size_hi = iova_hi(rec->vpu_bin.metainfo_mem->size);
|
||||
entry->smmu_context_id =
|
||||
rec->vpu_bin.metainfo_mem->smmu_ctx_idx;
|
||||
break;
|
||||
case PVA_RESOURCE_TYPE_DMA_CONFIG:
|
||||
entry->type = rec->type;
|
||||
entry->addr_lo = iova_lo(rec->dma_config.iova_addr);
|
||||
entry->addr_hi = iova_hi(rec->dma_config.iova_addr);
|
||||
entry->size_lo = iova_lo(rec->dma_config.size);
|
||||
entry->size_hi = iova_hi(rec->dma_config.size);
|
||||
entry->smmu_context_id = PVA_R5_SMMU_CONTEXT_ID;
|
||||
break;
|
||||
default:
|
||||
pva_kmd_log_err("Unsupported resource type");
|
||||
pva_kmd_fault();
|
||||
}
|
||||
|
||||
pva_kmd_drop_resource(resource_table, resource_id);
|
||||
return PVA_SUCCESS;
|
||||
}

enum pva_error pva_kmd_add_dma_config_resource(
	struct pva_kmd_resource_table *resource_table, void *dma_config_payload,
	uint32_t dma_config_size, uint32_t *out_resource_id)
{
	enum pva_error err = PVA_SUCCESS;
	uint32_t block_idx, fw_fetch_size;
	void *fw_dma_cfg;
	struct pva_kmd_dma_resource_aux *dma_aux;
	struct pva_kmd_resource_record *rec;
	uint32_t res_id;

	fw_dma_cfg = pva_kmd_zalloc_block(&resource_table->dma_config_allocator,
					  &block_idx);
	if (fw_dma_cfg == NULL) {
		err = PVA_NOMEM;
		goto err_out;
	}

	// Must satisfy alignment requirement for converting to struct
	// pva_dma_config_resource*
	ASSERT(((uintptr_t)fw_dma_cfg) % sizeof(uint64_t) == 0);

	dma_aux = &resource_table->dma_aux[block_idx];

	err = pva_kmd_load_dma_config(resource_table, dma_config_payload,
				      dma_config_size, dma_aux, fw_dma_cfg,
				      &fw_fetch_size);
	if (err != PVA_SUCCESS) {
		goto free_block;
	}

	rec = pva_kmd_alloc_resource(resource_table, &res_id);
	if (rec == NULL) {
		err = PVA_NO_RESOURCE_ID;
		goto unload_dma;
	}

	if (res_id > resource_table->curr_max_resource_id) {
		resource_table->curr_max_resource_id = res_id;
	}

	rec->type = PVA_RESOURCE_TYPE_DMA_CONFIG;
	rec->ref_count = 1;
	rec->dma_config.block_index = block_idx;
	rec->dma_config.iova_addr = safe_addu64(
		resource_table->dma_config_mem->iova,
		(uint64_t)safe_mulu32(
			block_idx,
			resource_table->dma_config_allocator.block_size));
	rec->dma_config.size = fw_fetch_size;

	*out_resource_id = res_id;

	return PVA_SUCCESS;
unload_dma:
	pva_kmd_unload_dma_config(dma_aux);
free_block:
	pva_kmd_free_block(&resource_table->dma_config_allocator, block_idx);
err_out:
	return err;
}
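
/*
 * Illustrative note (hypothetical numbers): the firmware-visible address of
 * a DMA configuration follows from its block index. With a block size of
 * 0x400 bytes and block_idx == 3, iova_addr evaluates to
 * dma_config_mem->iova + 3 * 0x400 = dma_config_mem->iova + 0xC00.
 */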

void pva_kmd_verify_all_resources_free(
	struct pva_kmd_resource_table *resource_table)
{
	enum pva_error err;
	for (uint32_t i = 0; i < resource_table->n_entries; i++) {
		err = pva_kmd_sema_wait_timeout(
			&resource_table->resource_semaphore,
			PVA_KMD_TIMEOUT_RESOURCE_SEMA_MS);
		ASSERT(err == PVA_SUCCESS);
	}
}
|
||||
@@ -0,0 +1,153 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_RESOURCE_TABLE_H
|
||||
#define PVA_KMD_RESOURCE_TABLE_H
|
||||
#include "pva_fw.h"
|
||||
#include "pva_bit.h"
|
||||
#include "pva_resource.h"
|
||||
#include "pva_kmd_block_allocator.h"
|
||||
#include "pva_kmd.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_kmd_executable.h"
|
||||
#include "pva_constants.h"
|
||||
#include "pva_kmd_dma_cfg.h"
|
||||
#include "pva_kmd_mutex.h"
|
||||
#include "pva_kmd_thread_sema.h"
|
||||
|
||||
struct pva_kmd_device;
|
||||
|
||||
struct pva_kmd_dram_resource {
|
||||
struct pva_kmd_device_memory *mem;
|
||||
bool syncpt;
|
||||
};
|
||||
|
||||
struct pva_kmd_vpu_bin_resource {
|
||||
struct pva_kmd_device_memory *metainfo_mem;
|
||||
struct pva_kmd_device_memory *sections_mem;
|
||||
struct pva_kmd_exec_symbol_table symbol_table;
|
||||
};
|
||||
|
||||
struct pva_kmd_dma_config_resource {
|
||||
uint32_t block_index;
|
||||
uint64_t size;
|
||||
uint64_t iova_addr;
|
||||
};
|
||||
|
||||
struct pva_kmd_resource_record {
|
||||
/**
|
||||
* Possible types:
|
||||
* PVA_RESOURCE_TYPE_DRAM
|
||||
* PVA_RESOURCE_TYPE_EXEC_BIN
|
||||
* PVA_RESOURCE_TYPE_DMA_CONFIG
|
||||
*/
|
||||
uint8_t type;
|
||||
uint32_t ref_count;
|
||||
union {
|
||||
struct pva_kmd_dram_resource dram;
|
||||
struct pva_kmd_vpu_bin_resource vpu_bin;
|
||||
struct pva_kmd_dma_config_resource dma_config;
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Table tracking the resources (DRAM buffers, VPU executable binaries and
 * DMA configurations) shared with PVA firmware.
 */
|
||||
struct pva_kmd_resource_table {
|
||||
/** @brief User smmu context ID.
|
||||
*
|
||||
* - DRAM memory, VPU data/text sections will be mapped to this space.
|
||||
* - VPU metadata, DMA configurations will always be mapped to R5 SMMU
|
||||
* context. */
|
||||
uint8_t user_smmu_ctx_id;
|
||||
uint32_t n_entries;
|
||||
/** Maximum resource ID we have seen so far */
|
||||
uint32_t curr_max_resource_id;
|
||||
|
||||
/** Semaphore to keep track of resources in use*/
|
||||
pva_kmd_sema_t resource_semaphore;
|
||||
|
||||
/** Memory for resource table entries, in R5 segment */
|
||||
struct pva_kmd_device_memory *table_mem;
|
||||
|
||||
/** Memory for fw dma configs, in DMA segment */
|
||||
struct pva_kmd_device_memory *dma_config_mem;
|
||||
struct pva_kmd_block_allocator dma_config_allocator;
|
||||
|
||||
/** Memory for tracking resources used by DMA configuration. Single
|
||||
* allocation shared by all DMA configs */
|
||||
struct pva_kmd_dma_resource_aux *dma_aux;
|
||||
|
||||
/** Pointer to syncpt_allocator in pva_kmd_device created during kmd boot */
|
||||
struct pva_kmd_block_allocator *syncpt_allocator;
|
||||
|
||||
/** Memory for resource records */
|
||||
void *records_mem;
|
||||
struct pva_kmd_block_allocator resource_record_allocator;
|
||||
struct pva_kmd_device *pva;
|
||||
};
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_resource_table_init(struct pva_kmd_resource_table *res_table,
|
||||
struct pva_kmd_device *pva,
|
||||
uint8_t user_smmu_ctx_id, uint32_t n_entries,
|
||||
uint32_t max_num_dma_configs);
|
||||
void pva_kmd_resource_table_deinit(struct pva_kmd_resource_table *res_table);
|
||||
|
||||
/** KMD only writes to FW resource table during init time. Once the address of
|
||||
* the resource table is sent to FW, all updates should be done through commands.
|
||||
*/
|
||||
void pva_kmd_update_fw_resource_table(struct pva_kmd_resource_table *res_table);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_syncpt_resource(struct pva_kmd_resource_table *resource_table,
|
||||
struct pva_kmd_device_memory *dev_mem,
|
||||
uint32_t *out_resource_id);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_dram_buffer_resource(struct pva_kmd_resource_table *resource_table,
|
||||
struct pva_kmd_device_memory *memory,
|
||||
uint32_t *out_resource_id);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_vpu_bin_resource(struct pva_kmd_resource_table *resource_table,
|
||||
void *executable, uint32_t executable_size,
|
||||
uint32_t *out_resource_id);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_add_dma_config_resource(struct pva_kmd_resource_table *resource_table,
|
||||
void *dma_config, uint32_t dma_config_size,
|
||||
uint32_t *out_resource_id);
|
||||
|
||||
/**
|
||||
* Increment reference count of the resources
|
||||
*
|
||||
* TODO: make use and drop thread safe.
|
||||
* */
|
||||
struct pva_kmd_resource_record *
|
||||
pva_kmd_use_resource(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id);
|
||||
|
||||
struct pva_kmd_resource_record *
|
||||
pva_kmd_peek_resource(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id);
|
||||
|
||||
void pva_kmd_drop_resource(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_make_resource_entry(struct pva_kmd_resource_table *resource_table,
|
||||
uint32_t resource_id,
|
||||
struct pva_resource_entry *entry);
|
||||
|
||||
void pva_kmd_verify_all_resources_free(
|
||||
struct pva_kmd_resource_table *resource_table);
|
||||
|
||||
#endif // PVA_KMD_RESOURCE_TABLE_H
|
||||
185
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_sha256.c
Normal file
185
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_sha256.c
Normal file
@@ -0,0 +1,185 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (c) 2021-2023, NVIDIA Corporation. All rights reserved.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_sha256.h"
|
||||
|
||||
#define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
|
||||
#define ROTRIGHT(a, b) (((a) >> (b)) | ((a) << (32 - (b))))
|
||||
|
||||
#define CH(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
|
||||
#define MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
#define SHA_EP0(x) (ROTRIGHT(x, 2) ^ ROTRIGHT(x, 13) ^ ROTRIGHT(x, 22))
|
||||
#define SHA_EP1(x) (ROTRIGHT(x, 6) ^ ROTRIGHT(x, 11) ^ ROTRIGHT(x, 25))
|
||||
#define SIG0(x) (ROTRIGHT(x, 7) ^ ROTRIGHT(x, 18) ^ ((x) >> 3))
|
||||
#define SIG1(x) (ROTRIGHT(x, 17) ^ ROTRIGHT(x, 19) ^ ((x) >> 10))
|
||||
|
||||
#define SWAP32(x) __builtin_bswap32(x)
|
||||
#define SWAP64(x) __builtin_bswap64(x)
|
||||
|
||||
/**
|
||||
* This variable is used internally by \ref sha256_transform()
|
||||
*/
|
||||
static const uint32_t k[64] = {
|
||||
U32(0x428a2f98U), U32(0x71374491U), U32(0xb5c0fbcfU), U32(0xe9b5dba5U),
|
||||
U32(0x3956c25bU), U32(0x59f111f1U), U32(0x923f82a4U), U32(0xab1c5ed5U),
|
||||
U32(0xd807aa98U), U32(0x12835b01U), U32(0x243185beU), U32(0x550c7dc3U),
|
||||
U32(0x72be5d74U), U32(0x80deb1feU), U32(0x9bdc06a7U), U32(0xc19bf174U),
|
||||
U32(0xe49b69c1U), U32(0xefbe4786U), U32(0x0fc19dc6U), U32(0x240ca1ccU),
|
||||
U32(0x2de92c6fU), U32(0x4a7484aaU), U32(0x5cb0a9dcU), U32(0x76f988daU),
|
||||
U32(0x983e5152U), U32(0xa831c66dU), U32(0xb00327c8U), U32(0xbf597fc7U),
|
||||
U32(0xc6e00bf3U), U32(0xd5a79147U), U32(0x06ca6351U), U32(0x14292967U),
|
||||
U32(0x27b70a85U), U32(0x2e1b2138U), U32(0x4d2c6dfcU), U32(0x53380d13U),
|
||||
U32(0x650a7354U), U32(0x766a0abbU), U32(0x81c2c92eU), U32(0x92722c85U),
|
||||
U32(0xa2bfe8a1U), U32(0xa81a664bU), U32(0xc24b8b70U), U32(0xc76c51a3U),
|
||||
U32(0xd192e819U), U32(0xd6990624U), U32(0xf40e3585U), U32(0x106aa070U),
|
||||
U32(0x19a4c116U), U32(0x1e376c08U), U32(0x2748774cU), U32(0x34b0bcb5U),
|
||||
U32(0x391c0cb3U), U32(0x4ed8aa4aU), U32(0x5b9cca4fU), U32(0x682e6ff3U),
|
||||
U32(0x748f82eeU), U32(0x78a5636fU), U32(0x84c87814U), U32(0x8cc70208U),
|
||||
U32(0x90befffaU), U32(0xa4506cebU), U32(0xbef9a3f7U), U32(0xc67178f2U)
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief
|
||||
 * This function is a helper function used by \ref sha256_update
 * to hash 512-bit blocks and forms the core of the algorithm.
 * Use \ref sha256_init(), \ref sha256_update(), and
 * \ref sha256_finalize() instead of calling sha256_transform() directly.
|
||||
* \param[in] ctx pointer of struct sha256_ctx context.
|
||||
* \param[in] data_in pointer to the data block to be hashed.
|
||||
* \return Void
|
||||
*/
|
||||
static void sha256_transform(struct sha256_ctx *ctx, const void *data_in)
|
||||
{
|
||||
uint32_t a, b, c, d, e, f, g, h, t1, t2, m[64];
|
||||
const uint32_t *const data = data_in;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < U32(16); i++) {
|
||||
m[i] = SWAP32(data[i]);
|
||||
}
|
||||
for (i = 0; i < U32(64) - U32(16); ++i) {
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
m[i + U32(16)] = SIG1(m[U32(14) + i]) + m[U32(9) + i] +
|
||||
SIG0(m[U32(1) + i]) + m[i];
|
||||
}
|
||||
|
||||
a = ctx->state[0];
|
||||
b = ctx->state[1];
|
||||
c = ctx->state[2];
|
||||
d = ctx->state[3];
|
||||
e = ctx->state[4];
|
||||
f = ctx->state[5];
|
||||
g = ctx->state[6];
|
||||
h = ctx->state[7];
|
||||
|
||||
for (i = 0; i < U32(64); ++i) {
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
t1 = h + SHA_EP1(e) + CH(e, f, g) + k[i] + m[i];
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
t2 = SHA_EP0(a) + MAJ(a, b, c);
|
||||
h = g;
|
||||
g = f;
|
||||
f = e;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
e = d + t1;
|
||||
d = c;
|
||||
c = b;
|
||||
b = a;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
a = t1 + t2;
|
||||
}
|
||||
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[0] += a;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[1] += b;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[2] += c;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[3] += d;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[4] += e;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[5] += f;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[6] += g;
|
||||
/* coverity[cert_int30_c_violation]; Deviation-MOD32_DEVIATION_ID */
|
||||
ctx->state[7] += h;
|
||||
}
|
||||
|
||||
void sha256_init(struct sha256_ctx *ctx)
|
||||
{
|
||||
ctx->bitlen = 0;
|
||||
ctx->state[0] = U32(0x6a09e667);
|
||||
ctx->state[1] = U32(0xbb67ae85);
|
||||
ctx->state[2] = U32(0x3c6ef372);
|
||||
ctx->state[3] = U32(0xa54ff53a);
|
||||
ctx->state[4] = U32(0x510e527f);
|
||||
ctx->state[5] = U32(0x9b05688c);
|
||||
ctx->state[6] = U32(0x1f83d9ab);
|
||||
ctx->state[7] = U32(0x5be0cd19);
|
||||
}
|
||||
|
||||
void sha256_update(struct sha256_ctx *ctx, const void *data, size_t len)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < len; i += U32(64)) {
|
||||
ctx->bitlen &= U32(0xffffffff);
|
||||
sha256_transform(ctx, ((const uint8_t *)data) + i);
|
||||
ctx->bitlen += U32(512);
|
||||
}
|
||||
}
|
||||
|
||||
void sha256_copy(const struct sha256_ctx *ctx_in, struct sha256_ctx *ctx_out)
|
||||
{
|
||||
*ctx_out = *ctx_in;
|
||||
}
|
||||
|
||||
void sha256_finalize(struct sha256_ctx *ctx, const void *input,
|
||||
size_t input_size, uint32_t out[8])
|
||||
{
|
||||
uint8_t data[64];
|
||||
void *p = data;
|
||||
uint32_t t;
|
||||
|
||||
input_size &= U32(0xffffffff);
|
||||
ctx->bitlen &= U32(0xffffffff);
|
||||
|
||||
	/* the API agreement guarantees this condition cannot be false */
	/* this check is here only for Coverity INT30-C */
|
||||
ctx->bitlen += input_size * U32(8);
|
||||
(void)memcpy(p, input, input_size);
|
||||
data[input_size] = 0x80;
|
||||
|
||||
if (input_size < U32(56)) { /* can we fit an 8-byte counter? */
|
||||
/* Pad whatever data is left in the buffer. */
|
||||
(void)memset(data + input_size + U32(1), 0,
|
||||
U32(56) - input_size - U32(1));
|
||||
} else { /* Go into another block. We are here only for message hashing */
|
||||
if (input_size + U32(1) < U32(64)) {
|
||||
(void)memset(data + input_size + U32(1), 0,
|
||||
U32(64) - input_size - U32(1));
|
||||
}
|
||||
sha256_transform(ctx, data);
|
||||
(void)memset(data, 0, 56);
|
||||
}
|
||||
|
||||
t = ctx->bitlen_low;
|
||||
|
||||
*(uint32_t *)(void *)(data + 56) = 0;
|
||||
*(uint32_t *)(void *)(data + 60) = SWAP32(t);
|
||||
|
||||
sha256_transform(ctx, data);
|
||||
|
||||
out[0] = SWAP32(ctx->state[0]);
|
||||
out[1] = SWAP32(ctx->state[1]);
|
||||
out[2] = SWAP32(ctx->state[2]);
|
||||
out[3] = SWAP32(ctx->state[3]);
|
||||
out[4] = SWAP32(ctx->state[4]);
|
||||
out[5] = SWAP32(ctx->state[5]);
|
||||
out[6] = SWAP32(ctx->state[6]);
|
||||
out[7] = SWAP32(ctx->state[7]);
|
||||
}
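
/*
 * Illustrative usage sketch (not part of the driver): hashing a buffer whose
 * length is not a multiple of 64 bytes. Full 64-byte blocks go through
 * sha256_update(); the remaining tail (fewer than 64 bytes) goes to
 * sha256_finalize(), matching its input_size < 64 requirement.
 *
 *   struct sha256_ctx ctx;
 *   uint32_t digest[8];
 *   size_t full = (len / 64U) * 64U;
 *
 *   sha256_init(&ctx);
 *   sha256_update(&ctx, buf, full);
 *   sha256_finalize(&ctx, (const uint8_t *)buf + full, len - full, digest);
 */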
|
||||
76
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_sha256.h
Normal file
76
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_sha256.h
Normal file
@@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2021-2023, NVIDIA Corporation. All rights reserved.
 */

#ifndef PVA_KMD_SHA256_H
#define PVA_KMD_SHA256_H

#include "pva_api_types.h"
#define U32(x) ((uint32_t)(x))

struct sha256_ctx {
	/*
	 * On bitlen:
	 *
	 * While we never exceed a 2^32-bit (2^29-byte) length for the input
	 * buffer, size_t is more efficient, at least on RISC-V. This
	 * particular structure is needed to make Coverity happy.
	 */
	union {
		size_t bitlen;
		uint32_t bitlen_low;
	};
	uint32_t state[8];
};

/**
 * Initializes struct sha256_ctx
 *
 * \param[in] ctx pointer of struct sha256_ctx context
 *
 * \return void
 */
void sha256_init(struct sha256_ctx *ctx);

/**
 * \brief
 * Hash full 64-byte blocks. Can be called repeatedly with chunks of the
 * message to be hashed (len bytes at data).
 *
 * \param[in] ctx pointer of struct sha256_ctx context
 * \param[in] data pointer to the data block to be hashed
 * \param[in] len length in bytes of the data to be hashed; must be a
 *                multiple of 64
 *
 * \return void
 */
void sha256_update(struct sha256_ctx *ctx, const void *data, size_t len);

/**
 * \brief
 * Finalize the hash and store the calculated hash in out.
 * Required: input_size < 64. Call sha256_update() first otherwise.
 *
 * \param[in] ctx pointer of struct sha256_ctx context
 * \param[in] input pointer to the data block
 *            (left over from \ref sha256_update) to be hashed
 * \param[in] input_size size of the data block to be hashed
 *            (left over from \ref sha256_update)
 * \param[out] out places the calculated SHA-256 digest in out
 *
 * \return void
 */
void sha256_finalize(struct sha256_ctx *ctx, const void *input,
		     size_t input_size, uint32_t out[8]);

/**
 * \brief
 * Copy state information to ctx_out from ctx_in
 * \param[in] ctx_in input struct sha256_ctx
 * \param[out] ctx_out output struct sha256_ctx
 * \return void
 */
void sha256_copy(const struct sha256_ctx *ctx_in, struct sha256_ctx *ctx_out);

#endif /* PVA_KMD_SHA256_H */
@@ -0,0 +1,317 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_fw_address_map.h"
|
||||
#include "pva_fw_hyp.h"
|
||||
#include "pva_kmd_thread_sema.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_kmd_silicon_isr.h"
|
||||
#include "pva_kmd_silicon_boot.h"
|
||||
#include "pva_kmd_shim_silicon.h"
|
||||
|
||||
static inline void pva_kmd_set_sema(struct pva_kmd_device *pva,
|
||||
uint32_t sema_idx, uint32_t val)
|
||||
{
|
||||
uint32_t gap = PVA_REG_HSP_SS1_SET_ADDR - PVA_REG_HSP_SS0_SET_ADDR;
|
||||
gap = safe_mulu32(gap, sema_idx);
|
||||
pva_kmd_write(pva, safe_addu32(PVA_REG_HSP_SS0_SET_ADDR, gap), val);
|
||||
}
|
||||
|
||||
static void init_fw_print_buffer(struct pva_kmd_fw_print_buffer *print_buffer,
|
||||
void *debug_buffer_va)
|
||||
{
|
||||
print_buffer->buffer_info = pva_offset_pointer(
|
||||
debug_buffer_va,
|
||||
FW_TRACE_BUFFER_SIZE + FW_CODE_COVERAGE_BUFFER_SIZE);
|
||||
print_buffer->size =
|
||||
FW_DEBUG_LOG_BUFFER_SIZE - sizeof(*print_buffer->buffer_info);
|
||||
print_buffer->head = 0;
|
||||
print_buffer->content = pva_offset_pointer(
|
||||
print_buffer->buffer_info, sizeof(*print_buffer->buffer_info));
|
||||
}
|
||||
|
||||
static void disable_sec_mission_error_reporting(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE0_MISSIONERR_ENABLE_ADDR, 0U);
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE1_MISSIONERR_ENABLE_ADDR, 0U);
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE2_MISSIONERR_ENABLE_ADDR, 0U);
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE3_MISSIONERR_ENABLE_ADDR, 0U);
|
||||
}
|
||||
|
||||
static void disable_sec_latent_error_reporting(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE0_LATENTERR_ENABLE_ADDR, 0U);
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE1_LATENTERR_ENABLE_ADDR, 0U);
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE2_LATENTERR_ENABLE_ADDR, 0U);
|
||||
pva_kmd_write(pva, PVA_REG_SEC_ERRSLICE3_LATENTERR_ENABLE_ADDR, 0U);
|
||||
}
|
||||
|
||||
void pva_kmd_config_evp_seg_regs(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint64_t seg_reg_value;
|
||||
/* EVP */
|
||||
pva_kmd_write(pva, PVA_REG_EVP_RESET_ADDR, EVP_RESET_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_UNDEF_ADDR,
|
||||
EVP_UNDEFINED_INSTRUCTION_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_SWI_ADDR, EVP_SVC_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_PREFETCH_ABORT_ADDR,
|
||||
EVP_PREFETCH_ABORT_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_DATA_ABORT_ADDR, EVP_DATA_ABORT_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_RSVD_ADDR, EVP_RESERVED_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_IRQ_ADDR, EVP_IRQ_VECTOR);
|
||||
pva_kmd_write(pva, PVA_REG_EVP_FIQ_ADDR, EVP_FIQ_VECTOR);
|
||||
/* R5 regions are defined as:
|
||||
* - PRIV1 region for firmware code and data.
|
||||
* - PRIV2 region for debug printf data.
|
||||
* - Remaining region for resource table, queues, etc.
|
||||
*/
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar1_start,
|
||||
FW_CODE_DATA_START_ADDR);
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar1_end,
|
||||
FW_CODE_DATA_END_ADDR);
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar2_start,
|
||||
FW_DEBUG_DATA_START_ADDR);
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar2_end,
|
||||
FW_DEBUG_DATA_END_ADDR);
|
||||
/* Firmware expects R5 virtual address FW_CODE_DATA_START_ADDR to be
|
||||
* mapped to the beginning of firmware binary. Therefore, we adjust
|
||||
* segment registers accordingly
|
||||
*
|
||||
* */
|
||||
if (pva->load_from_gsc) {
|
||||
if (pva->is_hv_mode) {
|
||||
/* Loading from GSC with HV (i.e AV+L or AV+Q case).
|
||||
* This will be trapped by HV
|
||||
*/
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar1_lsegreg,
|
||||
0xFFFFFFFFU);
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar1_usegreg,
|
||||
0xFFFFFFFFU);
|
||||
} else {
|
||||
			/* Loading from GSC without HV, i.e. the L4T case.
			 * TODO: Program segment registers using the GSC carveout
			 * fetched from the DT file. Until then, ASSERT here.
			 */
|
||||
ASSERT(false);
|
||||
}
|
||||
} else {
|
||||
/* Loading from file.
|
||||
* In HV case, traps should be bypassed in HV
|
||||
*/
|
||||
seg_reg_value =
|
||||
pva->fw_bin_mem->iova -
|
||||
FW_CODE_DATA_START_ADDR; /* underflow is totally OK */
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar1_lsegreg,
|
||||
iova_lo(seg_reg_value));
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_ar1_usegreg,
|
||||
iova_hi(seg_reg_value));
|
||||
}
|
||||
}
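
/*
 * Illustrative note (assuming the R5 address translation adds the segment
 * register value to the R5 virtual address to form the IOVA, and using
 * hypothetical addresses): with the firmware binary at IOVA 0x80040000 and
 * FW_CODE_DATA_START_ADDR equal to 0x40000000, the AR1 segment registers
 * would be programmed with 0x80040000 - 0x40000000 = 0x40040000, so an R5
 * access to FW_CODE_DATA_START_ADDR lands on the first byte of the binary.
 */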
|
||||
|
||||
void pva_kmd_config_scr_regs(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_write(pva, PVA_REG_EVP_SCR_ADDR, PVA_EVP_SCR_VAL);
|
||||
pva_kmd_write(pva, PVA_CFG_SCR_STATUS_CNTL, PVA_STATUS_CTL_SCR_VAL);
|
||||
pva_kmd_write(pva, PVA_CFG_SCR_PRIV, PVA_PRIV_SCR_VAL);
|
||||
pva_kmd_write(pva, PVA_CFG_SCR_CCQ_CNTL, PVA_CCQ_SCR_VAL);
|
||||
}
|
||||
|
||||
void pva_kmd_config_sid(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint32_t addr;
|
||||
uint32_t i;
|
||||
uint32_t offset;
|
||||
uint8_t priv1_sid;
|
||||
uint8_t priv_sid;
|
||||
priv_sid = pva->stream_ids[PVA_R5_SMMU_CONTEXT_ID] & 0xFF;
|
||||
priv1_sid = pva->stream_ids[pva->r5_image_smmu_context_id] & 0xFF;
|
||||
/* Priv SIDs */
|
||||
if (pva->load_from_gsc) {
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_sid,
|
||||
PVA_INSERT(priv_sid, 7, 0) |
|
||||
PVA_INSERT(priv1_sid, 15, 8) |
|
||||
PVA_INSERT(priv_sid, 23, 16));
|
||||
} else {
|
||||
pva_kmd_write(pva, pva->regspec.cfg_priv_sid,
|
||||
PVA_INSERT(priv_sid, 7, 0) |
|
||||
PVA_INSERT(priv_sid, 15, 8) |
|
||||
PVA_INSERT(priv_sid, 23, 16));
|
||||
}
|
||||
/* VPS SIDs */
|
||||
if ((pva->hw_consts.hw_gen == PVA_HW_GEN3) && pva->load_from_gsc) {
|
||||
pva_kmd_write(pva, pva->regspec.cfg_vps_sid,
|
||||
PVA_INSERT(priv1_sid, 7, 0) |
|
||||
PVA_INSERT(priv1_sid, 15, 8));
|
||||
} else {
|
||||
pva_kmd_write(pva, pva->regspec.cfg_vps_sid,
|
||||
PVA_INSERT(priv_sid, 7, 0) |
|
||||
PVA_INSERT(priv_sid, 15, 8));
|
||||
}
|
||||
/* User SIDs */
|
||||
offset = 0;
|
||||
for (i = 1; i < pva->hw_consts.n_smmu_contexts - 1; i++) {
|
||||
addr = safe_addu32(pva->regspec.cfg_user_sid_base, offset);
|
||||
pva_kmd_write(pva, addr, pva->stream_ids[i]);
|
||||
offset = safe_addu32(offset, 4U);
|
||||
}
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_init_fw(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint64_t seg_reg_value;
|
||||
uint32_t debug_data_size;
|
||||
uint32_t boot_sema = 0;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
/* Load firmware */
|
||||
if (!pva->load_from_gsc) {
|
||||
err = pva_kmd_read_fw_bin(pva);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err(
|
||||
"Failed to read firmware from filesystem");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
debug_data_size = (uint32_t)safe_pow2_roundup_u32(
|
||||
FW_DEBUG_DATA_TOTAL_SIZE, SIZE_4KB);
|
||||
pva->fw_debug_mem = pva_kmd_device_memory_alloc_map(
|
||||
debug_data_size, pva, PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
if (pva->fw_debug_mem == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto free_fw_mem;
|
||||
}
|
||||
init_fw_print_buffer(&pva->fw_print_buffer, pva->fw_debug_mem->va);
|
||||
|
||||
/* Program SCRs */
|
||||
pva_kmd_write(pva, PVA_SEC_SCR_SECEXT_INTR_EVENT,
|
||||
PVA_SEC_SCR_SECEXT_INTR_EVENT_VAL);
|
||||
pva_kmd_write(pva, PVA_PROC_SCR_PROC, PVA_PROC_SCR_PROC_VAL);
|
||||
|
||||
pva_kmd_config_evp_seg_scr_regs(pva);
|
||||
|
||||
/* Write IOVA address of debug buffer to mailbox and FW will program
|
||||
* PRIV2 segment register properly such that the debug buffer is located
|
||||
* at R5 virtual address FW_DEBUG_DATA_START_ADDR */
|
||||
seg_reg_value = pva->fw_debug_mem->iova;
|
||||
|
||||
/* When GSC is enabled, KMD cannot write directly to segment registers,
|
||||
* therefore we write to mailbox registers and FW will program by
|
||||
* itself.
|
||||
* pva_kmd_writel(pva, pva->regspec.cfg_priv_ar2_lsegreg,
|
||||
* iova_lo(seg_reg_value));
|
||||
* pva_kmd_writel(pva, pva->regspec.cfg_priv_ar2_usegreg,
|
||||
* iova_hi(seg_reg_value));
|
||||
*/
|
||||
pva_kmd_write_mailbox(pva, PVA_MBOXID_PRIV2SEG_L,
|
||||
iova_lo(seg_reg_value));
|
||||
pva_kmd_write_mailbox(pva, PVA_MBOXID_PRIV2SEG_H,
|
||||
iova_hi(seg_reg_value));
|
||||
|
||||
/* Write shared memory allocation start address to mailbox and FW will
|
||||
* program user segment register accordingly so that virtual address
|
||||
* PVA_SHARED_MEMORY_START will point to the allocation start address.
|
||||
*
|
||||
* We deliberately also choose PVA_SHARED_MEMORY_START as the allocation
|
||||
* start address so that the net result is that user segment register
|
||||
* will be programmed to 0.
|
||||
*/
|
||||
seg_reg_value = FW_SHARED_MEMORY_START;
|
||||
pva_kmd_write_mailbox(pva, PVA_MBOXID_USERSEG_L,
|
||||
iova_lo(seg_reg_value));
|
||||
pva_kmd_write_mailbox(pva, PVA_MBOXID_USERSEG_H,
|
||||
iova_hi(seg_reg_value));
|
||||
|
||||
/* Boot parameters */
|
||||
if (pva->bl_sector_pack_format == PVA_BL_XBAR_RAW) {
|
||||
boot_sema = PVA_BOOT_SEMA_USE_XBAR_RAW;
|
||||
}
|
||||
pva_kmd_set_sema(pva, PVA_BOOT_SEMA, boot_sema);
|
||||
|
||||
pva_kmd_write(pva, PVA_REG_HSP_SS2_SET_ADDR,
|
||||
pva_kmd_get_syncpt_ro_offset(pva));
|
||||
pva_kmd_write(pva, PVA_REG_HSP_SS3_SET_ADDR,
|
||||
pva_kmd_get_syncpt_rw_offset(pva));
|
||||
|
||||
pva_kmd_config_sid_regs(pva);
|
||||
|
||||
/* Enable LIC INTR line for HSP1 and WDT */
|
||||
pva_kmd_write(pva, pva->regspec.sec_lic_intr_enable,
|
||||
PVA_BIT(0) /*Watchdog*/
|
||||
| PVA_INSERT(0x1, 4, 1) /* HSP1 */
|
||||
| PVA_INSERT(0x7, 7, 5) /* All H1X errors */);
|
||||
|
||||
/* Bind interrupts */
|
||||
err = pva_kmd_bind_intr_handler(pva, PVA_KMD_INTR_LINE_SEC_LIC,
|
||||
pva_kmd_hyp_isr, pva);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_fw_debug_mem;
|
||||
}
|
||||
err = pva_kmd_bind_intr_handler(pva, PVA_KMD_INTR_LINE_CCQ0,
|
||||
pva_kmd_isr, pva);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_sec_lic;
|
||||
}
|
||||
|
||||
/* Take R5 out of reset */
|
||||
pva_kmd_write(pva, PVA_REG_PROC_CPUHALT_ADDR, 0x1);
|
||||
|
||||
/* Wait until fw boots */
|
||||
err = pva_kmd_sema_wait_timeout(&pva->fw_boot_sema,
|
||||
PVA_KMD_FW_BOOT_TIMEOUT_MS);
|
||||
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Waiting for FW boot timed out.");
|
||||
goto free_ccq0;
|
||||
}
|
||||
|
||||
return err;
|
||||
|
||||
free_ccq0:
|
||||
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_CCQ0);
|
||||
free_sec_lic:
|
||||
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
|
||||
free_fw_debug_mem:
|
||||
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
|
||||
pva_kmd_device_memory_free(pva->fw_debug_mem);
|
||||
free_fw_mem:
|
||||
if (!pva->load_from_gsc) {
|
||||
pva_kmd_device_memory_free(pva->fw_bin_mem);
|
||||
}
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_deinit_fw(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_CCQ0);
|
||||
pva_kmd_free_intr(pva, PVA_KMD_INTR_LINE_SEC_LIC);
|
||||
pva_kmd_drain_fw_print(&pva->fw_print_buffer);
|
||||
|
||||
/*
|
||||
* Before powering off PVA, disable SEC error reporting.
|
||||
* While powering off, PVA might generate (unexplained) error interrupts
|
||||
* This causes HSM to read some PVA SEC registers. However, since PVA might
|
||||
* already be powergated by this time, access to PVA SEC registers from HSM
|
||||
* fails. This was discussed in Bug 3785498.
|
||||
*
|
||||
	 * Note: we do not explicitly enable these errors during power on since
|
||||
* 'enable' is their reset value
|
||||
*/
|
||||
disable_sec_mission_error_reporting(pva);
|
||||
disable_sec_latent_error_reporting(pva);
|
||||
|
||||
pva_kmd_device_memory_free(pva->fw_debug_mem);
|
||||
if (!pva->load_from_gsc) {
|
||||
pva_kmd_device_memory_free(pva->fw_bin_mem);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,44 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_KMD_SILICON_BOOT_H
#define PVA_KMD_SILICON_BOOT_H

#include "pva_kmd_device.h"

/**
 * @brief Configure EVP and Segment config registers
 *
 * This function configures the EVP and Segment config registers.
 *
 * @param pva Pointer to the PVA device.
 */
void pva_kmd_config_evp_seg_regs(struct pva_kmd_device *pva);

/**
 * @brief Configure SCR registers.
 *
 * This function configures the SCR registers.
 *
 * @param pva Pointer to the PVA device.
 */
void pva_kmd_config_scr_regs(struct pva_kmd_device *pva);

/**
 * @brief Configure SID registers.
 *
 * This function configures the SID registers.
 *
 * @param pva Pointer to the PVA device.
 */
void pva_kmd_config_sid(struct pva_kmd_device *pva);

#endif /* PVA_KMD_SILICON_BOOT_H */
@@ -0,0 +1,414 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_silicon_elf_parser.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
|
||||
#ifndef max
|
||||
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#ifndef UINT8_MAX
|
||||
#define UINT8_MAX 0xFF
|
||||
#endif
|
||||
|
||||
// CERT complains about casts from const uint8_t*, so do intermediate cast to void*
|
||||
static inline const void *uint_8_to_void(const uint8_t *const p)
|
||||
{
|
||||
return (const void *)p;
|
||||
}
|
||||
|
||||
bool elf_header_check(const elf_ct e)
|
||||
{
|
||||
const elfFileHeader *efh = (const elfFileHeader *)e;
|
||||
if ((ELFCLASS32 == efh->oclass) &&
|
||||
(ELFMAGIC_LSB == *(const elfWord *)e)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return pointer to ELF file header
|
||||
*
|
||||
* Cast the elf image data to \ref elfFileHeader*
|
||||
*
|
||||
* @param [in] e pointer to elf image data
|
||||
* @return
|
||||
 * - Valid pointer to ELF file header
|
||||
* - NULL if \a e is NULL or correct elf magic ID is not present
|
||||
* in first 4 bytes of elf file pointed by \a e.
|
||||
*
|
||||
*/
|
||||
static const elfFileHeader *elf_file_header(const elf_ct e)
|
||||
{
|
||||
return (const elfFileHeader *)e;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get start address of the section table.
|
||||
*
|
||||
* @param[in] e pointer to elf image
|
||||
* @return const elfSectionHeader*
|
||||
* - Valid address of section header.
|
||||
* - NULL if \a e is NULL or Header in ELF file is NULL.
|
||||
*/
|
||||
static inline const elfSectionHeader *elf_section_table(const elf_parser_ctx e)
|
||||
{
|
||||
const elfFileHeader *efh = elf_file_header(e.elf_file);
|
||||
const char *p = (const char *)e.elf_file;
|
||||
|
||||
if (efh->shoff > e.size) {
|
||||
pva_kmd_log_err("Invalid Section header Offset");
|
||||
return NULL;
|
||||
}
|
||||
p = &p[efh->shoff];
|
||||
// proper ELF should always have offsets be aligned,
|
||||
// but add check just in case.
|
||||
return (const elfSectionHeader *)(const void *)(p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the size of ELF section
|
||||
*
|
||||
* @param esh pointer to ELF section header
|
||||
* @return elfWord
|
||||
* - size of the corresponding section header.
|
||||
* - 0, if \a esh is NULL.
|
||||
*
|
||||
*/
|
||||
static elfWord elf_section_size(const elfSectionHeader *esh)
|
||||
{
|
||||
if (NULL == esh) {
|
||||
return UZERO;
|
||||
}
|
||||
return (elfWord)esh->size;
|
||||
}
|
||||
|
||||
elfWord elf_shnum(const elf_parser_ctx e)
|
||||
{
|
||||
const elfFileHeader *efh = elf_file_header(e.elf_file);
|
||||
if (NULL == efh) {
|
||||
return UZERO;
|
||||
}
|
||||
if (UZERO == efh->shnum) {
|
||||
/* get value from size of first (empty) section */
|
||||
/* to avoid recursion, don't call elf_section_header(0) */
|
||||
const elfSectionHeader *esh = elf_section_table(e);
|
||||
// if esh is somehow NULL, section_size will return UZERO
|
||||
elfWord size = elf_section_size(esh);
|
||||
if (size > e.size) { // make sure we don't lose precision
|
||||
return UZERO;
|
||||
} else {
|
||||
return size;
|
||||
}
|
||||
} else {
|
||||
return (elfWord)efh->shnum;
|
||||
}
|
||||
}
|
||||
|
||||
const elfSectionHeader *elf_section_header(const elf_parser_ctx e,
|
||||
unsigned int index)
|
||||
{
|
||||
const elfSectionHeader *esh = elf_section_table(e);
|
||||
if (NULL == esh) {
|
||||
return NULL;
|
||||
}
|
||||
if (index >= elf_shnum(e)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
esh = &esh[index];
|
||||
return esh;
|
||||
}
|
||||
|
||||
static inline elfOff get_table_end(elfWord num, elfHalf entsize, elfOff off)
|
||||
{
|
||||
elfOff end;
|
||||
elfWord tablesize = 0;
|
||||
/**
|
||||
* Guaranteed to be less than UINT32_MAX and not overflow
|
||||
* num if set as efh->shnum is UINT16_MAX
|
||||
* num if set as section_header->size is file size of ELF which
|
||||
* is bound to 2 MB
|
||||
*/
|
||||
tablesize = safe_mulu32(num, (uint32_t)entsize);
|
||||
|
||||
end = off + tablesize;
|
||||
if (end < off) {
|
||||
return UZERO; //Wrap around error
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
bool elf_has_valid_sections(const elf_parser_ctx e)
|
||||
{
|
||||
elfOff max_size = UZERO;
|
||||
uint32_t i;
|
||||
elfOff ph_end, sh_end;
|
||||
const elfFileHeader *efh = elf_file_header(e.elf_file);
|
||||
if (efh == NULL) {
|
||||
return false;
|
||||
}
|
||||
ph_end = get_table_end(efh->phnum, efh->phentsize, efh->phoff);
|
||||
sh_end = get_table_end(elf_shnum(e), efh->shentsize, efh->shoff);
|
||||
max_size = max(ph_end, sh_end);
|
||||
if ((max_size == UZERO) || (max_size > e.size)) {
|
||||
return false;
|
||||
}
|
||||
for (i = UZERO; i < elf_shnum(e); ++i) {
|
||||
elfOff esh_end;
|
||||
const elfSectionHeader *esh = elf_section_header(e, i);
|
||||
/*We have already validated the whole section header array is within the file*/
|
||||
ASSERT(esh != NULL);
|
||||
esh_end = esh->offset + esh->size;
|
||||
if (esh_end < esh->offset) {
|
||||
return false; //WRAP around error;
|
||||
}
|
||||
if ((esh->type != SHT_NOBITS) && (esh_end > e.size)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get section header index
|
||||
* get elf_file_header and check it's not null,
|
||||
* get value from link field of first (empty) section
|
||||
* if esh is somehow NULL, return esh link
|
||||
*
|
||||
* @param[in] e elf context
|
||||
*
|
||||
* @return section header index
|
||||
*/
|
||||
static elfWord elf_shstrndx(const elf_parser_ctx e)
|
||||
{
|
||||
const elfFileHeader *efh = elf_file_header(e.elf_file);
|
||||
if (NULL == efh) {
|
||||
return UZERO;
|
||||
}
|
||||
if (efh->shstrndx == SHN_XINDEX) {
|
||||
/* get value from link field of first (empty) section */
|
||||
/* to avoid recursion, don't call elf_section_header(0) */
|
||||
const elfSectionHeader *esh = elf_section_table(e);
|
||||
if (NULL == esh) {
|
||||
return UZERO;
|
||||
}
|
||||
return esh->link;
|
||||
}
|
||||
return efh->shstrndx;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get name of string from strtab section
|
||||
* check elf context and section header not null,
|
||||
* check from section header for type and size are not null.
|
||||
* Get strtab section, check that stroffset doesn't wrap
|
||||
*
|
||||
* @param[in] e elf context
|
||||
* @param[in] eshstr pointer to elf Section header
|
||||
* @param[in] offset offset in integer
|
||||
* Valid range: 0 to eshstr->size
|
||||
*
|
||||
* @return name of string from strtab section "eshstr" at "offset"
|
||||
*/
|
||||
static const char *elf_string_at_offset(const elf_parser_ctx e,
|
||||
const elfSectionHeader *eshstr,
|
||||
unsigned int offset)
|
||||
{
|
||||
const char *strtab;
|
||||
elfOff stroffset;
|
||||
|
||||
if (SHT_STRTAB != eshstr->type) {
|
||||
return NULL;
|
||||
}
|
||||
if (offset >= eshstr->size) {
|
||||
return NULL;
|
||||
}
|
||||
strtab = (const char *)e.elf_file;
|
||||
stroffset = eshstr->offset + offset;
|
||||
if (stroffset < eshstr->offset) { // check that stroffset doesn't wrap
|
||||
return NULL;
|
||||
}
|
||||
strtab = &strtab[stroffset];
|
||||
return strtab;
|
||||
}
|
||||
|
||||
const char *elf_section_name(const elf_parser_ctx e,
|
||||
const elfSectionHeader *esh)
|
||||
{
|
||||
const char *name;
|
||||
const elfSectionHeader *eshstr;
|
||||
elfWord shstrndx;
|
||||
|
||||
/* get section header string table */
|
||||
shstrndx = elf_shstrndx(e);
|
||||
if (shstrndx == UZERO) {
|
||||
return NULL;
|
||||
}
|
||||
eshstr = elf_section_header(e, shstrndx);
|
||||
if ((NULL == esh) || (NULL == eshstr)) {
|
||||
return NULL;
|
||||
}
|
||||
name = elf_string_at_offset(e, eshstr, esh->name);
|
||||
return name;
|
||||
}
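
/*
 * Illustrative sketch (not part of the driver): walking every section header
 * of a parsed image and resolving its name. Assumes `ctx` is an
 * elf_parser_ctx describing an image that already passed elf_header_check()
 * and elf_has_valid_sections().
 *
 *   elfWord i;
 *   for (i = UZERO; i < elf_shnum(ctx); ++i) {
 *       const elfSectionHeader *esh = elf_section_header(ctx, i);
 *       const char *secname = elf_section_name(ctx, esh);
 *       if ((esh != NULL) && (secname != NULL)) {
 *           // use esh->offset, esh->size, secname ...
 *       }
 *   }
 */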
|
||||
|
||||
const elfSectionHeader *elf_named_section_header(const elf_parser_ctx e,
|
||||
const char *name)
|
||||
{
|
||||
const elfSectionHeader *esh;
|
||||
unsigned int i;
|
||||
if (NULL == name) {
|
||||
return NULL;
|
||||
}
|
||||
esh = elf_section_table(e);
|
||||
if (NULL == esh) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* iterate through sections till find matching name */
|
||||
for (i = UZERO; i < elf_shnum(e); ++i) {
|
||||
const char *secname = elf_section_name(e, esh);
|
||||
if (NULL != secname) {
|
||||
size_t seclen = strlen(secname);
|
||||
|
||||
// use strncmp to avoid problem with input not being null-terminated,
|
||||
// but then need to check for false partial match
|
||||
if ((ZERO == strncmp(secname, name, seclen)) &&
|
||||
(UZERO == (uint8_t)name[seclen])) {
|
||||
return esh;
|
||||
}
|
||||
}
|
||||
++esh;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get section header
|
||||
* Get elf_section_table pointer and check it and
|
||||
* iterate through sections till find matching type
|
||||
*
|
||||
* @param[in] e elf context
|
||||
* @param[in] type type in word size
|
||||
*
|
||||
* @return elf section header with given "type"
|
||||
*/
|
||||
static const elfSectionHeader *elf_typed_section_header(const elf_parser_ctx e,
|
||||
elfWord type)
|
||||
{
|
||||
unsigned int i;
|
||||
const elfSectionHeader *esh = elf_section_table(e);
|
||||
if (NULL == esh) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* iterate through sections till find matching type */
|
||||
for (i = UZERO; i < elf_shnum(e); ++i) {
|
||||
if (esh->type == type) {
|
||||
return esh;
|
||||
}
|
||||
++esh;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const elfByte *elf_section_contents(const elf_parser_ctx e,
|
||||
const elfSectionHeader *esh)
|
||||
{
|
||||
const elfByte *p;
|
||||
if ((NULL == e.elf_file) || (NULL == esh)) {
|
||||
return NULL;
|
||||
}
|
||||
p = (const elfByte *)e.elf_file;
|
||||
if ((esh->offset > e.size) ||
|
||||
((uint64_t)((uint64_t)esh->offset + (uint64_t)esh->size) >
|
||||
e.size)) {
|
||||
return NULL;
|
||||
}
|
||||
return &p[esh->offset];
|
||||
}
|
||||
|
||||
const elfSymbol *elf_symbol(const elf_parser_ctx e, unsigned int index)
|
||||
{
|
||||
const elfSectionHeader *esh;
|
||||
const elfSymbol *esymtab;
|
||||
const uint8_t *p = e.elf_file;
|
||||
uint8_t align = 0;
|
||||
/* get symbol table */
|
||||
esh = elf_typed_section_header(e, SHT_SYMTAB);
|
||||
if ((NULL == esh) || (UZERO == esh->entsize)) {
|
||||
return NULL;
|
||||
}
|
||||
if (index >= (esh->size / esh->entsize)) {
|
||||
return NULL;
|
||||
}
|
||||
if (esh->addralign <= (uint8_t)UINT8_MAX) {
|
||||
align = (uint8_t)esh->addralign;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
if ((uint64_t)((uint64_t)esh->size + (uint64_t)esh->offset) > e.size) {
|
||||
return NULL;
|
||||
}
|
||||
p = &p[esh->offset];
|
||||
esymtab = (const elfSymbol *)uint_8_to_void(p);
|
||||
if ((align != 0U) && ((((uintptr_t)(esymtab) % align) != UZERO))) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return &esymtab[index];
|
||||
}
|
||||
|
||||
const char *elf_symbol_name(const elf_parser_ctx e, const elfSectionHeader *esh,
|
||||
unsigned int index)
|
||||
{
|
||||
const elfSectionHeader *eshstr;
|
||||
const elfSymbol *esymtab;
|
||||
const elfSymbol *esym;
|
||||
const char *name;
|
||||
const char *p;
|
||||
uint8_t align = 0;
|
||||
|
||||
if ((NULL == esh) || (UZERO == esh->entsize)) {
|
||||
return NULL;
|
||||
}
|
||||
if (SHT_SYMTAB != esh->type) {
|
||||
return NULL;
|
||||
}
|
||||
if (index >= (esh->size / esh->entsize)) {
|
||||
return NULL;
|
||||
}
|
||||
/* get string table */
|
||||
eshstr = elf_section_header(e, esh->link);
|
||||
if (NULL == eshstr) {
|
||||
return NULL;
|
||||
}
|
||||
p = (const char *)e.elf_file;
|
||||
if (esh->addralign <= (uint8_t)UINT8_MAX) {
|
||||
align = (uint8_t)esh->addralign;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
if (esh->offset > e.size) {
|
||||
return NULL;
|
||||
}
|
||||
p = &p[esh->offset];
|
||||
esymtab = (const elfSymbol *)(const void *)(p);
|
||||
if ((align != 0U) && ((((uintptr_t)(esymtab) % align) != UZERO))) {
|
||||
return NULL;
|
||||
}
|
||||
esym = &esymtab[index];
|
||||
name = elf_string_at_offset(e, eshstr, esym->name);
|
||||
return name;
|
||||
}
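
/*
 * Illustrative sketch (not part of the driver): resolving the name and value
 * of symbol `idx` via the symbol table section of a validated image `ctx`.
 *
 *   const elfSectionHeader *symtab =
 *           elf_named_section_header(ctx, SHNAME_SYMTAB);
 *   const elfSymbol *sym = elf_symbol(ctx, idx);
 *   const char *symname = elf_symbol_name(ctx, symtab, idx);
 *   if ((sym != NULL) && (symname != NULL)) {
 *       // use sym->value, sym->size, symname ...
 *   }
 */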
|
||||
@@ -0,0 +1,363 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_SILICON_ELF_PARSER_H
|
||||
#define PVA_KMD_SILICON_ELF_PARSER_H
|
||||
#include "pva_api.h"
|
||||
|
||||
#define ZERO 0
|
||||
#define UZERO 0U
|
||||
#define ULLZERO 0ULL
|
||||
|
||||
/*
|
||||
* Define mapping from VPU data, rodata and program sections into
|
||||
* corresponding segment types.
|
||||
*/
|
||||
typedef const void *elf_ct; /* points to const image of elf file */
|
||||
|
||||
/**
|
||||
* Struct containing the ELF Buffer and size of the buffer.
|
||||
*/
|
||||
typedef struct {
|
||||
/** Pointer to buffer containing ELF File */
|
||||
elf_ct elf_file;
|
||||
/** Size of the buffer containing ELF File */
|
||||
uint64_t size;
|
||||
} elf_parser_ctx;
|
||||
|
||||
/*--------------------------------- Types ----------------------------------*/
|
||||
/** unsigned 8-bit data type */
typedef uint8_t elfByte;
/** unsigned 16-bit data type */
typedef uint16_t elfHalf;
/** unsigned 32-bit data type */
typedef uint32_t elfWord;
/** unsigned 32-bit data type */
typedef uint32_t elfAddr;
/** unsigned 32-bit data type */
typedef uint32_t elfOff;
|
||||
|
||||
/**
|
||||
* @brief ELF File Header
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
/** ELF magic number : 0x7f,0x45,0x4c,0x46 */
|
||||
elfWord magic;
|
||||
/** Object file class */
|
||||
elfByte oclass;
|
||||
/** Data encoding */
|
||||
elfByte data;
|
||||
/** Object format version */
|
||||
elfByte formatVersion;
|
||||
/** OS application binary interface */
|
||||
elfByte abi;
|
||||
/** Version of abi */
|
||||
elfByte abiVersion;
|
||||
/** Elf ident padding */
|
||||
elfByte padd[7];
|
||||
/** Object file type */
|
||||
elfHalf type;
|
||||
/** Architecture */
|
||||
elfHalf machine;
|
||||
/** Object file version */
|
||||
elfWord version;
|
||||
/** Entry point virtual address */
|
||||
elfAddr entry;
|
||||
/** Program header table file offset */
|
||||
elfOff phoff;
|
||||
/** Section header table file offset */
|
||||
elfOff shoff;
|
||||
/** Processor-specific flags */
|
||||
elfWord flags;
|
||||
/** ELF header size in bytes */
|
||||
elfHalf ehsize;
|
||||
/** Program header table entry size */
|
||||
elfHalf phentsize;
|
||||
/** Program header table entry count */
|
||||
elfHalf phnum;
|
||||
/** Section header table entry size */
|
||||
elfHalf shentsize;
|
||||
/** Section header table entry count */
|
||||
elfHalf shnum;
|
||||
/** Section header string table index */
|
||||
elfHalf shstrndx;
|
||||
} elfFileHeader;
|
||||
|
||||
/** ELF magic number in big endian */
|
||||
#define ELFMAGIC 0x7f454c46U
|
||||
#define ELFMAGIC_LSB 0x464c457fU // ELF magic number in little endian
|
||||
#define ELFCLASS32 1U // 32 bit object file
|
||||
|
||||
#define EV_NONE 0 // Invalid version
|
||||
#define EV_CURRENT 1 // Current version
|
||||
|
||||
/**
|
||||
* @brief ELF Section Header
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
/** Section name, string table index */
|
||||
elfWord name;
|
||||
/** Type of section */
|
||||
elfWord type;
|
||||
/** Miscellaneous section attributes */
|
||||
elfWord flags;
|
||||
/** Section virtual addr at execution */
|
||||
elfAddr addr;
|
||||
/** Section file offset */
|
||||
elfOff offset;
|
||||
/** Size of section in bytes */
|
||||
elfWord size;
|
||||
/** Index of another section */
|
||||
elfWord link;
|
||||
/** Additional section information */
|
||||
elfWord info;
|
||||
/** Section alignment */
|
||||
elfWord addralign;
|
||||
/** Entry size if section holds table */
|
||||
elfWord entsize;
|
||||
} elfSectionHeader;
|
||||
|
||||
/*
|
||||
* Section Header Type
|
||||
*/
|
||||
#define SHT_NULL 0x00U /// NULL section (entry unused)
|
||||
#define SHT_PROGBITS 0x01U /// Loadable program data
|
||||
#define SHT_SYMTAB 0x02U /// Symbol table
|
||||
#define SHT_STRTAB 0x03U /// String table
|
||||
#define SHT_RELA 0x04U /// Relocation table with addends
#define SHT_HASH 0x05U /// Hash table
#define SHT_DYNAMIC 0x06U /// Information for dynamic linking
#define SHT_NOTE 0x07U /// Information that marks file
#define SHT_NOBITS 0x08U /// Section does not have data in file
#define SHT_REL 0x09U /// Relocation table without addends
|
||||
#define SHT_SHLIB 0x0aU /// Reserved
|
||||
#define SHT_DYNSYM 0x0bU /// Dynamic linker symbol table
|
||||
#define SHT_INIT_ARRAY 0x0eU /// Array of pointers to init funcs
|
||||
#define SHT_FINI_ARRAY 0x0fU /// Array of function to finish funcs
|
||||
#define SHT_PREINIT_ARRAY 0x10U /// Array of pointers to pre-init functions
|
||||
#define SHT_GROUP 0x11U /// Section group
|
||||
#define SHT_SYMTAB_SHNDX 0x12U /// Table of 32bit symtab shndx
|
||||
#define SHT_LOOS 0x60000000U /// Start OS-specific.
|
||||
#define SHT_HIOS 0x6fffffffU /// End OS-specific type
|
||||
#define SHT_LOPROC 0x70000000U /// Start of processor-specific
|
||||
#define SHT_HIPROC 0x7fffffffU /// End of processor-specific
|
||||
#define SHT_LOUSER 0x80000000U /// Start of application-specific
|
||||
#define SHT_HIUSER 0x8fffffffU /// End of application-specific
|
||||
|
||||
/*
|
||||
* Special section index
|
||||
*/
|
||||
#define SHN_UNDEF 0U // Undefined section
|
||||
#define SHN_LORESERVE 0xff00U // lower bound of reserved indexes
|
||||
#define SHN_ABS 0xfff1U // Associated symbol is absolute
|
||||
#define SHN_COMMON 0xfff2U // Associated symbol is common
|
||||
#define SHN_XINDEX 0xffffU // Index is in symtab_shndx
|
||||
|
||||
/*
|
||||
* Special section names
|
||||
*/
|
||||
#define SHNAME_SHSTRTAB ".shstrtab" /// section string table
|
||||
#define SHNAME_STRTAB ".strtab" /// string table
|
||||
#define SHNAME_SYMTAB ".symtab" /// symbol table
|
||||
#define SHNAME_SYMTAB_SHNDX ".symtab_shndx" /// symbol table shndx array
|
||||
#define SHNAME_TEXT ".text." /// suffix with entry name
|
||||
|
||||
/**
|
||||
* @brief Symbol's information
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
/** Symbol name, index in string tbl */
|
||||
elfWord name;
|
||||
/** Value of the symbol */
|
||||
elfAddr value;
|
||||
/** Associated symbol size */
|
||||
elfWord size;
|
||||
/** Type and binding attributes */
|
||||
elfByte info;
|
||||
/** Extra flags */
|
||||
elfByte other;
|
||||
/** Associated section index */
|
||||
elfHalf shndx;
|
||||
} elfSymbol;
|
||||
|
||||
/** Get the \a binding info of the symbol */
|
||||
#define ELF_ST_BIND(s) ((elfWord)((s)->info) >> 4)
|
||||
/** Get the \a type info of the symbol */
|
||||
#define ELF_ST_TYPE(s) ((elfWord)((s)->info) & 0xfU)
|
||||
|
||||
/*
|
||||
* ELF symbol type
|
||||
*/
|
||||
#define STT_NOTYPE 0U // No type known
|
||||
#define STT_OBJECT 1U // Data symbol
|
||||
#define STT_FUNC 2U // Code symbol
|
||||
#define STT_SECTION 3U // Section
|
||||
#define STT_FILE 4U // File
|
||||
#define STT_COMMON 5U // Common symbol
|
||||
#define STT_LOOS 10U // Start of OS-specific
|
||||
|
||||
/*
|
||||
* ELF symbol scope (binding)
|
||||
*/
|
||||
#define STB_LOCAL 0U /// Symbol not visible outside object
|
||||
#define STB_GLOBAL 1U /// Symbol visible outside object
|
||||
#define STB_WEAK 2U /// Weak symbol
|
||||
|
||||
/*
|
||||
* The following routines that return file/program/section headers
|
||||
* all return NULL when not found.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Typical elf readers create a table of information that is passed
|
||||
* to the different routines. For simplicity, we're going to just
|
||||
* keep the image of the whole file and pass that around. Later, if we see
|
||||
* a need to speed this up, we could consider changing elf_parser_ctx to be something
|
||||
* more complicated.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Checks if the file stored in \a e is a 32-bit elf file
|
||||
* and if the first 4 bytes contain elf magic ID.
|
||||
*
|
||||
* @param[in] e elf context containing complete ELF in a const buffer
|
||||
*
|
||||
* @return
|
||||
* - TRUE if valid 32-bit elf file and correct elf magic ID present
|
||||
* in first 4 bytes of elf file
|
||||
* - FALSE if either of the above condition is not met
|
||||
*/
|
||||
bool elf_header_check(const elf_ct e);
|
||||
|
||||
/**
|
||||
* @brief Provide number of sections in sections header table
|
||||
* get elf_file_header and check it's not null,
|
||||
* get value from size of first (empty) section
|
||||
* if esh is NULL, section_size will return zero
|
||||
*
|
||||
* @param[in] e elf context containing complete ELF in a const buffer
|
||||
*
|
||||
* @return section header number
|
||||
*/
|
||||
elfWord elf_shnum(const elf_parser_ctx e);
|
||||
|
||||
/**
|
||||
* @brief This function checks all sections in the elf to be valid
|
||||
*
|
||||
* The function validates all sections as follows:
|
||||
* - Valid section offset i.e. within file bounds.
|
||||
* - Valid section size i.e. non-zero section size
|
||||
* and offset + section size is within file bounds
|
||||
*
|
||||
 * @param[in] e elf context containing complete ELF in a const buffer
|
||||
*
|
||||
* @return
|
||||
* - TRUE if all sections are valid
|
||||
* - FALSE if any invalid section found
|
||||
*/
|
||||
bool elf_has_valid_sections(const elf_parser_ctx e);
|
||||
|
||||
/**
|
||||
* @brief This function traverses the elf and
|
||||
* returns a valid \ref elfSectionHeader if present
|
||||
* at the index provided
|
||||
*
|
||||
* @param[in] e elf context containing complete ELF in a const buffer
|
||||
* @param[in] index The index of the elfSectionHeader that is requested
|
||||
* Valid range : 0 to elf_shnum(e)
|
||||
*
|
||||
* @return
|
||||
* - valid elfSectionHeader from elf if index is valid and if sectionHeader is present
|
||||
* - NULL if invalid or out of bounds index
|
||||
*/
|
||||
const elfSectionHeader *elf_section_header(const elf_parser_ctx e,
|
||||
unsigned int index);
|
||||
|
||||
/**
|
||||
* @brief This function obtains the name of the \ref elfSectionHeader
|
||||
* by going to the index specified by elfSectionHeader->name in the string table
|
||||
* of the elf
|
||||
*
|
||||
* @param[in] e elf context
|
||||
*
|
||||
* @param[in] esh Valid \ref elfSectionHeader whose name is requested
|
||||
*
|
||||
* @return
|
||||
* - Non NULL character array containing name of the elfSectionHeader
|
||||
* if found in elf String Table
|
||||
* - NULL if invalid elfSectionHeader or invalid index in elfSectionHeader->name
|
||||
* going out of bounds of string table of elf
|
||||
*/
|
||||
const char *elf_section_name(const elf_parser_ctx e,
|
||||
const elfSectionHeader *esh);
|
||||
|
||||
/**
|
||||
* @brief Provide elf section header with given "name".
|
||||
* check elf context not a null, get elf_section_table and
|
||||
* then iterate through sections till find matching name
|
||||
*
|
||||
* @param[in] e elf context
|
||||
* @param[in] name name of section
|
||||
*
|
||||
* @return
|
||||
* - elf section header with given "name"
|
||||
* - NULL if @a name is NULL or invalid elfSectionHeader is found
|
||||
*/
|
||||
const elfSectionHeader *elf_named_section_header(const elf_parser_ctx e,
|
||||
const char *name);
|
||||
|
||||
/**
|
||||
* @brief Provide contents of section.
|
||||
* check elf context and section header not a null,
|
||||
* return byte pointer of section header offset of elf context
|
||||
* @param[in] e elf context
|
||||
* @param[in] esh section header
|
||||
*
|
||||
 * @return Byte pointer of elf (NULL if e or esh == NULL)
|
||||
*/
|
||||
const elfByte *elf_section_contents(const elf_parser_ctx e,
|
||||
const elfSectionHeader *esh);
|
||||
|
||||
/**
|
||||
* @brief Get ELF symbol
|
||||
* get elf_typed_section_header section header,
|
||||
* check header or it's entsize not null.
|
||||
* check index is not crossing section header & table size
|
||||
* Also make sure it is address aligned and get symbol table.
|
||||
*
|
||||
* @param[in] e elf context
|
||||
* @param[in] index unsigned index
|
||||
* Valid range: 0 to number of entries in SHT_SYMTAB of e
|
||||
*
|
||||
* @return elf symbol at given index (NULL if not found).
|
||||
*/
|
||||
const elfSymbol *elf_symbol(const elf_parser_ctx e, unsigned int index);
|
||||
|
||||
/**
|
||||
* @brief Get symbol table section
|
||||
* check section header or it's entsize not null.
|
||||
* check index is not crossing section header & table size
|
||||
* get elf_section_header and Also make sure it is address
|
||||
* address aligned before reading the symbol table.
|
||||
*
|
||||
* @param[in] e elf context
|
||||
* @param[in] esh pointer to structure elfSectionHeader
|
||||
* @param[in] index unsigned index
|
||||
* Valid range: 0 to number of entries in SHT_SYMTAB of e
|
||||
*
|
||||
* @return name of symbol from symtab section "esh" at "index".
|
||||
*/
|
||||
const char *elf_symbol_name(const elf_parser_ctx e, const elfSectionHeader *esh,
|
||||
unsigned int index);
|
||||
|
||||
#endif // PVA_KMD_SILICON_ELF_PARSER_H
|
||||
@@ -0,0 +1,920 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_executable.h"
|
||||
#include "pva_kmd_silicon_elf_parser.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
#include "pva_resource.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_api_types.h"
|
||||
#include "pva_kmd_t23x.h"
|
||||
#include "pva_kmd_t26x.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
/**
|
||||
* enum to identify different segments of VPU ELF
|
||||
*/
|
||||
enum pva_elf_seg_type {
|
||||
/** Code segment in VPU ELF */
|
||||
PVA_SEG_VPU_CODE = 0U,
|
||||
/** DATA segment in VPU ELF */
|
||||
PVA_SEG_VPU_DATA,
|
||||
/** DATA segment in VPU ELF containing symbol information*/
|
||||
PVA_SEG_VPU_IN_PARAMS,
|
||||
/** Not a valid segment in VPU ELF */
|
||||
PVA_SEG_VPU_MAX_TYPE
|
||||
};
|
||||
|
||||
/** Maximum number of characters in symbol name */
|
||||
#define ELF_MAXIMUM_SYMBOL_LENGTH 64U
|
||||
|
||||
/** Maximum number of characters in section name */
|
||||
#define ELF_MAXIMUM_SECTION_NAME 64
|
||||
|
||||
/** Section name of EXPORTS section */
|
||||
#define ELF_EXPORTS_SECTION "EXPORTS"
|
||||
|
||||
/** Section name of EXPORTS section name length */
|
||||
#define ELF_EXPORTS_SECTION_NAME_LENGTH 7
|
||||
|
||||
/** Alignment needed for Data section of ELFs */
|
||||
#define DATA_SECTION_ALIGNMENT 32U
|
||||
|
||||
/** Alignment needed for Text section of ELFs */
|
||||
#define TEXT_SECTION_ALIGNMENT 128U
|
||||
|
||||
/** VPU icache size: 16KB */
|
||||
#define VPU_ICACHE_SIZE (16U * 1024U)
|
||||
|
||||
/** This value indicates that the current symbol can be ignored in the VPU ELF */
|
||||
#define SYM_IGNORE 1
|
||||
|
||||
#define SIZE_EXPORTS_TABLE_ENTRY (3U * sizeof(uint32_t))
|
||||
|
||||
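/* Reverse the byte order of a 32-bit word. Text section words are
 * byte-swapped while being copied into the load buffer (see
 * copy_text_section). */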
static uint32_t change_byte_order(uint32_t word)
|
||||
{
|
||||
uint32_t out_word = 0U;
|
||||
out_word = PVA_INSERT(PVA_EXTRACT(word, 31, 24, uint32_t), 7, 0);
|
||||
out_word |= PVA_INSERT(PVA_EXTRACT(word, 23, 16, uint32_t), 15, 8);
|
||||
out_word |= PVA_INSERT(PVA_EXTRACT(word, 15, 8, uint32_t), 23, 16);
|
||||
out_word |= PVA_INSERT(PVA_EXTRACT(word, 7, 0, uint32_t), 31, 24);
|
||||
return out_word;
|
||||
}
|
||||
|
||||
/*
|
||||
* Define mapping from VPU data, rodata and program sections into
|
||||
* corresponding segment types.
|
||||
*/
|
||||
static const struct pack_rule {
|
||||
const char *elf_sec_name;
|
||||
int32_t pva_type;
|
||||
} pack_rules[] = { {
|
||||
.elf_sec_name = ".data",
|
||||
.pva_type = (int32_t)PVA_SEG_VPU_DATA,
|
||||
},
|
||||
{
|
||||
.elf_sec_name = ".rodata",
|
||||
.pva_type = (int32_t)PVA_SEG_VPU_DATA,
|
||||
},
|
||||
{
|
||||
.elf_sec_name = ".text",
|
||||
.pva_type = (int32_t)PVA_SEG_VPU_CODE,
|
||||
} };
|
||||
|
||||
/**
|
||||
* \brief Compares the \a section_name with all
|
||||
* VPU ELF section names until it finds a match and
|
||||
* then returns the corresponding segment type.
|
||||
* If the segment type is \ref PVA_SEG_VPU_DATA, then it further
|
||||
* checks whether it is PVA_SEG_VPU_IN_PARAMS.
|
||||
* \param[in] section_name Name of the section to be searched for, in VPU ELF
|
||||
* \return returns corresponding value from enum pva_elf_seg_type.
|
||||
*/
|
||||
static int32_t find_pva_ucode_segment_type(const char *section_name)
|
||||
{
|
||||
uint32_t i;
|
||||
int32_t ret = (int32_t)PVA_SEG_VPU_MAX_TYPE;
|
||||
|
||||
for (i = 0; i < PVA_ARRAY_SIZE(pack_rules); i += 1U) {
|
||||
/* Ignore the suffix of the section name */
|
||||
if (strncmp(section_name, pack_rules[i].elf_sec_name,
|
||||
strlen(pack_rules[i].elf_sec_name)) == 0) {
|
||||
ret = pack_rules[i].pva_type;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ret == (int32_t)PVA_SEG_VPU_DATA) {
|
||||
uint64_t section_name_len =
|
||||
strnlen(section_name, ELF_MAXIMUM_SECTION_NAME);
|
||||
uint64_t exports_section_name_len =
|
||||
ELF_EXPORTS_SECTION_NAME_LENGTH;
|
||||
// Check whether this DATA section name ends with EXPORTS; only EXPORTS sections are treated as symbol (IN_PARAMS) sections.
|
||||
if ((section_name_len >= exports_section_name_len) &&
|
||||
(strncmp((section_name +
|
||||
(section_name_len - exports_section_name_len)),
|
||||
ELF_EXPORTS_SECTION,
|
||||
(size_t)exports_section_name_len)) == 0) {
|
||||
ret = (int32_t)PVA_SEG_VPU_IN_PARAMS;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static enum pva_error validate_elf(const elf_parser_ctx elf)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
if (!elf_header_check(elf.elf_file)) {
|
||||
pva_kmd_log_err("Invalid 32 bit VPU ELF");
|
||||
err = PVA_INVAL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (!elf_has_valid_sections(elf)) {
|
||||
pva_kmd_log_err("ELF has invalid sections");
|
||||
err = PVA_INVAL;
|
||||
}
|
||||
done:
|
||||
return err;
|
||||
}
|
||||
|
||||
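/* Decide whether the symbol at symbol_entry_id is exportable. Sets *sym and
 * returns 0 for global, non-function symbols of non-zero size that live in an
 * EXPORTS (PVA_SEG_VPU_IN_PARAMS) section; returns SYM_IGNORE and clears *sym
 * otherwise. */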
static int32_t validate_symbol(elf_parser_ctx elf, uint32_t symbol_entry_id,
|
||||
const elfSymbol **sym)
|
||||
{
|
||||
const elfSectionHeader *sym_scn;
|
||||
const char *section_name = NULL;
|
||||
int32_t section_type = (int32_t)PVA_SEG_VPU_MAX_TYPE;
|
||||
int32_t err = 0;
|
||||
|
||||
*sym = elf_symbol(elf, symbol_entry_id);
|
||||
if ((*sym == NULL) || ((*sym)->size == 0U) ||
|
||||
(ELF_ST_BIND(*sym) != STB_GLOBAL) ||
|
||||
(ELF_ST_TYPE(*sym) == STT_FUNC)) {
|
||||
err = SYM_IGNORE;
|
||||
goto end;
|
||||
}
|
||||
|
||||
sym_scn = elf_section_header(elf, (*sym)->shndx);
|
||||
section_name = elf_section_name(elf, sym_scn);
|
||||
if (section_name == NULL) {
|
||||
err = SYM_IGNORE;
|
||||
goto end;
|
||||
}
|
||||
section_type = find_pva_ucode_segment_type(section_name);
|
||||
if (section_type != (int32_t)PVA_SEG_VPU_IN_PARAMS) {
|
||||
err = SYM_IGNORE;
|
||||
goto end;
|
||||
}
|
||||
err = 0;
|
||||
end:
|
||||
if (err != 0) {
|
||||
*sym = NULL;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
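/* Count the exportable symbols in the .symtab section. Fails with PVA_INVAL
 * if the ELF has no symbol table. */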
static enum pva_error count_symbols(const elf_parser_ctx elf,
|
||||
uint32_t *out_num_symbols)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
const elfSectionHeader *section_header;
|
||||
uint32_t i, ent_count;
|
||||
const elfSymbol *sym;
|
||||
int32_t ret;
|
||||
uint32_t num_symbols = 0;
|
||||
|
||||
section_header = elf_named_section_header(elf, ".symtab");
|
||||
|
||||
if (section_header == NULL) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("No symbol table found");
|
||||
goto done;
|
||||
}
|
||||
|
||||
ent_count = section_header->size / section_header->entsize;
|
||||
for (i = 0; i < ent_count; i++) {
|
||||
ret = validate_symbol(elf, i, &sym);
|
||||
if (ret < 0) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("Validation of symbol failed");
|
||||
goto done;
|
||||
}
|
||||
if (ret == SYM_IGNORE) {
|
||||
continue;
|
||||
}
|
||||
num_symbols = addu32(num_symbols, 1U, &math_err);
|
||||
}
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
err = PVA_ERR_MATH_OP;
|
||||
pva_kmd_log_err("count_symbols math error");
|
||||
goto done;
|
||||
}
|
||||
|
||||
*out_num_symbols = num_symbols;
|
||||
done:
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief updates symbol information (type, addr and size) from
|
||||
* VPU ELF PVA_SEG_VPU_IN_PARAMS segment.
|
||||
*
|
||||
* Symbol information in the EXPORTS section of the ELF is laid out as follows.
|
||||
* typedef struct {
|
||||
* uint32_t type; From VMEM_TYPE enums
|
||||
* uint32_t addr_offset; Offset from VMEM base
|
||||
* uint32_t size; Size of VMEM region in bytes
|
||||
* };
|
||||
* @param[in] elf pointer to const image of elf file.
|
||||
* @param[in] section_header pointer to VPU ELF PVA_SEG_VPU_IN_PARAMS section header
|
||||
* @param[in, out] symbol_info pointer to ELF image symbol which needs to be updated.
|
||||
*/
|
||||
static enum pva_error
|
||||
update_exports_symbol(elf_parser_ctx elf,
|
||||
const elfSectionHeader *section_header,
|
||||
struct pva_symbol_info *symbol_info)
|
||||
{
|
||||
const elfByte *data;
|
||||
uint32_t symOffset = 0U;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
|
||||
if ((section_header == NULL) ||
|
||||
(symbol_info->vmem_addr < section_header->addr) ||
|
||||
(addu32(symbol_info->vmem_addr, (uint32_t)SIZE_EXPORTS_TABLE_ENTRY,
|
||||
&math_err) >
|
||||
addu32(section_header->addr, section_header->size, &math_err))) {
|
||||
err = PVA_INVAL;
|
||||
goto done;
|
||||
} else {
|
||||
symOffset = subu32(symbol_info->vmem_addr, section_header->addr,
|
||||
&math_err);
|
||||
}
|
||||
data = elf_section_contents(elf, section_header);
|
||||
if (data == NULL) {
|
||||
pva_kmd_log_err("Export section in ELF is NULL");
|
||||
err = PVA_INVAL;
|
||||
goto done;
|
||||
}
|
||||
symbol_info->symbol_type = *(uint8_t *)((uintptr_t)&data[symOffset]);
|
||||
if ((symbol_info->symbol_type == (uint8_t)PVA_SYM_TYPE_INVALID) ||
|
||||
(symbol_info->symbol_type >= (uint8_t)PVA_SYM_TYPE_MAX)) {
|
||||
pva_kmd_log_err("Invalid symbol type found");
|
||||
err = PVA_INVAL;
|
||||
goto done;
|
||||
}
|
||||
symbol_info->vmem_addr =
|
||||
*(uint32_t *)((uintptr_t)&data[symOffset + sizeof(uint32_t)]);
|
||||
symbol_info->size = *(uint32_t *)((
|
||||
uintptr_t)&data[symOffset + (2UL * sizeof(uint32_t))]);
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
pva_kmd_log_err("update_exports_symbol math error");
|
||||
err = PVA_ERR_MATH_OP;
|
||||
goto done;
|
||||
}
|
||||
done:
|
||||
return err;
|
||||
}
|
||||
|
||||
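/* Check that the range [vmem_offset, vmem_offset + size) lies entirely within
 * one of the VMEM regions in vmem_regions_tab. */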
static bool validate_vmem_offset(const uint32_t vmem_offset,
|
||||
const uint32_t size,
|
||||
const uint8_t vmem_region_count,
|
||||
const struct vmem_region *vmem_regions_tab)
|
||||
{
|
||||
bool valid = false;
|
||||
uint32_t i = 0U;
|
||||
uint32_t prev_idx;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
|
||||
for (i = vmem_region_count; i > 0U; i--) {
|
||||
prev_idx = subu32(i, 1U, &math_err);
|
||||
if (vmem_offset >= vmem_regions_tab[prev_idx].start) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ((i > 0U) && (addu32(vmem_offset, size, &math_err) <=
|
||||
vmem_regions_tab[prev_idx].end)) {
|
||||
valid = true;
|
||||
}
|
||||
|
||||
return (math_err != MATH_OP_SUCCESS) ? false : valid;
|
||||
}
|
||||
|
||||
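/* Fill symbol_info for one exportable symbol: copy its name, resolve type,
 * VMEM address and size from the EXPORTS table, then validate the resulting
 * VMEM range. */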
static enum pva_error copy_symbol(elf_parser_ctx elf, const elfSymbol *sym,
|
||||
const char *symname,
|
||||
struct pva_symbol_info *symbol_info,
|
||||
const uint8_t vmem_region_count,
|
||||
const struct vmem_region *vmem_regions_tab)
|
||||
{
|
||||
const elfSectionHeader *sym_scn;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
size_t symname_len = strnlen(symname, PVA_MAX_SYMBOL_NAME_LEN);
|
||||
if (symname_len > 0U) {
|
||||
(void)memcpy(symbol_info->name, symname, symname_len);
|
||||
}
|
||||
symbol_info->name[PVA_MAX_SYMBOL_NAME_LEN] = '\0';
|
||||
|
||||
symbol_info->size = sym->size;
|
||||
symbol_info->vmem_addr = sym->value;
|
||||
|
||||
sym_scn = elf_section_header(elf, sym->shndx);
|
||||
err = update_exports_symbol(elf, sym_scn, symbol_info);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Updating symbol from EXPORTS table failed");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!validate_vmem_offset(symbol_info->vmem_addr, symbol_info->size,
|
||||
vmem_region_count, vmem_regions_tab)) {
|
||||
pva_kmd_log_err("Invalid symbol vmem offset in ELF");
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
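/* Populate sym_table->symbols with every exportable symbol from .symtab,
 * assigning symbol IDs starting at PVA_SYMBOL_ID_BASE. */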
static enum pva_error
|
||||
fill_symbol_table(const elf_parser_ctx elf,
|
||||
struct pva_kmd_exec_symbol_table *sym_table,
|
||||
const uint8_t vmem_region_count,
|
||||
const struct vmem_region *vmem_regions_tab)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
const elfSectionHeader *section_header;
|
||||
uint32_t i, ent_count;
|
||||
const elfSymbol *sym;
|
||||
const char *symname;
|
||||
int32_t ret;
|
||||
uint32_t export_sym_idx = 0;
|
||||
|
||||
section_header = elf_named_section_header(elf, ".symtab");
|
||||
|
||||
if (section_header == NULL) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("No symbol table found");
|
||||
goto done;
|
||||
}
|
||||
|
||||
ent_count = section_header->size / section_header->entsize;
|
||||
for (i = 0; i < ent_count; i++) {
|
||||
struct pva_symbol_info *symbol_info;
|
||||
|
||||
ret = validate_symbol(elf, i, &sym);
|
||||
if (ret < 0) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("Validation of symbol failed");
|
||||
goto done;
|
||||
}
|
||||
if (ret == SYM_IGNORE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
symbol_info = &sym_table->symbols[export_sym_idx];
|
||||
ASSERT(symbol_info != NULL);
|
||||
symname = elf_symbol_name(elf, section_header, i);
|
||||
if (symname == NULL) {
|
||||
err = PVA_INVAL;
|
||||
pva_kmd_log_err("elf_symbol_name failed");
|
||||
goto done;
|
||||
}
|
||||
err = copy_symbol(elf, sym, symname, symbol_info,
|
||||
vmem_region_count, vmem_regions_tab);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto done;
|
||||
}
|
||||
symbol_info->symbol_id =
|
||||
addu32(export_sym_idx, PVA_SYMBOL_ID_BASE, &math_err);
|
||||
export_sym_idx = addu32(export_sym_idx, 1U, &math_err);
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
err = PVA_ERR_MATH_OP;
|
||||
pva_kmd_log_err("fill_symbol_table math error");
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
done:
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* To simplify the caller's life: the input ptr should always be considered freed
|
||||
* after this call. The returned new ptr should always be considered a new
|
||||
* allocation and it needs to be freed if not NULL.
|
||||
*/
|
||||
static void *pva_realloc(void *ptr, uint32_t old_size, uint32_t new_size)
|
||||
{
|
||||
void *new_buffer;
|
||||
|
||||
if (ptr == NULL) {
|
||||
return pva_kmd_zalloc(new_size);
|
||||
}
|
||||
|
||||
if (new_size <= old_size) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
new_buffer = pva_kmd_zalloc(new_size);
|
||||
if (new_buffer == NULL) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcpy(new_buffer, ptr, old_size);
|
||||
|
||||
out:
|
||||
pva_kmd_free(ptr);
|
||||
return new_buffer;
|
||||
}
|
||||
|
||||
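/* Copy one text section into out_buffer at its load address (the section
 * address is in 32-bit words), growing the buffer when needed and
 * byte-swapping every word. Returns the possibly reallocated buffer, or NULL
 * on failure. */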
static void *copy_text_section(const elf_parser_ctx elf,
|
||||
const elfSectionHeader *section_header,
|
||||
void *out_buffer, uint32_t *buffer_size)
|
||||
{
|
||||
const elfByte *elf_data;
|
||||
uint32_t const *word;
|
||||
uint32_t *dst_word;
|
||||
uint32_t wi;
|
||||
/* The load address in section header is in words (uint32_t) */
|
||||
uint32_t load_addr_bytes =
|
||||
safe_mulu32(section_header->addr, (uint32_t)sizeof(uint32_t));
|
||||
uint32_t needed_size =
|
||||
safe_addu32(load_addr_bytes, section_header->size);
|
||||
|
||||
// Align required text section size
|
||||
needed_size =
|
||||
safe_pow2_roundup_u32(needed_size, TEXT_SECTION_ALIGNMENT);
|
||||
|
||||
if (needed_size > *buffer_size) {
|
||||
out_buffer = pva_realloc(out_buffer, *buffer_size, needed_size);
|
||||
*buffer_size = needed_size;
|
||||
}
|
||||
|
||||
if (out_buffer == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
elf_data = elf_section_contents(elf, section_header);
|
||||
if (elf_data == NULL) {
|
||||
pva_kmd_log_err("copy_text_section elf_data error");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
word = (uint32_t const *)elf_data;
|
||||
|
||||
dst_word = (uint32_t *)((uintptr_t)out_buffer + load_addr_bytes);
|
||||
for (wi = 0; wi < (section_header->size / sizeof(uint32_t)); wi++) {
|
||||
dst_word[wi] = change_byte_order(word[wi]);
|
||||
}
|
||||
|
||||
return out_buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Aggregate all text sections into a single, dynamically
|
||||
* allocated buffer.
|
||||
*
|
||||
* The placement of text sections needs to take into account the loading
|
||||
* addresses.
|
||||
*
|
||||
* The endianness of the text sections needs to be changed.
|
||||
*
|
||||
* Caller is responsible for freeing the returned buffer.
|
||||
*/
|
||||
static void *aggregate_text_sections(const elf_parser_ctx elf,
|
||||
uint32_t *out_size)
|
||||
{
|
||||
const elfSectionHeader *section_header;
|
||||
uint32_t index = 0;
|
||||
const char *section_name;
|
||||
const elfWord sectionCount = elf_shnum(elf);
|
||||
void *sections_content = NULL;
|
||||
uint32_t sections_size = 0;
|
||||
|
||||
for (index = 0; index < sectionCount; index++) {
|
||||
int32_t segment_type;
|
||||
|
||||
section_header = elf_section_header(elf, index);
|
||||
if (section_header == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"aggregate_text_sections elf_section_header error");
|
||||
goto out;
|
||||
}
|
||||
|
||||
section_name = elf_section_name(elf, section_header);
|
||||
if (section_name == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"aggregate_text_sections elf_section_name error");
|
||||
goto out;
|
||||
}
|
||||
segment_type = find_pva_ucode_segment_type(section_name);
|
||||
if ((section_header->type == SHT_PROGBITS) &&
|
||||
(segment_type == (int32_t)PVA_SEG_VPU_CODE)) {
|
||||
sections_content =
|
||||
copy_text_section(elf, section_header,
|
||||
sections_content,
|
||||
&sections_size);
|
||||
if (sections_content == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"aggregate_text_sections copy_text_section error");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
out:
|
||||
*out_size = sections_size;
|
||||
return sections_content;
|
||||
}
|
||||
|
||||
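/* Append one data section to out_buffer at *buffer_offset and advance the
 * offset by the section size rounded up to DATA_SECTION_ALIGNMENT. */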
static void copy_data_section(const elf_parser_ctx elf,
|
||||
const elfSectionHeader *section_header,
|
||||
void *out_buffer, uint32_t *buffer_offset,
|
||||
uint32_t buffer_size)
|
||||
{
|
||||
const elfByte *elf_data;
|
||||
void *dst;
|
||||
uint32_t aligned_size = safe_pow2_roundup_u32(section_header->size,
|
||||
DATA_SECTION_ALIGNMENT);
|
||||
uint32_t size = safe_addu32(*buffer_offset, aligned_size);
|
||||
ASSERT(size <= buffer_size);
|
||||
|
||||
dst = pva_offset_pointer(out_buffer, *buffer_offset);
|
||||
|
||||
elf_data = elf_section_contents(elf, section_header);
|
||||
|
||||
ASSERT(elf_data != NULL);
|
||||
|
||||
memcpy(dst, elf_data, section_header->size);
|
||||
|
||||
*buffer_offset = safe_addu32(*buffer_offset, aligned_size);
|
||||
}
|
||||
|
||||
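/* Count PROGBITS data sections and compute the total buffer size needed when
 * each section is rounded up to DATA_SECTION_ALIGNMENT. */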
static enum pva_error count_data_sections(const elf_parser_ctx elf,
|
||||
uint32_t *out_n_data_sections,
|
||||
uint32_t *out_total_size)
|
||||
{
|
||||
const elfSectionHeader *section_header;
|
||||
uint32_t index = 0;
|
||||
const char *section_name;
|
||||
const elfWord sectionCount = elf_shnum(elf);
|
||||
uint32_t n_data_sections = 0;
|
||||
uint32_t total_size = 0;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
|
||||
for (index = 0; index < sectionCount; index++) {
|
||||
int32_t segment_type;
|
||||
|
||||
section_header = elf_section_header(elf, index);
|
||||
if (section_header == NULL) {
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
section_name = elf_section_name(elf, section_header);
|
||||
if (section_name == NULL) {
|
||||
err = PVA_INVAL;
|
||||
goto out;
|
||||
}
|
||||
segment_type = find_pva_ucode_segment_type(section_name);
|
||||
if ((section_header->type == SHT_PROGBITS) &&
|
||||
(segment_type == (int32_t)PVA_SEG_VPU_DATA)) {
|
||||
n_data_sections =
|
||||
addu32(n_data_sections, 1U, &math_err);
|
||||
total_size += safe_pow2_roundup_u32(
|
||||
section_header->size, DATA_SECTION_ALIGNMENT);
|
||||
}
|
||||
}
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
err = PVA_ERR_MATH_OP;
|
||||
pva_kmd_log_err("count_data_sections math error");
|
||||
goto out;
|
||||
}
|
||||
*out_n_data_sections = n_data_sections;
|
||||
*out_total_size = total_size;
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Aggregate all data sections into a single, dynamically
|
||||
* allocated buffer.
|
||||
*
|
||||
* The offset of each data section must be aligned to DATA_SECTION_ALIGNMENT.
|
||||
*
|
||||
* The caller must free the returned data buffer and out_section_infos.
|
||||
*
|
||||
*/
|
||||
static void *
|
||||
aggregate_data_sections(const elf_parser_ctx elf, uint32_t n_data_sections,
|
||||
uint32_t total_sections_size,
|
||||
struct pva_fw_data_section_info **out_section_infos)
|
||||
{
|
||||
const elfSectionHeader *section_header;
|
||||
uint32_t index = 0;
|
||||
const char *section_name;
|
||||
const elfWord sectionCount = elf_shnum(elf);
|
||||
void *sections_content = NULL;
|
||||
struct pva_fw_data_section_info *section_infos;
|
||||
uint32_t buffer_offset = 0;
|
||||
uint32_t sec_idx = 0;
|
||||
|
||||
sections_content = pva_kmd_zalloc(total_sections_size);
|
||||
if (sections_content == NULL) {
|
||||
goto err_out;
|
||||
}
|
||||
section_infos =
|
||||
pva_kmd_zalloc(sizeof(*section_infos) * n_data_sections);
|
||||
if (section_infos == NULL) {
|
||||
goto free_content;
|
||||
}
|
||||
|
||||
for (index = 0; index < sectionCount; index++) {
|
||||
int32_t segment_type;
|
||||
|
||||
section_header = elf_section_header(elf, index);
|
||||
/* Already checked when count data sections */
|
||||
ASSERT(section_header != NULL);
|
||||
|
||||
section_name = elf_section_name(elf, section_header);
|
||||
ASSERT(section_name != NULL);
|
||||
segment_type = find_pva_ucode_segment_type(section_name);
|
||||
if ((section_header->type == SHT_PROGBITS) &&
|
||||
(segment_type == (int32_t)PVA_SEG_VPU_DATA)) {
|
||||
section_infos[sec_idx].data_buf_off = buffer_offset;
|
||||
section_infos[sec_idx].vmem_addr = section_header->addr;
|
||||
section_infos[sec_idx].size = section_header->size;
|
||||
sec_idx = safe_addu32(sec_idx, 1U);
|
||||
|
||||
copy_data_section(elf, section_header, sections_content,
|
||||
&buffer_offset, total_sections_size);
|
||||
}
|
||||
}
|
||||
|
||||
*out_section_infos = section_infos;
|
||||
return sections_content;
|
||||
free_content:
|
||||
pva_kmd_free(sections_content);
|
||||
err_out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Lay out text and data sections in a single contiguous buffer that is
|
||||
* mapped to PVA IOVA space (user SID).
|
||||
*
|
||||
* We need to pad text size by an entire VPU icache size to avoid SMMU fault
|
||||
* when prefetching.
|
||||
*/
|
||||
static struct pva_kmd_device_memory *
|
||||
load_sections(struct pva_kmd_device *pva, uint8_t smmu_id,
|
||||
const void *text_section_buf, uint32_t text_size,
|
||||
const void *data_section_buf, uint32_t data_size,
|
||||
uint32_t *out_data_begin_offset)
|
||||
{
|
||||
uint32_t size = safe_addu32(text_size, (uint32_t)VPU_ICACHE_SIZE);
|
||||
uint32_t alloc_size = safe_addu32(size, data_size);
|
||||
uint32_t data_begin = safe_addu32(text_size, (uint32_t)VPU_ICACHE_SIZE);
|
||||
struct pva_kmd_device_memory *dev_mem;
|
||||
|
||||
ASSERT(TEXT_SECTION_ALIGNMENT >= DATA_SECTION_ALIGNMENT);
|
||||
/* This is guaranteed to be true as TEXT_SECTION_ALIGNMENT is more strict */
|
||||
ASSERT(data_begin % DATA_SECTION_ALIGNMENT == 0);
|
||||
|
||||
/* Map it as read-only. TODO: when VPU debugger is supported, we may
|
||||
* need to map text as READ_WRITE conditionally. */
|
||||
dev_mem = pva_kmd_device_memory_alloc_map(alloc_size, pva,
|
||||
PVA_ACCESS_RO, smmu_id);
|
||||
if (dev_mem == NULL) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcpy(dev_mem->va, text_section_buf, text_size);
|
||||
memcpy(pva_offset_pointer(dev_mem->va, data_begin), data_section_buf,
|
||||
data_size);
|
||||
|
||||
*out_data_begin_offset = data_begin;
|
||||
out:
|
||||
return dev_mem;
|
||||
}
|
||||
|
||||
static struct pva_kmd_device_memory *
|
||||
load_metainfo(struct pva_kmd_device *pva, uint64_t section_iova,
|
||||
uint32_t text_size, uint32_t data_begin_off, uint32_t data_size,
|
||||
struct pva_fw_data_section_info const *section_infos,
|
||||
uint32_t n_data_sections, struct pva_symbol_info *symbol_table,
|
||||
uint32_t n_symbols)
|
||||
{
|
||||
struct pva_kmd_device_memory *dev_mem;
|
||||
struct pva_exec_bin_resource *metainfo;
|
||||
struct pva_fw_vmem_buffer *vmem_buffers_mem;
|
||||
struct pva_fw_data_section_info *data_sections_mem;
|
||||
uint32_t i;
|
||||
uint32_t alloc_size = (uint32_t)sizeof(struct pva_exec_bin_resource);
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
|
||||
alloc_size =
|
||||
addu32(alloc_size,
|
||||
mulu32(n_data_sections,
|
||||
(uint32_t)sizeof(struct pva_fw_data_section_info),
|
||||
&math_err),
|
||||
&math_err);
|
||||
|
||||
alloc_size = addu32(alloc_size,
|
||||
mulu32(n_symbols,
|
||||
(uint32_t)sizeof(struct pva_fw_vmem_buffer),
|
||||
&math_err),
|
||||
&math_err);
|
||||
|
||||
dev_mem = pva_kmd_device_memory_alloc_map(
|
||||
alloc_size, pva, PVA_ACCESS_RO, PVA_R5_SMMU_CONTEXT_ID);
|
||||
if (dev_mem == NULL) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
metainfo = dev_mem->va;
|
||||
metainfo->code_addr_hi = iova_hi(section_iova);
|
||||
metainfo->code_addr_lo = iova_lo(section_iova);
|
||||
metainfo->code_size = text_size;
|
||||
metainfo->data_section_addr_hi =
|
||||
iova_hi(addu64(section_iova, data_begin_off, &math_err));
|
||||
metainfo->data_section_addr_lo =
|
||||
iova_lo(addu64(section_iova, data_begin_off, &math_err));
|
||||
metainfo->num_data_sections = n_data_sections;
|
||||
metainfo->num_vmem_buffers = n_symbols;
|
||||
|
||||
data_sections_mem = pva_offset_pointer(metainfo, sizeof(*metainfo));
|
||||
memcpy(data_sections_mem, section_infos,
|
||||
mulu32(n_data_sections, (uint32_t)sizeof(*section_infos),
|
||||
&math_err));
|
||||
|
||||
vmem_buffers_mem = pva_offset_pointer(
|
||||
data_sections_mem,
|
||||
mulu32(n_data_sections, (uint32_t)sizeof(*section_infos),
|
||||
&math_err));
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
dev_mem = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (i = 0; i < n_symbols; i++) {
|
||||
vmem_buffers_mem[i].addr =
|
||||
PVA_INSERT(symbol_table[i].vmem_addr,
|
||||
PVA_FW_VMEM_ADDR_MSB, PVA_FW_VMEM_ADDR_LSB) |
|
||||
PVA_INSERT((uint32_t)symbol_table[i].symbol_type,
|
||||
PVA_FW_SYM_TYPE_MSB, PVA_FW_SYM_TYPE_LSB);
|
||||
vmem_buffers_mem[i].size = symbol_table[i].size;
|
||||
}
|
||||
|
||||
out:
|
||||
return dev_mem;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_load_executable(void *executable_data, uint32_t executable_size,
|
||||
struct pva_kmd_device *pva, uint8_t dma_smmu_id,
|
||||
struct pva_kmd_exec_symbol_table *out_symbol_table,
|
||||
struct pva_kmd_device_memory **out_metainfo,
|
||||
struct pva_kmd_device_memory **out_sections)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
pva_math_error math_err = MATH_OP_SUCCESS;
|
||||
elf_parser_ctx elf = { 0 };
|
||||
uint32_t num_symbols = 0;
|
||||
uint32_t n_data_sections;
|
||||
uint32_t total_data_section_size = 0;
|
||||
struct pva_fw_data_section_info *section_infos = NULL;
|
||||
void *data_section_buf = NULL;
|
||||
void *text_section_buf = NULL;
|
||||
uint32_t total_text_section_size = 0;
|
||||
struct pva_kmd_device_memory *metainfo_mem = NULL;
|
||||
struct pva_kmd_device_memory *sections_mem = NULL;
|
||||
uint32_t data_begin_off;
|
||||
|
||||
elf.elf_file = executable_data;
|
||||
elf.size = executable_size;
|
||||
err = validate_elf(elf);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
err = count_symbols(elf, &num_symbols);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
out_symbol_table->n_symbols = num_symbols;
|
||||
if (num_symbols > 0) {
|
||||
out_symbol_table->symbols = pva_kmd_zalloc(
|
||||
mulu32((uint32_t)sizeof(struct pva_symbol_info),
|
||||
num_symbols, &math_err));
|
||||
if (out_symbol_table->symbols == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto err_out;
|
||||
}
|
||||
if (math_err != MATH_OP_SUCCESS) {
|
||||
err = PVA_ERR_MATH_OP;
|
||||
pva_kmd_log_err("pva_kmd_load_executable math error");
|
||||
goto err_out;
|
||||
}
|
||||
}
|
||||
|
||||
err = fill_symbol_table(elf, out_symbol_table,
|
||||
pva->hw_consts.n_vmem_regions,
|
||||
pva->vmem_regions_tab);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_syms;
|
||||
}
|
||||
|
||||
text_section_buf =
|
||||
aggregate_text_sections(elf, &total_text_section_size);
|
||||
/* Must have text sections */
|
||||
if (text_section_buf == NULL) {
|
||||
pva_kmd_log_err(
|
||||
"pva_kmd_load_executable aggregate_text_sections error");
|
||||
goto free_syms;
|
||||
}
|
||||
|
||||
err = count_data_sections(elf, &n_data_sections,
|
||||
&total_data_section_size);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto free_text_buf;
|
||||
}
|
||||
|
||||
/* It's OK to not have data sections */
|
||||
if (total_data_section_size != 0) {
|
||||
data_section_buf =
|
||||
aggregate_data_sections(elf, n_data_sections,
|
||||
total_data_section_size,
|
||||
&section_infos);
|
||||
ASSERT(data_section_buf != NULL);
|
||||
}
|
||||
|
||||
sections_mem = load_sections(pva, dma_smmu_id, text_section_buf,
|
||||
total_text_section_size, data_section_buf,
|
||||
total_data_section_size, &data_begin_off);
|
||||
if (sections_mem == NULL) {
|
||||
goto free_data_buf;
|
||||
}
|
||||
|
||||
metainfo_mem =
|
||||
load_metainfo(pva, sections_mem->iova, total_text_section_size,
|
||||
data_begin_off, total_data_section_size,
|
||||
section_infos, n_data_sections,
|
||||
out_symbol_table->symbols, num_symbols);
|
||||
if (metainfo_mem == NULL) {
|
||||
goto free_sec_mem;
|
||||
}
|
||||
/* Success. Now clean up temporary allocations */
|
||||
if (data_section_buf != NULL) {
|
||||
pva_kmd_free(data_section_buf);
|
||||
}
|
||||
if (section_infos != NULL) {
|
||||
pva_kmd_free(section_infos);
|
||||
}
|
||||
pva_kmd_free(text_section_buf);
|
||||
|
||||
*out_metainfo = metainfo_mem;
|
||||
*out_sections = sections_mem;
|
||||
|
||||
return PVA_SUCCESS;
|
||||
free_sec_mem:
|
||||
pva_kmd_device_memory_free(sections_mem);
|
||||
free_data_buf:
|
||||
if (data_section_buf != NULL) {
|
||||
pva_kmd_free(data_section_buf);
|
||||
}
|
||||
if (section_infos != NULL) {
|
||||
pva_kmd_free(section_infos);
|
||||
}
|
||||
free_text_buf:
|
||||
pva_kmd_free(text_section_buf);
|
||||
free_syms:
|
||||
pva_kmd_free(out_symbol_table->symbols);
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_unload_executable(struct pva_kmd_exec_symbol_table *symbol_table,
|
||||
struct pva_kmd_device_memory *metainfo,
|
||||
struct pva_kmd_device_memory *sections)
|
||||
{
|
||||
pva_kmd_device_memory_free(metainfo);
|
||||
pva_kmd_device_memory_free(sections);
|
||||
if (symbol_table->symbols != NULL) {
|
||||
pva_kmd_free(symbol_table->symbols);
|
||||
symbol_table->symbols = NULL;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_silicon_hwpm.h"
|
||||
#include "pva_kmd_silicon_utils.h"
|
||||
|
||||
#ifndef TEGRA_SOC_HWPM_IP_REG_OP_READ
|
||||
#define TEGRA_SOC_HWPM_IP_REG_OP_READ 0x1
|
||||
#endif
|
||||
#ifndef TEGRA_SOC_HWPM_IP_REG_OP_WRITE
|
||||
#define TEGRA_SOC_HWPM_IP_REG_OP_WRITE 0x2
|
||||
#endif
|
||||
int pva_kmd_hwpm_ip_reg_op(void *ip_dev, uint32_t reg_op,
|
||||
uint32_t inst_element_index, uint64_t reg_offset,
|
||||
uint32_t *reg_data)
|
||||
{
|
||||
struct pva_kmd_device *pva = ip_dev;
|
||||
|
||||
if (reg_offset > UINT32_MAX)
|
||||
return PVA_INVAL;
|
||||
|
||||
switch (reg_op) {
|
||||
case TEGRA_SOC_HWPM_IP_REG_OP_READ:
|
||||
*reg_data =
|
||||
pva_kmd_read(pva, safe_addu32(pva->regspec.cfg_perf_mon,
|
||||
reg_offset));
|
||||
break;
|
||||
case TEGRA_SOC_HWPM_IP_REG_OP_WRITE:
|
||||
pva_kmd_write(
|
||||
pva, safe_addu32(pva->regspec.cfg_perf_mon, reg_offset),
|
||||
*reg_data);
|
||||
break;
|
||||
default:
|
||||
pva_kmd_log_err("Invalid HWPM operation");
|
||||
return PVA_INVAL;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
int pva_kmd_hwpm_ip_pm(void *ip_dev, bool disable)
|
||||
{
|
||||
struct pva_kmd_device *dev = ip_dev;
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
|
||||
if (disable) {
|
||||
err = pva_kmd_device_busy(dev);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_log_err("Failed to busy");
|
||||
}
|
||||
} else {
|
||||
pva_kmd_device_idle(dev);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_SILICON_HWPM_H
|
||||
#define PVA_KMD_SILICON_HWPM_H
|
||||
#include "pva_kmd.h"
|
||||
#include "pva_kmd_shim_debugfs.h"
|
||||
|
||||
/**
|
||||
* @brief pva_kmd_hwpm_ip_pm
|
||||
*
|
||||
* This function is called from the Tegra HWPM driver to
|
||||
* power the PVA device on or off.
|
||||
*
|
||||
* @param ip_dev Pointer to PVA device
|
||||
* @param disable disable/enable power management. PVA is
|
||||
* powered on when false.
|
||||
|
||||
* @return 0 on Success or negative error code
|
||||
*
|
||||
*/
|
||||
int pva_kmd_hwpm_ip_pm(void *ip_dev, bool disable);
|
||||
|
||||
/**
|
||||
* @brief pva_kmd_hwpm_ip_reg_op
|
||||
*
|
||||
* This function is called from the Tegra HWPM driver to
|
||||
* access PVA HWPM registers.
|
||||
*
|
||||
* @param ip_dev Pointer to PVA device
|
||||
* @param reg_op access operation and can be one of
|
||||
* TEGRA_SOC_HWPM_IP_REG_OP_READ
|
||||
* TEGRA_SOC_HWPM_IP_REG_OP_WRITE
|
||||
* @param inst_element_index element index within PVA instance
|
||||
* @param reg_offset offset of register relative to PVA HWPM base
|
||||
* @param reg_data pointer to where data is to be placed or read.
|
||||
* @return 0 on Success or negative error code
|
||||
*
|
||||
*/
|
||||
int pva_kmd_hwpm_ip_reg_op(void *ip_dev, uint32_t reg_op,
|
||||
uint32_t inst_element_index, uint64_t reg_offset,
|
||||
uint32_t *reg_data);
|
||||
#endif //PVA_KMD_SILICON_HWPM_H
|
||||
@@ -0,0 +1,135 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_silicon_isr.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_fw_hyp.h"
|
||||
#include "pva_kmd_msg.h"
|
||||
|
||||
struct pva_fw_msg {
|
||||
uint8_t len;
|
||||
uint32_t data[PVA_FW_MSG_MAX_LEN];
|
||||
};
|
||||
|
||||
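/* Read a firmware-to-hypervisor message: the last mailbox holds the header
 * word with the message length, the remaining words are read from the
 * mailboxes starting at PVA_FW_MBOX_TO_HYP_BASE. */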
static void read_hyp_msg(struct pva_kmd_device *pva, struct pva_fw_msg *msg)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
msg->data[0] = pva_kmd_read_mailbox(pva, PVA_FW_MBOX_TO_HYP_LAST);
|
||||
msg->len = PVA_EXTRACT(msg->data[0], PVA_FW_MSG_LEN_MSB,
|
||||
PVA_FW_MSG_LEN_LSB, uint8_t);
|
||||
ASSERT(msg->len <= PVA_ARRAY_SIZE(msg->data));
|
||||
for (i = 1; i < msg->len; i++) {
|
||||
msg->data[i] = pva_kmd_read_mailbox(
|
||||
pva, PVA_FW_MBOX_TO_HYP_BASE + i - 1);
|
||||
}
|
||||
}
|
||||
|
||||
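/* Secondary (LIC) interrupt handler: reports watchdog timeouts and Host1x
 * errors, and dispatches mailbox messages from firmware to the hypervisor
 * message handler. */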
void pva_kmd_hyp_isr(void *data)
|
||||
{
|
||||
struct pva_kmd_device *pva = data;
|
||||
uint32_t intr_status;
|
||||
uint32_t wdt_val, hsp_val, h1x_val;
|
||||
|
||||
intr_status = pva_kmd_read(pva, pva->regspec.sec_lic_intr_status);
|
||||
|
||||
wdt_val = PVA_EXTRACT(intr_status, PVA_REG_SEC_LIC_INTR_WDT_MSB,
|
||||
PVA_REG_SEC_LIC_INTR_WDT_LSB, uint32_t);
|
||||
hsp_val = PVA_EXTRACT(intr_status, PVA_REG_SEC_LIC_INTR_HSP_MSB,
|
||||
PVA_REG_SEC_LIC_INTR_HSP_LSB, uint32_t);
|
||||
h1x_val = PVA_EXTRACT(intr_status, PVA_REG_SEC_LIC_INTR_H1X_MSB,
|
||||
PVA_REG_SEC_LIC_INTR_H1X_LSB, uint32_t);
|
||||
|
||||
if (wdt_val != 0) {
|
||||
/* Clear interrupt status */
|
||||
pva_kmd_write(pva, pva->regspec.sec_lic_intr_status,
|
||||
intr_status &
|
||||
PVA_MASK(PVA_REG_SEC_LIC_INTR_WDT_MSB,
|
||||
PVA_REG_SEC_LIC_INTR_WDT_LSB));
|
||||
/* TODO: reboot firmware when we can */
|
||||
FAULT("PVA watchdog timeout!");
|
||||
}
|
||||
|
||||
if (h1x_val != 0) {
|
||||
pva_kmd_log_err_u64("Host1x errors", h1x_val);
|
||||
/* Clear interrupt status */
|
||||
pva_kmd_write(pva, pva->regspec.sec_lic_intr_status,
|
||||
intr_status &
|
||||
PVA_MASK(PVA_REG_SEC_LIC_INTR_H1X_MSB,
|
||||
PVA_REG_SEC_LIC_INTR_H1X_LSB));
|
||||
}
|
||||
|
||||
if (hsp_val != 0) {
|
||||
struct pva_fw_msg msg = { 0 };
|
||||
|
||||
read_hyp_msg(pva, &msg);
|
||||
|
||||
pva_kmd_handle_hyp_msg(pva, &msg.data[0], msg.len);
|
||||
|
||||
msg.data[0] &= ~PVA_FW_MBOX_FULL_BIT;
|
||||
/* Clear interrupt bit in mailbox */
|
||||
pva_kmd_write_mailbox(pva, PVA_FW_MBOX_TO_HYP_LAST,
|
||||
msg.data[0]);
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t read_ccq0_status(struct pva_kmd_device *pva, uint8_t status_id)
|
||||
{
|
||||
return pva_kmd_read(pva, pva->regspec.ccq_regs[0].status[status_id]);
|
||||
}
|
||||
|
||||
static void write_ccq0_status(struct pva_kmd_device *pva, uint8_t status_id,
|
||||
uint32_t value)
|
||||
{
|
||||
pva_kmd_write(pva, pva->regspec.ccq_regs[0].status[status_id], value);
|
||||
}
|
||||
|
||||
static void read_ccq_msg(struct pva_kmd_device *pva, struct pva_fw_msg *msg)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
msg->data[0] = read_ccq0_status(pva, PVA_FW_MSG_STATUS_LAST);
|
||||
msg->len = PVA_EXTRACT(msg->data[0], PVA_FW_MSG_LEN_MSB,
|
||||
PVA_FW_MSG_LEN_LSB, uint8_t);
|
||||
ASSERT(msg->len <= PVA_ARRAY_SIZE(msg->data));
|
||||
for (i = 1; i < msg->len; i++) {
|
||||
msg->data[i] =
|
||||
read_ccq0_status(pva, PVA_FW_MSG_STATUS_BASE + i - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle interrupt from CCQ0 */
|
||||
void pva_kmd_isr(void *data)
|
||||
{
|
||||
struct pva_kmd_device *pva = data;
|
||||
uint32_t intr_status;
|
||||
|
||||
intr_status =
|
||||
read_ccq0_status(pva, 2) & PVA_REG_CCQ_STATUS2_INTR_ALL_BITS;
|
||||
pva_dbg_printf("CCQ0_INTR_STATUS 0x%x\n", intr_status);
|
||||
/* Clear interrupt status. This must be done prior to acking CCQ messages,
|
||||
* otherwise we risk losing CCQ messages.
|
||||
*/
|
||||
write_ccq0_status(pva, 2, intr_status);
|
||||
|
||||
if (intr_status & PVA_REG_CCQ_STATUS2_INTR_STATUS8_BIT) {
|
||||
struct pva_fw_msg msg;
|
||||
|
||||
read_ccq_msg(pva, &msg);
|
||||
|
||||
pva_kmd_handle_msg(pva, &msg.data[0], msg.len);
|
||||
|
||||
/* Ack through status1 write. */
|
||||
write_ccq0_status(pva, 1, 0 /* Value doesn't matter for now */);
|
||||
}
|
||||
|
||||
/* We don't care about Status7 or CCQ overflow interrupt */
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#ifndef PVA_KMD_SILICON_ISR_H
|
||||
#define PVA_KMD_SILICON_ISR_H
|
||||
#include "pva_kmd_silicon_utils.h"
|
||||
#include "pva_kmd_device.h"
|
||||
|
||||
void pva_kmd_hyp_isr(void *data);
|
||||
|
||||
void pva_kmd_isr(void *data);
|
||||
|
||||
#endif // PVA_KMD_SILICON_ISR_H
|
||||
@@ -0,0 +1,33 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_silicon_utils.h"
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
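/* Push one 64-bit entry into a CCQ FIFO as two 32-bit writes, low word first. */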
void pva_kmd_ccq_push(struct pva_kmd_device *pva, uint8_t ccq_id,
|
||||
uint64_t ccq_entry)
|
||||
{
|
||||
pva_kmd_write(pva, pva->regspec.ccq_regs[ccq_id].fifo,
|
||||
PVA_EXTRACT64(ccq_entry, 31, 0, uint32_t));
|
||||
pva_kmd_write(pva, pva->regspec.ccq_regs[ccq_id].fifo,
|
||||
PVA_EXTRACT64(ccq_entry, 63, 32, uint32_t));
|
||||
}
|
||||
|
||||
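/* Return the number of free 64-bit entries in a CCQ: the FIFO depth minus the
 * occupancy reported in STATUS2, divided by two since each entry takes two
 * words. */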
uint32_t pva_kmd_get_ccq_space(struct pva_kmd_device *pva, uint8_t ccq_id)
|
||||
{
|
||||
uint32_t status2 =
|
||||
pva_kmd_read(pva, pva->regspec.ccq_regs[ccq_id].status[2]);
|
||||
uint32_t len =
|
||||
PVA_EXTRACT(status2, PVA_REG_CCQ_STATUS2_NUM_ENTRIES_MSB,
|
||||
PVA_REG_CCQ_STATUS2_NUM_ENTRIES_LSB, uint32_t);
|
||||
return safe_subu32((uint32_t)PVA_CCQ_DEPTH, len) / 2U;
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_SILICON_UTILS_H
|
||||
#define PVA_KMD_SILICON_UTILS_H
|
||||
#include "pva_utils.h"
|
||||
#include "pva_kmd_regs.h"
|
||||
#include "pva_kmd_shim_silicon.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
static inline void pva_kmd_write(struct pva_kmd_device *pva, uint32_t addr,
|
||||
uint32_t val)
|
||||
{
|
||||
pva_dbg_printf("pva_kmd_write: addr=0x%x, val=0x%x\n", addr, val);
|
||||
pva_kmd_aperture_write(pva, PVA_KMD_APERTURE_PVA_CLUSTER, addr, val);
|
||||
}
|
||||
|
||||
static inline uint32_t pva_kmd_read(struct pva_kmd_device *pva, uint32_t addr)
|
||||
{
|
||||
uint32_t val;
|
||||
|
||||
val = pva_kmd_aperture_read(pva, PVA_KMD_APERTURE_PVA_CLUSTER, addr);
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline void pva_kmd_write_mailbox(struct pva_kmd_device *pva,
|
||||
uint32_t mailbox_idx, uint32_t val)
|
||||
{
|
||||
uint32_t gap = PVA_REG_HSP_SM1_ADDR - PVA_REG_HSP_SM0_ADDR;
|
||||
uint32_t offset = safe_mulu32(gap, mailbox_idx);
|
||||
uint32_t addr = safe_addu32(PVA_REG_HSP_SM0_ADDR, offset);
|
||||
pva_kmd_write(pva, addr, val);
|
||||
}
|
||||
|
||||
static inline uint32_t pva_kmd_read_mailbox(struct pva_kmd_device *pva,
|
||||
uint32_t mailbox_idx)
|
||||
{
|
||||
uint32_t gap = PVA_REG_HSP_SM1_ADDR - PVA_REG_HSP_SM0_ADDR;
|
||||
uint32_t offset = safe_mulu32(gap, mailbox_idx);
|
||||
uint32_t addr = safe_addu32(PVA_REG_HSP_SM0_ADDR, offset);
|
||||
return pva_kmd_read(pva, addr);
|
||||
}
|
||||
|
||||
#endif // PVA_KMD_SILICON_UTILS_H
|
||||
156
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_submitter.c
Normal file
156
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_submitter.c
Normal file
@@ -0,0 +1,156 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "pva_kmd_submitter.h"
|
||||
#include "pva_kmd_utils.h"
|
||||
|
||||
void pva_kmd_submitter_init(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_queue *queue,
|
||||
pva_kmd_mutex_t *submit_lock,
|
||||
struct pva_kmd_cmdbuf_chunk_pool *chunk_pool,
|
||||
pva_kmd_mutex_t *chunk_pool_lock,
|
||||
uint32_t *post_fence_va,
|
||||
struct pva_fw_postfence const *post_fence)
|
||||
{
|
||||
submitter->queue = queue;
|
||||
submitter->submit_lock = submit_lock;
|
||||
submitter->post_fence_va = post_fence_va;
|
||||
submitter->post_fence = *post_fence;
|
||||
submitter->fence_future_value = 0;
|
||||
submitter->chunk_pool = chunk_pool;
|
||||
submitter->chunk_pool_lock = chunk_pool_lock;
|
||||
|
||||
*submitter->post_fence_va = submitter->fence_future_value;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_submitter_prepare(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_cmdbuf_builder *builder)
|
||||
{
|
||||
enum pva_error err;
|
||||
|
||||
err = pva_kmd_cmdbuf_builder_init(builder, submitter->chunk_pool);
|
||||
if (err != PVA_SUCCESS) {
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
err_out:
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_submitter_submit_with_fence(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_cmdbuf_builder *builder,
|
||||
struct pva_fw_postfence *fence)
|
||||
{
|
||||
enum pva_error err;
|
||||
uint32_t first_chunk_id;
|
||||
uint16_t first_chunk_size;
|
||||
uint64_t first_chunk_offset;
|
||||
struct pva_fw_cmdbuf_submit_info submit_info = { 0 };
|
||||
struct pva_fw_postfence free_notifier_fence;
|
||||
|
||||
pva_kmd_cmdbuf_builder_finalize(builder, &first_chunk_id,
|
||||
&first_chunk_size);
|
||||
|
||||
pva_kmd_get_free_notifier_fence(submitter->chunk_pool, first_chunk_id,
|
||||
&free_notifier_fence);
|
||||
first_chunk_offset = pva_kmd_get_cmdbuf_chunk_res_offset(
|
||||
submitter->chunk_pool, first_chunk_id);
|
||||
|
||||
submit_info.postfences[0] = free_notifier_fence;
|
||||
submit_info.num_postfence = 1;
|
||||
if (fence->resource_id != PVA_RESOURCE_ID_INVALID) {
|
||||
submit_info.postfences[1] = *fence;
|
||||
submit_info.num_postfence = 2;
|
||||
}
|
||||
submit_info.first_chunk_resource_id =
|
||||
submitter->chunk_pool->mem_resource_id;
|
||||
submit_info.first_chunk_offset_lo = iova_lo(first_chunk_offset);
|
||||
submit_info.first_chunk_offset_hi = iova_hi(first_chunk_offset);
|
||||
submit_info.first_chunk_size = first_chunk_size;
|
||||
|
||||
pva_kmd_mutex_lock(submitter->submit_lock);
|
||||
err = pva_kmd_queue_submit(submitter->queue, &submit_info);
|
||||
if (err != PVA_SUCCESS) {
|
||||
pva_kmd_cmdbuf_builder_cancel(builder);
|
||||
}
|
||||
pva_kmd_mutex_unlock(submitter->submit_lock);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_submitter_submit(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint32_t *out_fence_val)
|
||||
{
|
||||
enum pva_error err;
|
||||
uint32_t first_chunk_id;
|
||||
uint16_t first_chunk_size;
|
||||
uint64_t first_chunk_offset;
|
||||
struct pva_fw_cmdbuf_submit_info submit_info = { 0 };
|
||||
struct pva_fw_postfence free_notifier_fence;
|
||||
|
||||
pva_kmd_cmdbuf_builder_finalize(builder, &first_chunk_id,
|
||||
&first_chunk_size);
|
||||
|
||||
pva_kmd_get_free_notifier_fence(submitter->chunk_pool, first_chunk_id,
|
||||
&free_notifier_fence);
|
||||
first_chunk_offset = pva_kmd_get_cmdbuf_chunk_res_offset(
|
||||
submitter->chunk_pool, first_chunk_id);
|
||||
|
||||
submit_info.num_postfence = 2;
|
||||
submit_info.postfences[0] = submitter->post_fence;
|
||||
submit_info.postfences[1] = free_notifier_fence;
|
||||
submit_info.first_chunk_resource_id =
|
||||
submitter->chunk_pool->mem_resource_id;
|
||||
submit_info.first_chunk_offset_lo = iova_lo(first_chunk_offset);
|
||||
submit_info.first_chunk_offset_hi = iova_hi(first_chunk_offset);
|
||||
submit_info.first_chunk_size = first_chunk_size;
|
||||
/* TODO: remove these flags once FW can execute command buffers with no engines. */
|
||||
submit_info.flags =
|
||||
PVA_INSERT8(0x3, PVA_CMDBUF_FLAGS_ENGINE_AFFINITY_MSB,
|
||||
PVA_CMDBUF_FLAGS_ENGINE_AFFINITY_LSB);
|
||||
|
||||
pva_kmd_mutex_lock(submitter->submit_lock);
|
||||
submitter->fence_future_value += 1U;
|
||||
submit_info.postfences[0].value = submitter->fence_future_value;
|
||||
err = pva_kmd_queue_submit(submitter->queue, &submit_info);
|
||||
if (err == PVA_SUCCESS) {
|
||||
*out_fence_val = submitter->fence_future_value;
|
||||
} else {
|
||||
submitter->fence_future_value -= 1U;
|
||||
pva_kmd_cmdbuf_builder_cancel(builder);
|
||||
}
|
||||
pva_kmd_mutex_unlock(submitter->submit_lock);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_submitter_wait(struct pva_kmd_submitter *submitter,
|
||||
uint32_t fence_val,
|
||||
uint32_t poll_interval_us,
|
||||
uint32_t timeout_us)
|
||||
{
|
||||
uint32_t volatile *fence_addr = submitter->post_fence_va;
|
||||
uint32_t time_spent = 0;
|
||||
|
||||
while (*fence_addr < fence_val) {
|
||||
pva_kmd_sleep_us(poll_interval_us);
|
||||
time_spent = safe_addu32(time_spent, poll_interval_us);
|
||||
if (time_spent >= timeout_us) {
|
||||
pva_kmd_log_err("pva_kmd_submitter_wait Timed out");
|
||||
return PVA_TIMEDOUT;
|
||||
}
|
||||
}
|
||||
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_SUBMITTER_H
|
||||
#define PVA_KMD_SUBMITTER_H
|
||||
#include "pva_kmd_cmdbuf.h"
|
||||
#include "pva_kmd_mutex.h"
|
||||
#include "pva_kmd_queue.h"
|
||||
|
||||
/** A thread-safe submitter utility */
|
||||
struct pva_kmd_submitter {
|
||||
/** The lock protects the submission to the queue, including
|
||||
* incrementing the post fence */
|
||||
pva_kmd_mutex_t *submit_lock;
|
||||
struct pva_kmd_queue *queue;
|
||||
uint32_t *post_fence_va;
|
||||
struct pva_fw_postfence post_fence;
|
||||
uint32_t fence_future_value;
|
||||
|
||||
/** This lock protects the use of the chunk_pool*/
|
||||
pva_kmd_mutex_t *chunk_pool_lock;
|
||||
struct pva_kmd_cmdbuf_chunk_pool *chunk_pool;
|
||||
};
|
||||
|
||||
void pva_kmd_submitter_init(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_queue *queue,
|
||||
pva_kmd_mutex_t *submit_lock,
|
||||
struct pva_kmd_cmdbuf_chunk_pool *chunk_pool,
|
||||
pva_kmd_mutex_t *chunk_pool_lock,
|
||||
uint32_t *post_fence_va,
|
||||
struct pva_fw_postfence const *post_fence);
|
||||
|
||||
enum pva_error
|
||||
pva_kmd_submitter_prepare(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_cmdbuf_builder *builder);
|
||||
|
||||
enum pva_error pva_kmd_submitter_submit(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_cmdbuf_builder *builder,
|
||||
uint32_t *out_fence_val);
|
||||
enum pva_error pva_kmd_submitter_wait(struct pva_kmd_submitter *submitter,
|
||||
uint32_t fence_val,
|
||||
uint32_t poll_interval_us,
|
||||
uint32_t timeout_us);
|
||||
enum pva_error
|
||||
pva_kmd_submitter_submit_with_fence(struct pva_kmd_submitter *submitter,
|
||||
struct pva_kmd_cmdbuf_builder *builder,
|
||||
struct pva_fw_postfence *fence);
|
||||
|
||||
/* prepare submission */
|
||||
/* add cmd */
|
||||
/* add cmd */
|
||||
/* do submit -> fence value */
|
||||
/* wait for fence */
|
||||
|
||||
/* prepare submission */
|
||||
/* add cmd */
|
||||
/* add cmd */
|
||||
/* do submit with fence (provide a fence) */
|
||||
|
||||
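/*
 * Minimal usage sketch. The poll interval and timeout values below are
 * arbitrary, and the commands appended to the builder come from the
 * pva_kmd_cmdbuf API, which is omitted here:
 *
 *   struct pva_kmd_cmdbuf_builder builder;
 *   uint32_t fence_val;
 *
 *   if (pva_kmd_submitter_prepare(submitter, &builder) != PVA_SUCCESS)
 *       return;
 *   ... append commands to the builder ...
 *   if (pva_kmd_submitter_submit(submitter, &builder, &fence_val) == PVA_SUCCESS)
 *       (void)pva_kmd_submitter_wait(submitter, fence_val, 100U, 1000000U);
 */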
#endif // PVA_KMD_SUBMITTER_H
|
||||
88
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_t23x.c
Normal file
88
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_t23x.c
Normal file
@@ -0,0 +1,88 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include "pva_kmd_t23x.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
|
||||
struct vmem_region vmem_regions_tab_t23x[PVA_VMEM_REGION_COUNT_T23X] = {
|
||||
{ .start = T23x_VMEM0_START, .end = T23x_VMEM0_END },
|
||||
{ .start = T23x_VMEM1_START, .end = T23x_VMEM1_END },
|
||||
{ .start = T23x_VMEM2_START, .end = T23x_VMEM2_END },
|
||||
};
|
||||
|
||||
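/* Fill in the T23x hardware constants, register offsets and per-CCQ register
 * map for this device. */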
void pva_kmd_device_init_t23x(struct pva_kmd_device *pva)
|
||||
{
|
||||
uint32_t ccq;
|
||||
uint32_t st_idx;
|
||||
|
||||
pva->hw_consts.hw_gen = PVA_HW_GEN2;
|
||||
pva->hw_consts.n_smmu_contexts = PVA_NUM_SMMU_CONTEXTS_T23X;
|
||||
pva->r5_image_smmu_context_id = PVA_NUM_SMMU_CONTEXTS_T23X - 1;
|
||||
pva->hw_consts.n_dma_descriptors = PVA_NUM_DMA_DESC_T23X;
|
||||
pva->hw_consts.n_user_dma_channels = PVA_DMA_NUM_CHANNELS_T23X - 1U;
|
||||
pva->hw_consts.n_hwseq_words = PVA_NUM_HWSEQ_WORDS_T23X;
|
||||
pva->hw_consts.n_dynamic_adb_buffs = PVA_NUM_DYNAMIC_ADB_BUFFS_T23X;
|
||||
pva->hw_consts.n_vmem_regions = PVA_VMEM_REGION_COUNT_T23X;
|
||||
pva->support_hwseq_frame_linking = false;
|
||||
pva->vmem_regions_tab = vmem_regions_tab_t23x;
|
||||
pva->reg_phy_base[PVA_KMD_APERTURE_PVA_CLUSTER] =
|
||||
PVA_KMD_PVA0_T23x_REG_BASE;
|
||||
pva->reg_size[PVA_KMD_APERTURE_PVA_CLUSTER] =
|
||||
PVA_KMD_PVA0_T23x_REG_SIZE;
|
||||
pva->reg_phy_base[PVA_KMD_APERTURE_VPU_DEBUG] = TEGRA_PVA0_VPU_DBG_BASE;
|
||||
pva->reg_size[PVA_KMD_APERTURE_VPU_DEBUG] = TEGRA_PVA0_VPU_DBG_SIZE;
|
||||
|
||||
pva->regspec.sec_lic_intr_enable = 0x28064;
|
||||
pva->regspec.sec_lic_intr_status = 0x2806C;
|
||||
|
||||
pva->regspec.cfg_user_sid_base = 0x240000;
|
||||
pva->regspec.cfg_priv_sid = 0x240020;
|
||||
pva->regspec.cfg_vps_sid = 0x240024;
|
||||
pva->regspec.cfg_r5user_lsegreg = 0x250008;
|
||||
pva->regspec.cfg_r5user_usegreg = 0x25001c;
|
||||
pva->regspec.cfg_priv_ar1_lsegreg = 0x25000c;
|
||||
pva->regspec.cfg_priv_ar1_usegreg = 0x250020;
|
||||
pva->regspec.cfg_priv_ar2_lsegreg = 0x250010;
|
||||
pva->regspec.cfg_priv_ar2_usegreg = 0x250024;
|
||||
pva->regspec.cfg_priv_ar1_start = 0x250028;
|
||||
pva->regspec.cfg_priv_ar1_end = 0x25002c;
|
||||
pva->regspec.cfg_priv_ar2_start = 0x250030;
|
||||
pva->regspec.cfg_priv_ar2_end = 0x250034;
|
||||
|
||||
pva->regspec.cfg_scr_priv_0 = 0x18004;
|
||||
pva->regspec.cfg_perf_mon = 0x200000;
|
||||
|
||||
pva->regspec.ccq_count = 8U;
|
||||
/* For VPU 0*/
|
||||
pva->regspec.vpu_dbg_instr_reg_offset[0] = 0x50000U;
|
||||
/* For VPU 1*/
|
||||
pva->regspec.vpu_dbg_instr_reg_offset[1] = 0x70000U;
|
||||
for (ccq = 0; ccq < pva->regspec.ccq_count; ccq++) {
|
||||
uint32_t n_st = PVA_CFG_CCQ_STATUS_COUNT;
|
||||
uint32_t ccq_base = safe_addu32(
|
||||
(uint32_t)0x260000,
|
||||
safe_mulu32((uint32_t)PVA_CFG_CCQ_BLOCK_SIZE, ccq));
|
||||
pva->regspec.ccq_regs[ccq].status_count = n_st;
|
||||
pva->regspec.ccq_regs[ccq].fifo = ccq_base;
|
||||
for (st_idx = 0; st_idx < n_st; st_idx++) {
|
||||
pva->regspec.ccq_regs[ccq].status[st_idx] = safe_addu32(
|
||||
ccq_base,
|
||||
safe_addu32((uint32_t)0x4U,
|
||||
safe_mulu32((uint32_t)0x4U,
|
||||
st_idx)));
|
||||
}
|
||||
}
|
||||
|
||||
#if PVA_SUPPORT_XBAR_RAW == 1
|
||||
pva->bl_sector_pack_format = PVA_BL_XBAR_RAW;
|
||||
#else
|
||||
pva->bl_sector_pack_format = PVA_BL_TEGRA_RAW;
|
||||
#endif
|
||||
}
|
||||
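As a reading aid, here is a minimal, self-contained sketch of how the CCQ register offsets initialized above lay out. The block size and status-register count are illustrative placeholders, not values taken from this change.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical values for illustration only; the real constants live in
 * pva_kmd_constants.h. */
#define EXAMPLE_CCQ_BLOCK_SIZE   0x10000U
#define EXAMPLE_CCQ_STATUS_COUNT 2U

int main(void)
{
        for (uint32_t ccq = 0U; ccq < 8U; ccq++) {
                uint32_t base = 0x260000U + (EXAMPLE_CCQ_BLOCK_SIZE * ccq);

                printf("CCQ%u: fifo at 0x%x\n", ccq, base);
                for (uint32_t st = 0U; st < EXAMPLE_CCQ_STATUS_COUNT; st++)
                        printf("  status[%u] at 0x%x\n", st,
                               base + 0x4U + (0x4U * st));
        }
        return 0;
}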
39
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_t23x.h
Normal file
@@ -0,0 +1,39 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_KMD_T23X_H
#define PVA_KMD_T23X_H
#include "pva_kmd_device.h"

/** Number of VMEM regions */
#define PVA_VMEM_REGION_COUNT_T23X 3U

/** Start Address of VMEM0 Bank in T23X */
#define T23x_VMEM0_START 0x40U
/** End Address of VMEM0 Bank in T23X */
#define T23x_VMEM0_END 0x20000U
/** Start Address of VMEM1 Bank in T23X */
#define T23x_VMEM1_START 0x40000U
/** End Address of VMEM1 Bank in T23X */
#define T23x_VMEM1_END 0x60000U
/** Start Address of VMEM2 Bank in T23X */
#define T23x_VMEM2_START 0x80000U
/** End Address of VMEM2 Bank in T23X */
#define T23x_VMEM2_END 0xA0000U

/** @brief Base address for PVA0 VPU Debug Register space (CSITE_PVA0VPU) */
#define TEGRA_PVA0_VPU_DBG_BASE 0x24740000U
/** @brief Size (in bytes) of the PVA0 VPU Debug Register space (CSITE_PVA0VPU) */
#define TEGRA_PVA0_VPU_DBG_SIZE 0x40000U

void pva_kmd_device_init_t23x(struct pva_kmd_device *pva);

#endif // PVA_KMD_T23X_H
84
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_t26x.c
Normal file
@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#include "pva_kmd_t26x.h"
#include "pva_kmd_constants.h"

struct vmem_region vmem_regions_tab_t26x[PVA_VMEM_REGION_COUNT_T26X] = {
        { .start = T26x_VMEM0_START, .end = T26x_VMEM0_END },
        { .start = T26x_VMEM1_START, .end = T26x_VMEM1_END },
        { .start = T26x_VMEM2_START, .end = T26x_VMEM2_END },
        { .start = T26x_VMEM3_START, .end = T26x_VMEM3_END },
};

void pva_kmd_device_init_t26x(struct pva_kmd_device *pva)
{
        uint32_t ccq;
        uint32_t st_idx;

        pva->hw_consts.hw_gen = PVA_HW_GEN3;
        pva->hw_consts.n_smmu_contexts = PVA_NUM_SMMU_CONTEXTS_T26X;
        pva->r5_image_smmu_context_id = PVA_NUM_SMMU_CONTEXTS_T26X - 1;
        pva->hw_consts.n_dma_descriptors = PVA_NUM_DMA_DESC_T26X;
        pva->hw_consts.n_user_dma_channels = PVA_DMA_NUM_CHANNELS_T26X - 1U;
        pva->hw_consts.n_hwseq_words = PVA_NUM_HWSEQ_WORDS_T26X;
        pva->hw_consts.n_dynamic_adb_buffs = PVA_NUM_DYNAMIC_ADB_BUFFS_T26X;
        pva->hw_consts.n_vmem_regions = PVA_VMEM_REGION_COUNT_T26X;
        pva->vmem_regions_tab = vmem_regions_tab_t26x;
        pva->support_hwseq_frame_linking = true;
        pva->reg_phy_base[PVA_KMD_APERTURE_PVA_CLUSTER] =
                PVA_KMD_PVA0_T26x_REG_BASE;
        pva->reg_size[PVA_KMD_APERTURE_PVA_CLUSTER] =
                PVA_KMD_PVA0_T26x_REG_SIZE;
        pva->reg_phy_base[PVA_KMD_APERTURE_VPU_DEBUG] = TEGRA_PVA0_VPU_DBG_BASE;
        pva->reg_size[PVA_KMD_APERTURE_VPU_DEBUG] = TEGRA_PVA0_VPU_DBG_SIZE;

        pva->regspec.sec_lic_intr_enable = 0x28064;
        pva->regspec.sec_lic_intr_status = 0x2806C;

        pva->regspec.cfg_user_sid_base = 0x240000;
        pva->regspec.cfg_priv_sid = 0x240020;
        pva->regspec.cfg_vps_sid = 0x240024;
        pva->regspec.cfg_r5user_lsegreg = 0x250008;
        pva->regspec.cfg_r5user_usegreg = 0x25001c;
        pva->regspec.cfg_priv_ar1_lsegreg = 0x25000c;
        pva->regspec.cfg_priv_ar1_usegreg = 0x250020;
        pva->regspec.cfg_priv_ar2_lsegreg = 0x250010;
        pva->regspec.cfg_priv_ar2_usegreg = 0x250024;
        pva->regspec.cfg_priv_ar1_start = 0x250028;
        pva->regspec.cfg_priv_ar1_end = 0x25002c;
        pva->regspec.cfg_priv_ar2_start = 0x250030;
        pva->regspec.cfg_priv_ar2_end = 0x250034;

        pva->regspec.cfg_scr_priv_0 = 0x18004;
        pva->regspec.cfg_perf_mon = 0x200000;

        pva->regspec.ccq_count = 8U;
        /* For VPU 0 */
        pva->regspec.vpu_dbg_instr_reg_offset[0] = 0x50000U;
        /* For VPU 1 */
        pva->regspec.vpu_dbg_instr_reg_offset[1] = 0x70000U;
        for (ccq = 0; ccq < pva->regspec.ccq_count; ccq++) {
                uint32_t n_st = PVA_CFG_CCQ_STATUS_COUNT;
                uint32_t ccq_base = safe_addu32(
                        (uint32_t)0x260000,
                        safe_mulu32((uint32_t)PVA_CFG_CCQ_BLOCK_SIZE, ccq));
                pva->regspec.ccq_regs[ccq].status_count = n_st;
                pva->regspec.ccq_regs[ccq].fifo = ccq_base;
                for (st_idx = 0; st_idx < n_st; st_idx++) {
                        pva->regspec.ccq_regs[ccq].status[st_idx] = safe_addu32(
                                ccq_base,
                                safe_addu32((uint32_t)0x4U,
                                            safe_mulu32((uint32_t)0x4U,
                                                        st_idx)));
                }
        }
        pva->bl_sector_pack_format = PVA_BL_TEGRA_RAW;
}
46
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_t26x.h
Normal file
@@ -0,0 +1,46 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_KMD_T26X_H
#define PVA_KMD_T26X_H
#include "pva_kmd_device.h"

#define PVA_KMD_PVA0_T26x_REG_BASE 0x818c000000
#define PVA_KMD_PVA0_T26x_REG_SIZE 0x900000

/** Number of VMEM regions in T26X */
#define PVA_VMEM_REGION_COUNT_T26X 4U

/** Start Address of VMEM0 Bank in T26X */
#define T26x_VMEM0_START 0x40U
/** End Address of VMEM0 Bank in T26X */
#define T26x_VMEM0_END 0x20000U
/** Start Address of VMEM1 Bank in T26X */
#define T26x_VMEM1_START 0x40000U
/** End Address of VMEM1 Bank in T26X */
#define T26x_VMEM1_END 0x60000U
/** Start Address of VMEM2 Bank in T26X */
#define T26x_VMEM2_START 0x80000U
/** End Address of VMEM2 Bank in T26X */
#define T26x_VMEM2_END 0xA0000U
/** Start Address of VMEM3 Bank in T26X */
#define T26x_VMEM3_START 0xC0000U
/** End Address of VMEM3 Bank in T26X */
#define T26x_VMEM3_END 0xE0000U

/** @brief Base address for PVA0 VPU Debug Register space (CSITE_PVA0VPU) */
#define TEGRA_PVA0_VPU_DBG_BASE 0x24740000U
/** @brief Size (in bytes) of the PVA0 VPU Debug Register space (CSITE_PVA0VPU) */
#define TEGRA_PVA0_VPU_DBG_SIZE 0x40000U

void pva_kmd_device_init_t26x(struct pva_kmd_device *pva);

#endif // PVA_KMD_T26X_H
@@ -0,0 +1,141 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#include "pva_api_cmdbuf.h"
#include "pva_api_types.h"
#include "pva_bit.h"
#include "pva_fw.h"
#include "pva_kmd_cmdbuf.h"
#include "pva_kmd_device.h"
#include "pva_kmd_constants.h"
#include "pva_utils.h"
#include "pva_kmd_tegra_stats.h"

void pva_kmd_device_init_tegra_stats(struct pva_kmd_device *pva)
{
        enum pva_error err = PVA_SUCCESS;

        pva->tegra_stats_buf_size = sizeof(struct pva_kmd_fw_tegrastats);

        pva->tegra_stats_memory =
                pva_kmd_device_memory_alloc_map(pva->tegra_stats_buf_size, pva,
                                                PVA_ACCESS_RW,
                                                PVA_R5_SMMU_CONTEXT_ID);
        ASSERT(pva->tegra_stats_memory != NULL);

        err = pva_kmd_add_dram_buffer_resource(&pva->dev_resource_table,
                                               pva->tegra_stats_memory,
                                               &pva->tegra_stats_resource_id);
        ASSERT(err == PVA_SUCCESS);
        pva_kmd_update_fw_resource_table(&pva->dev_resource_table);
}

void pva_kmd_device_deinit_tegra_stats(struct pva_kmd_device *pva)
{
        pva_kmd_drop_resource(&pva->dev_resource_table,
                              pva->tegra_stats_resource_id);
}

enum pva_error
pva_kmd_notify_fw_get_tegra_stats(struct pva_kmd_device *pva,
                                  struct pva_kmd_tegrastats *kmd_tegra_stats)
{
        struct pva_kmd_cmdbuf_builder builder;
        struct pva_kmd_submitter *dev_submitter = &pva->submitter;
        struct pva_cmd_get_tegra_stats *cmd;
        uint64_t buffer_offset = 0U;
        uint32_t fence_val;
        enum pva_error err;
        struct pva_kmd_fw_tegrastats *fw_tegra_stats;
        bool stats_enabled = pva->debugfs_context.stats_enable;
        uint64_t duration = 0U;

        /* Power on PVA if not already */
        err = pva_kmd_device_busy(pva);
        if (err != PVA_SUCCESS) {
                pva_kmd_log_err(
                        "pva_kmd_device_busy failed when submitting tegra stats cmd");
                return err;
        }

        err = pva_kmd_submitter_prepare(dev_submitter, &builder);
        if (err != PVA_SUCCESS) {
                goto err_out;
        }
        cmd = pva_kmd_reserve_cmd_space(&builder, sizeof(*cmd));
        ASSERT(cmd != NULL);

        pva_kmd_set_cmd_get_tegra_stats(cmd, pva->tegra_stats_resource_id,
                                        pva->tegra_stats_buf_size,
                                        buffer_offset, stats_enabled);

        err = pva_kmd_submitter_submit(dev_submitter, &builder, &fence_val);
        if (err != PVA_SUCCESS) {
                pva_kmd_log_err("tegra stats cmd submission failed");
                goto cancel_builder;
        }

        err = pva_kmd_submitter_wait(dev_submitter, fence_val,
                                     PVA_KMD_WAIT_FW_POLL_INTERVAL_US,
                                     PVA_KMD_WAIT_FW_TIMEOUT_US);
        if (err != PVA_SUCCESS) {
                pva_kmd_log_err(
                        "Waiting for FW timed out when getting tegra stats");
                goto err_out;
        }

        if (stats_enabled == false)
                goto err_out;

        fw_tegra_stats =
                (struct pva_kmd_fw_tegrastats *)(pva->tegra_stats_memory->va);

        duration = safe_subu64(fw_tegra_stats->window_end_time,
                               fw_tegra_stats->window_start_time);
        if (duration == 0) {
                pva_kmd_print_str("VPU Stats: Duration is zero");
                goto err_out;
        }

        pva_kmd_print_str("VPU Stats");
        pva_kmd_print_str_u64("Window Start Time",
                              fw_tegra_stats->window_start_time);
        pva_kmd_print_str_u64("Window End Time",
                              fw_tegra_stats->window_end_time);
        pva_kmd_print_str_u64("Total utilization VPU 0",
                              fw_tegra_stats->total_utilization[0]);
        pva_kmd_print_str_u64("Total utilization VPU 1",
                              fw_tegra_stats->total_utilization[1]);
        pva_kmd_print_str_u64(
                "VPU 0 percent utilization",
                safe_mulu64(100ULL, fw_tegra_stats->total_utilization[0]) /
                        duration);
        pva_kmd_print_str_u64(
                "VPU 1 percent utilization",
                safe_mulu64(100ULL, fw_tegra_stats->total_utilization[1]) /
                        duration);

        kmd_tegra_stats->average_vpu_utilization[0] =
                safe_mulu64(100ULL, fw_tegra_stats->total_utilization[0]) /
                duration;
        kmd_tegra_stats->average_vpu_utilization[1] =
                safe_mulu64(100ULL, fw_tegra_stats->total_utilization[1]) /
                duration;
        kmd_tegra_stats->window_start_time = fw_tegra_stats->window_start_time;
        kmd_tegra_stats->window_end_time = fw_tegra_stats->window_end_time;

        err = PVA_SUCCESS;

cancel_builder:
        pva_kmd_cmdbuf_builder_cancel(&builder);
err_out:
        pva_kmd_device_idle(pva);
        return err;
}
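The utilization figures reported above are plain integer arithmetic over the firmware counters. A minimal sketch of that calculation with made-up counter values (the field names mirror the structures in this change; the numbers are illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Illustrative firmware counters: a 2,000,000 us window in which
         * VPU0 was busy 1,500,000 us and VPU1 was busy 500,000 us. */
        uint64_t window_start_time = 1000000ULL;
        uint64_t window_end_time = 3000000ULL;
        uint64_t total_utilization[2] = { 1500000ULL, 500000ULL };

        uint64_t duration = window_end_time - window_start_time;
        uint64_t pct0 = (100ULL * total_utilization[0]) / duration; /* 75 */
        uint64_t pct1 = (100ULL * total_utilization[1]) / duration; /* 25 */

        printf("VPU0 %llu%%, VPU1 %llu%%\n",
               (unsigned long long)pct0, (unsigned long long)pct1);
        return 0;
}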
@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#ifndef PVA_KMD_TEGRA_STATS_H
#define PVA_KMD_TEGRA_STATS_H
#include "pva_kmd_device.h"

/**
 * @brief Structure that holds VPU stats information
 */
struct pva_kmd_tegrastats {
        /** Holds VPU utilization as a percentage for each VPU in the PVA */
        uint64_t average_vpu_utilization[PVA_NUM_PVE];
        /** Start and end timestamps of the stats sampling window */
        uint64_t window_start_time;
        uint64_t window_end_time;
};

void pva_kmd_device_init_tegra_stats(struct pva_kmd_device *pva);

void pva_kmd_device_deinit_tegra_stats(struct pva_kmd_device *pva);

enum pva_error
pva_kmd_notify_fw_get_tegra_stats(struct pva_kmd_device *pva,
                                  struct pva_kmd_tegrastats *kmd_tegra_stats);

#endif
@@ -0,0 +1,148 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#include "pva_kmd_mutex.h"
#include "pva_kmd_utils.h"
#include "pva_kmd_thread_sema.h"
#include "pva_kmd_device_memory.h"
#include <pthread.h>
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>

enum pva_error pva_kmd_mutex_init(pva_kmd_mutex_t *m)
{
        int ret = pthread_mutex_init(m, NULL);
        ASSERT(ret == 0);

        return PVA_SUCCESS;
}

void pva_kmd_mutex_lock(pva_kmd_mutex_t *m)
{
        int ret = pthread_mutex_lock(m);
        ASSERT(ret == 0);
}

void pva_kmd_mutex_unlock(pva_kmd_mutex_t *m)
{
        int ret = pthread_mutex_unlock(m);
        ASSERT(ret == 0);
}

void pva_kmd_mutex_deinit(pva_kmd_mutex_t *m)
{
        int ret = pthread_mutex_destroy(m);
        ASSERT(ret == 0);
}

void *pva_kmd_zalloc(uint64_t size)
{
        return calloc(1, size);
}

void pva_kmd_free(void *ptr)
{
        free(ptr);
}

void pva_kmd_fault(void)
{
        abort();
}

void pva_kmd_sema_init(pva_kmd_sema_t *sem, uint32_t val)
{
        int ret;

        ret = sem_init(sem, 0 /* Only sharing in threads */, val);
        ASSERT(ret == 0);
}

enum pva_error pva_kmd_sema_wait_timeout(pva_kmd_sema_t *sem,
                                         uint32_t timeout_ms)
{
        struct timespec ts;
        int ret;
        ret = clock_gettime(CLOCK_REALTIME, &ts);
        ASSERT(ret == 0);

        /* Add timeout (specified in milliseconds) to the current time */
        ts.tv_sec += timeout_ms / 1000;
        ts.tv_nsec += (timeout_ms % 1000) * 1000000;

        /* Handle case where nanoseconds exceed 1 second */
        if (ts.tv_nsec >= 1000000000) {
                ts.tv_nsec -= 1000000000;
                ts.tv_sec += 1;
        }

wait_again:
        ret = sem_timedwait(sem, &ts);
        if (ret != 0) {
                if (errno == ETIMEDOUT) {
                        pva_kmd_log_err("pva_kmd_sema_wait_timeout Timed out");
                        return PVA_TIMEDOUT;
                } else if (errno == EINTR) {
                        goto wait_again;
                } else {
                        FAULT("Unexpected sem_timedwait error");
                }
        }

        return PVA_SUCCESS;
}

void pva_kmd_sema_deinit(pva_kmd_sema_t *sem)
{
        int ret = sem_destroy(sem);
        ASSERT(ret == 0);
}

void pva_kmd_sema_post(pva_kmd_sema_t *sem)
{
        int ret = sem_post(sem);
        ASSERT(ret == 0);
}

struct pva_kmd_device_memory *
pva_kmd_device_memory_alloc_map(uint64_t size, struct pva_kmd_device *pva,
                                uint32_t iova_access_flags,
                                uint32_t smmu_ctx_idx)
{
        struct pva_kmd_device_memory *mem;
        enum pva_error err;

        mem = pva_kmd_device_memory_alloc(size);

        if (mem == NULL) {
                goto err_out;
        }

        err = pva_kmd_device_memory_iova_map(mem, pva, iova_access_flags,
                                             smmu_ctx_idx);
        if (err != PVA_SUCCESS) {
                goto free_mem;
        }

        err = pva_kmd_device_memory_cpu_map(mem);
        if (err != PVA_SUCCESS) {
                goto iova_unmap;
        }

        return mem;
iova_unmap:
        pva_kmd_device_memory_iova_unmap(mem);
free_mem:
        pva_kmd_device_memory_free(mem);
err_out:
        return NULL;
}
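The absolute-deadline arithmetic used by pva_kmd_sema_wait_timeout() above can be exercised in isolation. A minimal, self-contained POSIX sketch that mirrors the same computation and EINTR handling (the 200 ms timeout is arbitrary; since nothing posts the semaphore, this example simply times out):

#include <errno.h>
#include <semaphore.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        sem_t sem;
        struct timespec ts;
        unsigned int timeout_ms = 200;
        int ret;

        sem_init(&sem, 0, 0);

        /* Same absolute-deadline arithmetic as the shim above. */
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += timeout_ms / 1000;
        ts.tv_nsec += (long)(timeout_ms % 1000) * 1000000L;
        if (ts.tv_nsec >= 1000000000L) {
                ts.tv_nsec -= 1000000000L;
                ts.tv_sec += 1;
        }

        /* Retry on EINTR, as the shim does with its wait_again label. */
        do {
                ret = sem_timedwait(&sem, &ts);
        } while (ret != 0 && errno == EINTR);

        printf("%s\n", ret == 0 ? "signalled" : "timed out");
        sem_destroy(&sem);
        return 0;
}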
29
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_utils.c
Normal file
@@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#include "pva_kmd_utils.h"

void *pva_kmd_zalloc_nofail(uint64_t size)
{
        void *ptr = pva_kmd_zalloc(size);
        ASSERT(ptr != NULL);
        return ptr;
}

void pva_kmd_log_err(const char *msg)
{
        pva_kmd_print_str(msg);
}

void pva_kmd_log_err_u64(const char *msg, uint64_t val)
{
        pva_kmd_print_str_u64(msg, val);
}
28
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_utils.h
Normal file
@@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_KMD_UTILS_H
#define PVA_KMD_UTILS_H
#include "pva_kmd.h"
#include "pva_api.h"
#include "pva_kmd_shim_utils.h"
#include "pva_bit.h"
#include "pva_utils.h"
#include "pva_plat_faults.h"
#include "pva_math_utils.h"

#define SIZE_4KB (4 * 1024)

void pva_kmd_log_err(const char *msg);
void pva_kmd_log_err_u64(const char *msg, uint64_t val);
void *pva_kmd_zalloc_nofail(uint64_t size);

#endif // PVA_KMD_UTILS_H
@@ -0,0 +1,368 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#include "pva_api_types.h"
#include "pva_kmd_vpu_app_auth.h"
#include "pva_kmd_device.h"
#include "pva_kmd_sha256.h"
#include "pva_kmd_utils.h"

enum pva_error pva_kmd_init_vpu_app_auth(struct pva_kmd_device *pva, bool ena)
{
        enum pva_error err = PVA_SUCCESS;
        const char *default_path = pva_kmd_get_default_allowlist();
        size_t default_path_len;
        struct pva_vpu_auth *pva_auth = pva_kmd_zalloc(sizeof(*pva_auth));
        if (pva_auth == NULL) {
                pva_kmd_log_err("Unable to allocate memory");
                return PVA_NOMEM;
        }

        pva->pva_auth = pva_auth;
        ASSERT(pva_auth != NULL);

        pva_auth->vpu_hash_keys = NULL;
        pva_auth->pva_auth_allow_list_parsed = false;
        /* TODO - This will be disabled by default. Authentication will be
         * enabled based on 2 things:
         * 1. Debug FS (for non-production)
         * 2. Device tree property (for production)
         * Either of the 2 conditions, if satisfied, will enable authentication.
         */
        pva_auth->pva_auth_enable = ena;
        default_path_len = strnlen(default_path, ALLOWLIST_FILE_LEN);
        if (default_path_len > 0U) {
                (void)memcpy(pva_auth->pva_auth_allowlist_path, default_path,
                             default_path_len);
        }

        return err;
}

/**
 * \brief
 * is_key_match calculates the sha256 key of the ELF and checks if it matches key.
 * \param[in] dataptr Pointer to the data for which sha256 is to be calculated
 * \param[in] size length in bytes of the data for which sha256 is to be calculated.
 * \param[in] key the key with which the calculated key is compared for a match.
 * \return The completion status of the operation. Possible values are:
 * \ref PVA_SUCCESS Success. Passed in key matched the calculated key.
 * \ref PVA_EACCES Passed in key doesn't match the calculated key.
 */
static enum pva_error is_key_match(uint8_t *dataptr, size_t size,
                                   struct shakey key)
{
        enum pva_error err = PVA_SUCCESS;
        int32_t status = 0;
        uint32_t calc_key[8];
        size_t off;
        struct sha256_ctx ctx1;
        struct sha256_ctx ctx2;

        sha256_init(&ctx1);
        off = (size / 64U) * 64U;
        if (off > 0U) {
                sha256_update(&ctx1, dataptr, off);
        }

        /* clone */
        sha256_copy(&ctx1, &ctx2);

        /* finalize with leftover, if any */
        sha256_finalize(&ctx2, dataptr + off, size % 64U, calc_key);

        status = memcmp((void *)&(key.sha_key), (void *)calc_key,
                        NVPVA_SHA256_DIGEST_SIZE);
        if (status != 0) {
                err = PVA_EACCES;
        }

        return err;
}

/**
 * \brief
 * Keeps checking all the keys associated with match_hash
 * against the calculated sha256 key for dataptr, until it finds a match.
 * \param[in] pallkeys Pointer to array of SHA keys \ref shakey
 * \param[in] dataptr pointer to ELF data
 * \param[in] size length (in bytes) of ELF data
 * \param[in] match_hash pointer to matching hash structure, \ref struct vpu_hash_vector.
 * \return Matching status of the calculated key
 * against the keys associated with match_hash. Possible values:
 * - PVA_SUCCESS Success, one of the keys associated with match_hash
 *   matches the calculated sha256 key.
 * - PVA_EACCES if no match is found.
 */
static enum pva_error
check_all_keys_for_match(struct shakey *pallkeys, uint8_t *dataptr, size_t size,
                         const struct vpu_hash_vector *match_hash)
{
        enum pva_error err = PVA_SUCCESS;
        uint32_t idx;
        uint32_t count;
        uint32_t end;
        struct shakey key;
        uint32_t i;

        idx = match_hash->index;
        count = match_hash->count;
        end = idx + count;
        if (end < idx) {
                err = PVA_ERANGE;
                goto fail;
        }

        for (i = 0; i < count; i++) {
                key = pallkeys[idx + i];
                err = is_key_match(dataptr, size, key);
                if (err == PVA_SUCCESS) {
                        break;
                }
        }
fail:
        return err;
}

/**
 * @brief
 * Helper function for \ref binary_search.
 * Uses a specific field in @ref pkey to compare with the same field in @ref pbase.
 * @param[in] pkey pointer to the object that needs to be compared.
 * @param[in] pbase pointer to the starting element of the array.
 * @retval
 * - -1 when @ref pkey is less than the starting element of the array pointed to by @ref pbase.
 * - 1 when @ref pkey is greater than the starting element of the array pointed to by @ref pbase.
 * - 0 when @ref pkey is equal to the starting element of the array pointed to by @ref pbase.
 */
static int32_t compare_hash_value(const struct vpu_hash_vector *pkey,
                                  const struct vpu_hash_vector *pbase)
{
        int32_t ret;

        if (pkey->crc32_hash < pbase->crc32_hash) {
                ret = -1;
        } else if (pkey->crc32_hash > pbase->crc32_hash) {
                ret = 1;
        } else {
                ret = 0;
        }

        return ret;
}

/**
 * @brief
 * Calculates crc32.
 * @param[in] crc initial crc value, usually 0.
 * @param[in] buf pointer to the buffer whose crc32 is to be calculated.
 * @param[in] len length (in bytes) of data at @ref buf.
 * @retval value of the calculated crc32.
 */
static uint32_t pva_crc32(uint32_t crc, uint8_t *buf, size_t len)
{
        int32_t k;
        size_t count;

        count = len;
        crc = ~crc;
        while (count != 0U) {
                crc ^= *buf++;
                for (k = 0; k < 8; k++) {
                        crc = ((crc & 1U) == 1U) ? (crc >> 1U) ^ 0xedb88320U :
                                                   crc >> 1U;
                }

                count--;
        }

        return ~crc;
}

static const struct vpu_hash_vector *
binary_search(const struct vpu_hash_vector *key,
              const struct vpu_hash_vector *base, size_t num_elems,
              int32_t (*compare)(const struct vpu_hash_vector *pkey,
                                 const struct vpu_hash_vector *pbase))
{
        size_t low = 0U;
        size_t high;

        if (num_elems == 0U) {
                return NULL;
        }

        high = num_elems - 1U;
        for (;;) {
                const struct vpu_hash_vector *mid_elem;
                int32_t r;
                size_t mid = low + ((high - low) / 2U);

                mid_elem = &(base[mid]);
                r = compare(key, mid_elem);

                if (r < 0) {
                        if (mid == 0U) {
                                return NULL;
                        }

                        high = mid - 1U;
                } else if (r > 0) {
                        low = mid + 1U;
                        if (low < mid || low > high) {
                                return NULL;
                        }
                } else {
                        return mid_elem;
                }
        }
}

static enum pva_error
pva_kmd_vpu_check_sha256_key(struct vpu_hash_key_pair *vpu_hash_keys,
                             uint8_t *dataptr, size_t size)
{
        enum pva_error err = PVA_SUCCESS;
        struct vpu_hash_vector cal_Hash;
        const struct vpu_hash_vector *match_Hash;

        cal_Hash.crc32_hash = pva_crc32(0L, dataptr, size);

        match_Hash = (const struct vpu_hash_vector *)binary_search(
                &cal_Hash, vpu_hash_keys->pvpu_hash_vector,
                vpu_hash_keys->num_hashes, compare_hash_value);
        if (match_Hash == NULL) {
                pva_kmd_log_err("No Hash Match Found");
                err = PVA_EACCES;
                goto fail;
        }

        err = check_all_keys_for_match(vpu_hash_keys->psha_key, dataptr, size,
                                       match_Hash);
        if (err != PVA_SUCCESS) {
                pva_kmd_log_err("Match key not found");
        }
fail:
        return err;
}

enum pva_error pva_kmd_verify_exectuable_hash(struct pva_kmd_device *pva,
                                              uint8_t *dataptr, size_t size)
{
        enum pva_error err = PVA_SUCCESS;
        struct pva_vpu_auth *pva_auth;

        ASSERT(pva != NULL);
        ASSERT(dataptr != NULL);
        pva_auth = pva->pva_auth;
        ASSERT(pva_auth != NULL);

        pva_kmd_mutex_lock(&pva_auth->allow_list_lock);
        if (pva_auth->pva_auth_enable) {
                pva_dbg_printf("App authentication enabled");
                if (pva_auth->pva_auth_allow_list_parsed == false) {
                        err = pva_kmd_allowlist_parse(pva);
                        if (err == PVA_SUCCESS) {
                                pva_dbg_printf(
                                        "App authentication allowlist parsing successful");
                        } else {
                                pva_dbg_printf(
                                        "App authentication allowlist parsing failed");
                        }
                }

                if (err == PVA_SUCCESS) {
                        err = pva_kmd_vpu_check_sha256_key(
                                pva_auth->vpu_hash_keys, (uint8_t *)dataptr,
                                size);
                        if (err == PVA_SUCCESS) {
                                pva_dbg_printf(
                                        "App authentication successful");
                        } else {
                                pva_dbg_printf("App authentication failed : %d",
                                               err);
                        }
                }
        } else {
                pva_dbg_printf("App authentication disabled");
        }

        pva_kmd_mutex_unlock(&pva_auth->allow_list_lock);

        return err;
}

static void pva_kmd_allowlist_destroy(struct pva_vpu_auth *pva_auth)
{
        if (pva_auth->vpu_hash_keys != NULL) {
                pva_kmd_free(pva_auth->vpu_hash_keys->ptr_file_data);
                pva_kmd_free(pva_auth->vpu_hash_keys);
                pva_auth->vpu_hash_keys = NULL;
        }
}

enum pva_error pva_kmd_allowlist_parse(struct pva_kmd_device *pva)
{
        struct pva_vpu_auth *pva_auth = pva->pva_auth;
        enum pva_error err = PVA_SUCCESS;
        uint8_t *data = NULL;
        uint64_t size = 0;
        struct vpu_hash_key_pair *vhashk;
        size_t vkey_size = 0;
        size_t vhash_size = 0;

        ASSERT(pva_auth != NULL);

        /* Destroy previously parsed allowlist data */
        pva_kmd_allowlist_destroy(pva_auth);

        err = pva_kmd_auth_allowlist_load(
                pva, pva_auth->pva_auth_allowlist_path, &data, &size);
        if (err != PVA_SUCCESS) {
                if (data != NULL) {
                        pva_kmd_free(data);
                }
                goto fail;
        }
        vhashk = (struct vpu_hash_key_pair *)pva_kmd_zalloc(
                sizeof(struct vpu_hash_key_pair));
        if (vhashk == NULL) {
                pva_kmd_log_err("Unable to allocate memory");
                pva_kmd_free(data);
                err = PVA_NOMEM;
                goto fail;
        }

        vhashk->ptr_file_data = data;
        vhashk->num_keys = ((uint32_t *)(uintptr_t)data)[0];
        vhashk->psha_key =
                (struct shakey *)(uintptr_t)(data + sizeof(uint32_t));
        vkey_size = sizeof(struct shakey) * (vhashk->num_keys);
        vhashk->num_hashes = ((uint32_t *)(uintptr_t)((char *)vhashk->psha_key +
                                                      vkey_size))[0];
        vhashk->pvpu_hash_vector =
                (struct vpu_hash_vector
                         *)(uintptr_t)((char *)(vhashk->psha_key) + vkey_size +
                                       sizeof(uint32_t));
        vhash_size = sizeof(struct vpu_hash_vector) * (vhashk->num_hashes);
        if ((sizeof(uint32_t) + sizeof(uint32_t) + vkey_size + vhash_size) !=
            size) {
                pva_kmd_free(data);
                pva_kmd_free(vhashk);
                err = PVA_EACCES;
                goto fail;
        }

        pva_auth->pva_auth_allow_list_parsed = true;
        pva_auth->vpu_hash_keys = vhashk;

fail:
        return err;
}
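For orientation, a minimal sketch of the allowlist binary layout that pva_kmd_allowlist_parse() walks above, written as a tiny encoder. The layout mirrors the parsing code in this change (num_keys, the SHA-256 keys, num_hashes, then the CRC32 hash vectors); the key and hash contents here are placeholders, and the sketch ignores any packing or endianness concerns of the real file format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EXAMPLE_SHA256_DIGEST_SIZE 32U

struct example_shakey {
        uint8_t sha_key[EXAMPLE_SHA256_DIGEST_SIZE];
};

struct example_hash_vector {
        uint32_t count;      /* number of keys for this crc32 hash */
        uint32_t index;      /* starting index into the key array */
        uint32_t crc32_hash; /* CRC32 of the ELF */
};

int main(void)
{
        uint8_t blob[4096];
        size_t off = 0;
        uint32_t num_keys = 2U;
        uint32_t num_hashes = 1U;
        struct example_shakey keys[2];
        /* One hash entry covering both keys, starting at key index 0. */
        struct example_hash_vector hashes[1] = { { 2U, 0U, 0xDEADBEEFU } };

        memset(keys, 0, sizeof(keys)); /* placeholder key material */

        memcpy(blob + off, &num_keys, sizeof(num_keys));
        off += sizeof(num_keys);
        memcpy(blob + off, keys, sizeof(keys));
        off += sizeof(keys);
        memcpy(blob + off, &num_hashes, sizeof(num_hashes));
        off += sizeof(num_hashes);
        memcpy(blob + off, hashes, sizeof(hashes));
        off += sizeof(hashes);

        /* The parser accepts the file only if its size equals
         * sizeof(num_keys) + keys + sizeof(num_hashes) + hashes. */
        printf("example allowlist blob: %zu bytes\n", off);
        return 0;
}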
@@ -0,0 +1,77 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef PVA_KMD_VPU_APP_AUTH_H
#define PVA_KMD_VPU_APP_AUTH_H

#include "pva_kmd_shim_vpu_app_auth.h"
#include "pva_kmd_mutex.h"

/**
 * Maximum length of allowlist file path
 */
#define ALLOWLIST_FILE_LEN 128U

/**
 * Size of sha256 keys in bytes.
 */
#define NVPVA_SHA256_DIGEST_SIZE 32U

struct pva_kmd_device;
/**
 * Array of all VPU hashes
 */
struct vpu_hash_vector {
        /*! Number of keys for this crc32_hash */
        uint32_t count;
        /*! Starting index into the keys array */
        uint32_t index;
        /*! CRC32 hash value */
        uint32_t crc32_hash;
};

/**
 * Stores a sha256 key
 */
struct shakey {
        /** 256-bit (32 bytes) SHA key */
        uint8_t sha_key[NVPVA_SHA256_DIGEST_SIZE];
};

/**
 * Stores the hash vector and keys vector
 */
struct vpu_hash_key_pair {
        /*! Total number of keys in the binary file */
        uint32_t num_keys;
        /*! Pointer to the SHA keys array. */
        struct shakey *psha_key;
        /*! Total number of hashes in the binary file */
        uint32_t num_hashes;
        /*! Pointer to the array of hashes */
        struct vpu_hash_vector *pvpu_hash_vector;
        /*! Pointer to data loaded from file (QNX specific) */
        uint8_t *ptr_file_data;
};

/**
 * Stores all the information related to PVA VPU ELF authentication.
 */
struct pva_vpu_auth {
        /** Stores crc32-sha256 of ELFs */
        struct vpu_hash_key_pair *vpu_hash_keys;
        pva_kmd_mutex_t allow_list_lock;
        /** Flag indicating whether authentication is enabled */
        bool pva_auth_enable;
        /** Flag to track if the allow list is already parsed */
        bool pva_auth_allow_list_parsed;
        /** Stores the path to the allowlist binary file. */
        char pva_auth_allowlist_path[ALLOWLIST_FILE_LEN + 1U];
};

enum pva_error pva_kmd_init_vpu_app_auth(struct pva_kmd_device *pva, bool ena);

enum pva_error pva_kmd_verify_exectuable_hash(struct pva_kmd_device *pva,
                                              uint8_t *dataptr, size_t size);

enum pva_error pva_kmd_allowlist_parse(struct pva_kmd_device *pva);

#endif
128
drivers/video/tegra/host/pva/src/kmd/common/pva_kmd_vpu_ocd.c
Normal file
@@ -0,0 +1,128 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#include "pva_kmd_device.h"
#include "pva_math_utils.h"
#include "pva_kmd_vpu_ocd.h"
#include "pva_kmd_silicon_utils.h"

#define PVA_DEBUG_APERTURE_INDEX 1U

int pva_kmd_vpu_ocd_open(struct pva_kmd_device *dev)
{
        int retval = 0;
        enum pva_error err;
        err = pva_kmd_device_busy(dev);
        if (err != PVA_SUCCESS) {
                pva_kmd_log_err(
                        "pva_kmd_vpu_ocd_open pva_kmd_device_busy failed");
                retval = -1;
                goto out;
        }
out:
        return retval;
}

int pva_kmd_vpu_ocd_release(struct pva_kmd_device *dev)
{
        pva_kmd_device_idle(dev);
        return 0;
}

int64_t pva_kmd_vpu_ocd_write(struct pva_kmd_device *dev, void *file_data,
                              const uint8_t *data, uint64_t offset,
                              uint64_t size)
{
        struct pva_vpu_ocd_write_param write_param;
        uint32_t i;
        unsigned long retval;
        uint32_t reg_offset;
        uint32_t const *vpu_ocd_offset = (uint32_t *)file_data;

        retval = pva_kmd_copy_data_from_user(&write_param, data,
                                             sizeof(write_param));
        if (retval != 0u) {
                pva_kmd_log_err("Failed to copy write buffer from user");
                return -1;
        }

        if (write_param.n_write > VPU_OCD_MAX_NUM_DATA_ACCESS) {
                pva_kmd_log_err_u64("pva: too many vpu dbg reg write",
                                    write_param.n_write);
                return -1;
        }

        /* Write instruction first */
        pva_kmd_aperture_write(dev, PVA_DEBUG_APERTURE_INDEX, *vpu_ocd_offset,
                               write_param.instr);

        /*
         * Write data
         * if there's 1 word, write to addr 0x4,
         * if there's 2 words, write to addr 2 * 0x4,
         * ...
         */
        reg_offset = safe_addu32((uint32_t)*vpu_ocd_offset,
                                 safe_mulu32(write_param.n_write,
                                             (uint32_t)sizeof(uint32_t)));
        for (i = 0u; i < write_param.n_write; i++) {
                pva_kmd_aperture_write(dev, PVA_DEBUG_APERTURE_INDEX,
                                       reg_offset, write_param.data[i]);
        }

        return 0;
}

int64_t pva_kmd_vpu_ocd_read(struct pva_kmd_device *dev, void *file_data,
                             uint8_t *data, uint64_t offset, uint64_t size)
{
        struct pva_vpu_ocd_read_param read_param;
        unsigned long retval;
        uint32_t i;
        uint32_t reg_offset;
        uint32_t const *vpu_ocd_offset = (uint32_t *)file_data;

        retval = pva_kmd_copy_data_from_user(&read_param, data,
                                             sizeof(read_param));
        if (retval != 0u) {
                pva_kmd_log_err("failed to copy read buffer from user");
                return -1;
        }

        if (read_param.n_read > VPU_OCD_MAX_NUM_DATA_ACCESS) {
                pva_kmd_log_err_u64("pva: too many vpu dbg reg read",
                                    read_param.n_read);
                return -1;
        }

        /*
         * Read data
         * if there's 1 word, read from addr 0x4,
         * if there's 2 words, read from addr 2 * 0x4,
         * ...
         */
        reg_offset = safe_addu32((uint32_t)*vpu_ocd_offset,
                                 safe_mulu32(read_param.n_read,
                                             (uint32_t)sizeof(uint32_t)));
        for (i = 0; i < read_param.n_read; i++) {
                read_param.data[i] = pva_kmd_aperture_read(
                        dev, PVA_DEBUG_APERTURE_INDEX, reg_offset);
        }

        retval = pva_kmd_copy_data_to_user(data, &read_param,
                                           sizeof(read_param));
        if (retval != 0u) {
                pva_kmd_log_err("failed to copy read buffer to user");
                return -1;
        }

        return 0;
}
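To make the register-offset convention above concrete, a small sketch that fills in the write-parameter structure from this change and computes the data-register offset the same way pva_kmd_vpu_ocd_write() does. The OCD block offset and the instruction value are illustrative placeholders:

#include <stdint.h>
#include <stdio.h>

#define VPU_OCD_MAX_NUM_DATA_ACCESS 7U

struct pva_vpu_ocd_write_param {
        uint32_t instr;
        uint32_t n_write;
        uint32_t data[VPU_OCD_MAX_NUM_DATA_ACCESS];
};

int main(void)
{
        /* Illustrative VPU0 OCD block offset; the real value comes from the
         * per-VPU vpu_dbg_instr_reg_offset table. */
        uint32_t vpu_ocd_offset = 0x50000U;
        struct pva_vpu_ocd_write_param p = {
                .instr = 0x1U, /* placeholder debug instruction */
                .n_write = 2U,
                .data = { 0xAAAAu, 0xBBBBu },
        };

        /* The instruction goes to the block base; the N data words all go to
         * base + N * 4, matching the comment in pva_kmd_vpu_ocd_write(). */
        uint32_t data_reg = vpu_ocd_offset +
                            p.n_write * (uint32_t)sizeof(uint32_t);

        printf("instr reg: 0x%x, data reg: 0x%x\n", vpu_ocd_offset, data_reg);
        return 0;
}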
@@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_KMD_VPU_OCD_H
#define PVA_KMD_VPU_OCD_H

#define VPU_OCD_MAX_NUM_DATA_ACCESS 7U

struct pva_vpu_ocd_write_param {
        uint32_t instr;
        uint32_t n_write;
        uint32_t data[VPU_OCD_MAX_NUM_DATA_ACCESS];
};

struct pva_vpu_ocd_read_param {
        uint32_t n_read;
        uint32_t data[VPU_OCD_MAX_NUM_DATA_ACCESS];
};

int64_t pva_kmd_vpu_ocd_read(struct pva_kmd_device *dev, void *file_data,
                             uint8_t *data, uint64_t offset, uint64_t size);
int64_t pva_kmd_vpu_ocd_write(struct pva_kmd_device *dev, void *file_data,
                              const uint8_t *data, uint64_t offset,
                              uint64_t size);
int pva_kmd_vpu_ocd_open(struct pva_kmd_device *dev);
int pva_kmd_vpu_ocd_release(struct pva_kmd_device *dev);

#endif
@@ -0,0 +1,40 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_PLAT_FAULTS_H
#define PVA_PLAT_FAULTS_H

#include "pva_kmd_shim_utils.h"

#define ASSERT(x)                                                      \
        if (!(x)) {                                                    \
                pva_kmd_print_str_u64("PVA KMD ASSERT at " __FILE__,   \
                                      __LINE__);                       \
                pva_kmd_fault();                                       \
        }

#define FAULT(msg)                                                             \
        do {                                                                   \
                pva_kmd_print_str_u64("PVA KMD FAULT at " __FILE__, __LINE__); \
                pva_kmd_print_str(msg);                                        \
                pva_kmd_fault();                                               \
        } while (0)

#define ASSERT_WITH_LOC(x, err_file, err_line)                         \
        if (!(x)) {                                                    \
                pva_kmd_print_str_u64("Error at line", err_line);      \
                pva_kmd_print_str(err_file);                           \
                pva_kmd_print_str("PVA KMD ASSERT");                   \
                pva_kmd_fault();                                       \
        }

#endif
@@ -0,0 +1,112 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */

#ifndef PVA_KMD_DEVICE_MEMORY_H
#define PVA_KMD_DEVICE_MEMORY_H
#include "pva_kmd.h"
#include "pva_api.h"
struct pva_kmd_context;

/**
 * @brief KMD device memory structure.
 *
 * This structure is essentially a base object. More information is needed to
 * manage memory allocations but the required information is platform dependent.
 * Therefore, each platform will have a derived implementation and this
 * structure is just part of it.
 */
struct pva_kmd_device_memory {
        uint64_t iova; /**< IOVA address if mapped. Otherwise 0 */
        void *va; /**< CPU address if mapped. Otherwise 0. */
        uint64_t size; /**< Size of the mapping. */
        struct pva_kmd_device *pva; /**< The PVA this memory is mapped to. */
        uint32_t smmu_ctx_idx; /**< The SMMU context this memory is mapped to. */
};

/**
 * This API is not available in Linux and should not be used by the common code.
 */
struct pva_kmd_device_memory *pva_kmd_device_memory_alloc(uint64_t size);

/**
 * Allocate memory and map it to both IOVA space and CPU space.
 *
 * @note We cannot just allocate without mapping or map to just one
 * space. This restriction comes from the Linux dma_alloc_coherent API, which
 * allocates and maps at the same time.
 *
 * @note iova_access_flags is only supported by the QNX implementation.
 *
 * @param size Size of the allocation
 * @param pva The PVA device to map to
 * @param iova_access_flags Access flags for IOVA space. PVA_ACCESS_RO or
 *                          PVA_ACCESS_RW. For CPU space, it's always
 *                          read and write.
 * @param smmu_ctx_idx The SMMU context to map to
 */
struct pva_kmd_device_memory *
pva_kmd_device_memory_alloc_map(uint64_t size, struct pva_kmd_device *pva,
                                uint32_t iova_access_flags,
                                uint32_t smmu_ctx_idx);
/** @brief Acquire memory shared from UMD.
 *
 * This function takes shared ownership of the memory allocation so that KMD
 * can keep the allocation alive even after UMD has closed the memory handle.
 *
 * @param memory_handle Memory handle passed from user space. On Linux, this is
 *                      a file descriptor associated with a dma_buf object. On
 *                      QNX, this is an NvRM import ID.
 * @param offset Offset into the allocation. This affects the mapped address.
 * @param size Size of the mapping, which can be smaller than the size of the
 *             allocation.
 * @param ctx The user from whom we are importing the memory.
 */
struct pva_kmd_device_memory *
pva_kmd_device_memory_acquire(uint64_t memory_handle, uint64_t offset,
                              uint64_t size, struct pva_kmd_context *ctx);
/**
 * @brief Release the memory.
 *
 * This function frees memory allocated from acquire or alloc_map. If there are
 * active CPU or IOVA mappings, this function will unmap them.
 *
 * @param memory Pointer to the memory to release.
 */
void pva_kmd_device_memory_free(struct pva_kmd_device_memory *memory);

/**
 * @brief Map the memory to CPU space.
 */
enum pva_error
pva_kmd_device_memory_cpu_map(struct pva_kmd_device_memory *memory);

/**
 * @brief Unmap the memory from CPU space.
 *
 * Unmapping memory that is not mapped will trigger an abort.
 */
void pva_kmd_device_memory_cpu_unmap(struct pva_kmd_device_memory *memory);

/**
 * @brief Map the memory to IOVA space.
 */
enum pva_error
pva_kmd_device_memory_iova_map(struct pva_kmd_device_memory *memory,
                               struct pva_kmd_device *pva,
                               uint32_t access_flags, uint32_t smmu_ctx_idx);
/**
 * @brief Unmap the memory from IOVA space.
 *
 * Unmapping memory that is not mapped will trigger an abort.
 */
void pva_kmd_device_memory_iova_unmap(struct pva_kmd_device_memory *memory);

#endif // PVA_KMD_DEVICE_MEMORY_H
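The "base object plus platform-derived implementation" idea described above can be illustrated with a small, self-contained sketch; the example_linux_memory wrapper type and its extra field are invented here purely for illustration and are not part of this change:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Common base object, in the spirit of pva_kmd_device_memory. */
struct example_device_memory {
        uint64_t iova;
        void *va;
        uint64_t size;
};

/* Hypothetical platform-specific wrapper embedding the base object. */
struct example_linux_memory {
        struct example_device_memory base;
        int platform_handle; /* e.g. a dma_buf fd; illustrative only */
};

/* Recover the wrapper from a pointer to the embedded base object. */
#define example_container_of_base(ptr)                           \
        ((struct example_linux_memory *)((char *)(ptr) -         \
                offsetof(struct example_linux_memory, base)))

int main(void)
{
        struct example_linux_memory mem = {
                .base = { .iova = 0x1000, .va = NULL, .size = 4096 },
                .platform_handle = 42,
        };
        struct example_device_memory *base = &mem.base;

        printf("handle recovered via base pointer: %d\n",
               example_container_of_base(base)->platform_handle);
        return 0;
}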
@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#ifndef PVA_KMD_SHIM_CCQ_H
#define PVA_KMD_SHIM_CCQ_H
#include "pva_api.h"
struct pva_kmd_device;

/**
 * @brief Push a 64-bit entry to the CCQ FIFO.
 *
 * Push the low 32 bits first and then the high 32 bits.
 *
 * @note The caller is responsible for checking that the CCQ has enough space.
 *
 */
void pva_kmd_ccq_push(struct pva_kmd_device *pva, uint8_t ccq_id,
                      uint64_t ccq_entry);
/**
 * @brief Get the number of available entry slots in the CCQ.
 *
 * One CCQ entry is 64 bits. One CCQ can hold up to 4 entries. Therefore, this
 * function returns values from 0 to 4.
 */
uint32_t pva_kmd_get_ccq_space(struct pva_kmd_device *pva, uint8_t ccq_id);

#endif // PVA_KMD_SHIM_CCQ_H
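A minimal sketch of the low-word-first split that the CCQ push contract above describes; the FIFO write is stood in for by printf, and the entry value is arbitrary:

#include <stdint.h>
#include <stdio.h>

static void example_fifo_write(uint32_t word)
{
        printf("fifo <- 0x%08x\n", word);
}

int main(void)
{
        uint64_t ccq_entry = 0x1122334455667788ULL;

        /* Low 32 bits first, then high 32 bits, per the shim contract. */
        example_fifo_write((uint32_t)(ccq_entry & 0xFFFFFFFFULL));
        example_fifo_write((uint32_t)(ccq_entry >> 32));
        return 0;
}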
@@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2025, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#ifndef PVA_KMD_SHIM_DEBUGFS_H
#define PVA_KMD_SHIM_DEBUGFS_H
#include "pva_api.h"
#include "pva_kmd_tegra_stats.h"

void pva_kmd_debugfs_create_bool(struct pva_kmd_device *pva, const char *name,
                                 bool *val);
void pva_kmd_debugfs_create_u32(struct pva_kmd_device *pva, const char *name,
                                uint32_t *val);
void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
                                 struct pva_kmd_file_ops *fops);
void pva_kmd_debugfs_remove_nodes(struct pva_kmd_device *pva);
unsigned long pva_kmd_copy_data_from_user(void *dst, const void *src,
                                          uint64_t size);
unsigned long pva_kmd_copy_data_to_user(void *to, const void *from,
                                        unsigned long size);
unsigned long pva_kmd_strtol(const char *str, int base);

#endif // PVA_KMD_SHIM_DEBUGFS_H
@@ -0,0 +1,64 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#ifndef PVA_KMD_SHIM_INIT_H
#define PVA_KMD_SHIM_INIT_H
#include "pva_api.h"
struct pva_kmd_device;
struct pva_kmd_file_ops;

/* TODO: remove plat_init APIs. We should just pass in plat_data directly to
 * pva_kmd_device_create. */
void pva_kmd_device_plat_init(struct pva_kmd_device *pva);
void pva_kmd_device_plat_deinit(struct pva_kmd_device *pva);

void pva_kmd_read_syncpt_val(struct pva_kmd_device *pva, uint32_t syncpt_id,
                             uint32_t *syncpt_value);

void pva_kmd_get_syncpt_iova(struct pva_kmd_device *pva, uint32_t syncpt_id,
                             uint64_t *syncpt_iova);

void pva_kmd_allocate_syncpts(struct pva_kmd_device *pva);

/**
 * @brief Power on PVA cluster.
 */
enum pva_error pva_kmd_power_on(struct pva_kmd_device *pva);

/**
 * @brief Power off PVA cluster.
 */
void pva_kmd_power_off(struct pva_kmd_device *pva);

/**
 * @brief Initialize firmware.
 *
 * This function initializes firmware. On silicon, this includes
 * - power on R5,
 * - load firmware,
 * - bind interrupts,
 * - and wait for firmware boot to complete.
 *
 * @param pva pointer to the PVA device to initialize
 */
enum pva_error pva_kmd_init_fw(struct pva_kmd_device *pva);

/**
 * @brief De-init firmware.
 *
 * This function de-initializes firmware. On silicon, this includes
 * - free interrupts,
 * - power off R5,
 * - and free firmware memories.
 *
 * @param pva pointer to the PVA device to de-initialize
 */
void pva_kmd_deinit_fw(struct pva_kmd_device *pva);
#endif // PVA_KMD_SHIM_INIT_H
@@ -0,0 +1,142 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
#ifndef PVA_KMD_SHIM_SILICON_H
#define PVA_KMD_SHIM_SILICON_H
#include "pva_api.h"
#include "pva_kmd_regs.h"
struct pva_kmd_device;

/**
 * @file This file defines silicon APIs.
 *
 * Silicon APIs are only implemented by platforms that closely resemble the
 * silicon PVA, i.e. the Linux, QNX and SIM platforms. Silicon APIs are used to
 * implement message APIs and some init APIs.
 *
 * On the native platform, message APIs are implemented differently. Therefore,
 * the native platform does not need to implement silicon APIs.
 */

/**
 * @brief Write to a register in an MMIO region.
 *
 * @param pva pointer to the PVA cluster.
 * @param aperture the MMIO region.
 * @param addr the register offset in the MMIO region.
 * @param val value to write.
 */
void pva_kmd_aperture_write(struct pva_kmd_device *pva,
                            enum pva_kmd_reg_aperture aperture, uint32_t addr,
                            uint32_t val);
/**
 * @brief Read from a register in an MMIO region.
 *
 * @param pva pointer to the PVA cluster.
 * @param aperture the MMIO region.
 * @param addr the register offset in the MMIO region.
 *
 * @return the value of the register.
 */
uint32_t pva_kmd_aperture_read(struct pva_kmd_device *pva,
                               enum pva_kmd_reg_aperture aperture,
                               uint32_t addr);

/**
 * @brief PVA's interrupt lines.
 */
enum pva_kmd_intr_line {
        /** Interrupt line from SEC block. We receive mailbox interrupts from
         * this line. */
        PVA_KMD_INTR_LINE_SEC_LIC = 0,
        PVA_KMD_INTR_LINE_CCQ0,
        PVA_KMD_INTR_LINE_CCQ1,
        PVA_KMD_INTR_LINE_CCQ2,
        PVA_KMD_INTR_LINE_CCQ3,
        PVA_KMD_INTR_LINE_CCQ4,
        PVA_KMD_INTR_LINE_CCQ5,
        PVA_KMD_INTR_LINE_CCQ6,
        PVA_KMD_INTR_LINE_CCQ7,
        PVA_KMD_INTR_LINE_COUNT,
};

/**
 * @brief Interrupt handler function prototype.
 */
typedef void (*pva_kmd_intr_handler_t)(void *data);

/**
 * @brief Bind an interrupt handler to an interrupt line.
 *
 * The interrupt will be enabled after binding.
 */
enum pva_error pva_kmd_bind_intr_handler(struct pva_kmd_device *pva,
                                         enum pva_kmd_intr_line intr_line,
                                         pva_kmd_intr_handler_t handler,
                                         void *data);
/**
 * @brief Enable an interrupt line.
 */
void pva_kmd_enable_intr(struct pva_kmd_device *pva,
                         enum pva_kmd_intr_line intr_line);

/**
 * @brief Disable an interrupt line.
 */
void pva_kmd_disable_intr(struct pva_kmd_device *pva,
                          enum pva_kmd_intr_line intr_line);

/**
 * @brief Free an interrupt line.
 *
 * This will disable the interrupt line and unbind the handler.
 */
void pva_kmd_free_intr(struct pva_kmd_device *pva,
                       enum pva_kmd_intr_line intr_line);

/**
 * @brief Read firmware binary from the file system.
 *
 * The firmware binary is loaded into pva->fw_bin_mem, which is directly
 * accessible by R5.
 *
 * KMD will free pva->fw_bin_mem during firmware deinit.
 */
enum pva_error pva_kmd_read_fw_bin(struct pva_kmd_device *pva);

/**
 * @brief Get base address of read-only syncpoints.
 */
uint32_t pva_kmd_get_syncpt_ro_offset(struct pva_kmd_device *pva);

/**
 * @brief Get base address of read-write syncpoints.
 */
uint32_t pva_kmd_get_syncpt_rw_offset(struct pva_kmd_device *pva);

/**
 * @brief Configure EVP, Segment config registers and SCR registers.
 *
 * This function configures the EVP, Segment config registers and SCR registers.
 *
 * @param pva Pointer to the PVA device.
 */
void pva_kmd_config_evp_seg_scr_regs(struct pva_kmd_device *pva);

/**
 * @brief Configure SID registers.
 *
 * This function configures the SID registers.
 *
 * @param pva Pointer to the PVA device.
 */
void pva_kmd_config_sid_regs(struct pva_kmd_device *pva);

#endif // PVA_KMD_SHIM_SILICON_H
@@ -0,0 +1,72 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_SHIM_UTILS_H
|
||||
#define PVA_KMD_SHIM_UTILS_H
|
||||
#include "pva_api.h"
|
||||
|
||||
/**
|
||||
* @brief Allocate memory for KMD's private use.
|
||||
*
|
||||
* Memory will be zero initialized.
|
||||
*/
|
||||
void *pva_kmd_zalloc(uint64_t size);
|
||||
|
||||
/**
|
||||
* @brief Free memory allocated by pva_kmd_zalloc.
|
||||
*/
|
||||
void pva_kmd_free(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Print a string.
|
||||
*
|
||||
* This function is used for logging errors and is enabled even in a safety environment.
|
||||
* For debug printing, use pva_dbg_printf.
|
||||
*
|
||||
* @param str The string to print.
|
||||
*/
|
||||
void pva_kmd_print_str(const char *str);
|
||||
|
||||
/**
|
||||
* @brief Print a string followed by a 64-bit unsigned number.
|
||||
*
|
||||
* This function is used for logging errors and is enabled even in a safety environment.
|
||||
* For debug printing, use pva_dbg_printf.
|
||||
*
|
||||
* @param str The string to print.
|
||||
* @param n The number to print.
|
||||
*/
|
||||
void pva_kmd_print_str_u64(const char *str, uint64_t n);
|
||||
|
||||
/**
|
||||
* @brief Fault KMD.
|
||||
*
|
||||
* Abort KMD due to critical unrecoverable error.
|
||||
*/
|
||||
void pva_kmd_fault(void) __attribute__((noreturn));
|
||||
|
||||
/**
|
||||
* @brief Sleep for some microseconds.
|
||||
*
|
||||
* @param us The number of microseconds to sleep.
|
||||
*/
|
||||
void pva_kmd_sleep_us(uint64_t us);
|
||||
|
||||
#if defined(__KERNEL__)
|
||||
#include <linux/nospec.h>
|
||||
#else
|
||||
static inline uint32_t array_index_nospec(uint32_t index, uint32_t size)
|
||||
{
|
||||
return index < size ? index : 0;
|
||||
}
|
||||
#endif
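/*
 * Illustrative sketch only: array_index_nospec() is intended to be used after
 * an explicit bounds check, so that a bounds-checked index cannot be used
 * speculatively out of range. The table and the failure value are placeholders.
 */
static inline uint32_t pva_kmd_example_table_lookup(const uint32_t *table,
						    uint32_t table_size,
						    uint32_t index)
{
	if (index >= table_size)
		return 0U; /* caller-defined error handling */

	index = array_index_nospec(index, table_size);
	return table[index];
}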
|
||||
|
||||
#endif // PVA_KMD_SHIM_UTILS_H
|
||||
@@ -0,0 +1,17 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2021-2023, NVIDIA Corporation. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_SHIM_VPU_APP_AUTH_H
|
||||
#define PVA_KMD_SHIM_VPU_APP_AUTH_H
|
||||
|
||||
#include "pva_api_types.h"
|
||||
struct pva_kmd_device;
|
||||
const char *pva_kmd_get_default_allowlist(void);
|
||||
enum pva_error pva_kmd_auth_allowlist_load(struct pva_kmd_device *pva,
|
||||
const char *file_name,
|
||||
uint8_t **hash_keys_data,
|
||||
uint64_t *psize);
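/*
 * Illustrative sketch only: loading the default VPU app allowlist with the two
 * helpers declared above. Ownership and freeing of hash_keys_data are not
 * specified in this header and are assumed to follow the caller's convention.
 */
static inline enum pva_error
pva_kmd_example_load_default_allowlist(struct pva_kmd_device *pva,
					uint8_t **hash_keys_data,
					uint64_t *size)
{
	return pva_kmd_auth_allowlist_load(pva,
					   pva_kmd_get_default_allowlist(),
					   hash_keys_data, size);
}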
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,69 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_THREAD_SEMA_H
|
||||
#define PVA_KMD_THREAD_SEMA_H
|
||||
|
||||
#include "pva_api.h"
|
||||
|
||||
#if defined(__KERNEL__) /* For Linux */
|
||||
|
||||
#include <linux/semaphore.h>
|
||||
typedef struct semaphore pva_kmd_sema_t;
|
||||
|
||||
#else /* For user space code, including QNX KMD */
|
||||
|
||||
#include <semaphore.h>
|
||||
/* Semaphore */
|
||||
typedef sem_t pva_kmd_sema_t;
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Initialize a semaphore.
|
||||
*
|
||||
* @param sem Pointer to the semaphore.
|
||||
* @param val Initial value of the semaphore.
|
||||
*/
|
||||
void pva_kmd_sema_init(pva_kmd_sema_t *sem, uint32_t val);
|
||||
|
||||
/**
|
||||
* @brief Wait on a semaphore.
|
||||
*
|
||||
* Decrement the semaphore count. If the count is zero, the caller will block
|
||||
* until the semaphore is posted or the timeout expires.
|
||||
*
|
||||
* @param sem Pointer to the semaphore.
|
||||
* @param timeout_ms Timeout in milliseconds.
|
||||
*
|
||||
* @retval PVA_SUCCESS if the semaphore was successfully acquired.
|
||||
* @retval PVA_TIMEDOUT if the semaphore was not acquired within the timeout.
|
||||
*/
|
||||
enum pva_error pva_kmd_sema_wait_timeout(pva_kmd_sema_t *sem,
|
||||
uint32_t timeout_ms);
|
||||
|
||||
/**
|
||||
* @brief Signal a semaphore.
|
||||
*
|
||||
* Increment the semaphore count.
|
||||
*
|
||||
* @param sem Pointer to the semaphore.
|
||||
*/
|
||||
void pva_kmd_sema_post(pva_kmd_sema_t *sem);
|
||||
|
||||
/**
|
||||
* @brief Deinitialize a semaphore.
|
||||
*
|
||||
* @param sem Pointer to the semaphore.
|
||||
*/
|
||||
void pva_kmd_sema_deinit(pva_kmd_sema_t *sem);
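/*
 * Illustrative sketch only: waiting for a completion that another context
 * signals through pva_kmd_sema_post(). The 100 ms timeout is an arbitrary
 * example value.
 */
static inline enum pva_error pva_kmd_example_wait_done(pva_kmd_sema_t *sem)
{
	enum pva_error err;

	/* Initialized to 0 so the first wait blocks until the post arrives. */
	pva_kmd_sema_init(sem, 0U);

	/* ... another thread or ISR eventually calls pva_kmd_sema_post(sem) ... */

	err = pva_kmd_sema_wait_timeout(sem, 100U);

	pva_kmd_sema_deinit(sem);
	return err;
}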
|
||||
|
||||
#endif // PVA_KMD_THREAD_SEMA_H
|
||||
183
drivers/video/tegra/host/pva/src/kmd/include/pva_kmd.h
Normal file
@@ -0,0 +1,183 @@
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_H
|
||||
#define PVA_KMD_H
|
||||
#include "pva_api.h"
|
||||
#include "pva_fw.h"
|
||||
#include "pva_constants.h"
|
||||
#include "pva_math_utils.h"
|
||||
|
||||
/* KMD API: context init */
|
||||
struct pva_kmd_context_init_in_args {
|
||||
uint32_t resource_table_capacity;
|
||||
};
|
||||
|
||||
struct pva_kmd_context_init_out_args {
|
||||
enum pva_error error;
|
||||
uint64_t ccq_shm_hdl;
|
||||
};
|
||||
|
||||
struct pva_kmd_syncpt_register_out_args {
|
||||
enum pva_error error;
|
||||
uint32_t syncpt_ro_res_id;
|
||||
uint32_t syncpt_rw_res_id;
|
||||
uint32_t synpt_size;
|
||||
uint32_t synpt_ids[PVA_NUM_RW_SYNCPTS_PER_CONTEXT];
|
||||
uint32_t num_ro_syncpoints;
|
||||
};
|
||||
|
||||
/**
|
||||
* Calculates the total memory size required for a PVA submission queue.
|
||||
* This includes the size of the queue header and the combined size of all command buffer submission info structures.
|
||||
*
|
||||
* @param x The number of command buffer submission info structures.
|
||||
* @return The total memory size in bytes.
|
||||
*/
|
||||
static inline uint32_t pva_get_submission_queue_memory_size(uint32_t x)
|
||||
{
|
||||
uint32_t submit_info_size =
|
||||
(uint32_t)sizeof(struct pva_fw_cmdbuf_submit_info);
|
||||
uint32_t num_submit_infos = safe_mulu32(x, submit_info_size);
|
||||
uint32_t header_size =
|
||||
(uint32_t)sizeof(struct pva_fw_submit_queue_header);
|
||||
return safe_addu32(header_size, num_submit_infos);
|
||||
}
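/*
 * Illustrative sketch only: sizing the queue memory for a queue that can hold
 * up to 16 in-flight command buffer submissions; 16 is an arbitrary example
 * value for max_submission_count in the queue-create arguments below.
 */
static inline uint32_t pva_kmd_example_queue_memory_size(void)
{
	const uint32_t max_submission_count = 16U;

	/* header + 16 * sizeof(struct pva_fw_cmdbuf_submit_info), computed
	 * with the overflow-checked helpers. */
	return pva_get_submission_queue_memory_size(max_submission_count);
}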
|
||||
|
||||
/* KMD API: queue create */
|
||||
struct pva_kmd_queue_create_in_args {
|
||||
uint32_t max_submission_count;
|
||||
uint64_t queue_memory_handle;
|
||||
uint64_t queue_memory_offset;
|
||||
};
|
||||
|
||||
struct pva_kmd_queue_create_out_args {
|
||||
enum pva_error error;
|
||||
uint32_t queue_id;
|
||||
uint32_t syncpt_fence_counter;
|
||||
};
|
||||
|
||||
/* KMD API: queue destroy */
|
||||
struct pva_kmd_queue_destroy_in_args {
|
||||
uint32_t queue_id;
|
||||
};
|
||||
|
||||
struct pva_kmd_queue_destroy_out_args {
|
||||
enum pva_error error;
|
||||
};
|
||||
|
||||
struct pva_kmd_memory_register_in_args {
|
||||
enum pva_memory_segment segment;
|
||||
uint32_t access_flags;
|
||||
uint64_t memory_handle;
|
||||
uint64_t offset;
|
||||
uint64_t size;
|
||||
};
|
||||
|
||||
/* KMD API: executable */
|
||||
struct pva_kmd_executable_register_in_args {
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
struct pva_kmd_executable_get_symbols_in_args {
|
||||
uint32_t exec_resource_id;
|
||||
};
|
||||
|
||||
struct pva_kmd_executable_get_symbols_out_args {
|
||||
enum pva_error error;
|
||||
uint32_t num_symbols;
|
||||
/* Followed by <num_symbols> instances of struct pva_symbol_info */
|
||||
};
|
||||
|
||||
/* KMD API: DMA config */
|
||||
struct pva_kmd_dma_config_register_in_args {
|
||||
struct pva_dma_config_header dma_config_header;
|
||||
/* Followed by hwseq words, channels, descriptors, etc. */
|
||||
};
|
||||
|
||||
struct pva_kmd_register_out_args {
|
||||
enum pva_error error;
|
||||
uint32_t resource_id;
|
||||
};
|
||||
|
||||
struct pva_kmd_exec_register_out_args {
|
||||
enum pva_error error;
|
||||
uint32_t resource_id;
|
||||
uint32_t num_symbols;
|
||||
};
|
||||
|
||||
struct pva_kmd_unregister_in_args {
|
||||
uint32_t resource_id;
|
||||
};
|
||||
|
||||
enum pva_kmd_op_type {
|
||||
PVA_KMD_OP_CONTEXT_INIT,
|
||||
PVA_KMD_OP_QUEUE_CREATE,
|
||||
PVA_KMD_OP_QUEUE_DESTROY,
|
||||
PVA_KMD_OP_EXECUTABLE_GET_SYMBOLS,
|
||||
PVA_KMD_OP_MEMORY_REGISTER,
|
||||
PVA_KMD_OP_SYNPT_REGISTER,
|
||||
PVA_KMD_OP_EXECUTABLE_REGISTER,
|
||||
PVA_KMD_OP_DMA_CONFIG_REGISTER,
|
||||
PVA_KMD_OP_UNREGISTER,
|
||||
PVA_KMD_OP_MAX,
|
||||
};
|
||||
|
||||
/**
|
||||
* The header of a KMD operation
|
||||
*/
|
||||
struct pva_kmd_op_header {
|
||||
enum pva_kmd_op_type op_type; /**< Type of the KMD operation */
|
||||
};
|
||||
|
||||
/**
|
||||
* The header of a KMD response
|
||||
*/
|
||||
struct pva_kmd_response_header {
|
||||
uint32_t rep_size; /**< Size of the response, including the header */
|
||||
};
|
||||
|
||||
enum pva_kmd_ops_mode {
|
||||
/**
|
||||
* Only one operation is allowed. The
|
||||
* operation will be done synchronously.
|
||||
* KMD will wait for the fence if
|
||||
* necessary. */
|
||||
PVA_KMD_OPS_MODE_SYNC,
|
||||
/**
|
||||
* A list of registration operations is allowed. These operations will
|
||||
* trigger a post fence. KMD will not wait for the fence.
|
||||
*/
|
||||
PVA_KMD_OPS_MODE_ASYNC,
|
||||
};
|
||||
|
||||
/**
|
||||
* A buffer containing a list of KMD operations and a post fence.
|
||||
*
|
||||
* In general, the list of KMD operations contains jobs that need to be done by
|
||||
* the KMD and FW. KMD will first perform its part and then submit a privileged
|
||||
* command buffer to FW. FW will trigger the provided post fence when done.
|
||||
*
|
||||
* NOTE: Starting address of every struct/array in the buffer must be aligned to
|
||||
* 8 bytes.
|
||||
*/
|
||||
struct pva_kmd_operations {
|
||||
enum pva_kmd_ops_mode mode;
|
||||
struct pva_fw_postfence postfence;
|
||||
/** Followed by a list of KMD operation(s) */
|
||||
};
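/*
 * Illustrative sketch only: computing the size of a single-operation buffer
 * that follows the layout rules above (struct pva_kmd_operations first, then
 * one op header plus its input arguments, each element starting on an 8-byte
 * boundary). The align helper is an example, not driver code.
 */
static inline uint64_t pva_kmd_example_align8(uint64_t off)
{
	return (off + 7ULL) & ~7ULL;
}

static inline uint64_t pva_kmd_example_queue_create_ops_size(void)
{
	uint64_t off = 0;

	off = pva_kmd_example_align8(off + sizeof(struct pva_kmd_operations));
	off = pva_kmd_example_align8(off + sizeof(struct pva_kmd_op_header));
	off = pva_kmd_example_align8(off +
				     sizeof(struct pva_kmd_queue_create_in_args));

	/* Must not exceed PVA_KMD_MAX_OP_BUFFER_SIZE, defined below. */
	return off;
}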
|
||||
|
||||
/* Max op buffer size is 8 MB */
|
||||
#define PVA_KMD_MAX_OP_BUFFER_SIZE (8 * 1024 * 1024)
|
||||
|
||||
/* Max response size is 8 KB */
|
||||
#define PVA_KMD_MAX_RESP_BUFFER_SIZE (8 * 1024)
|
||||
|
||||
#endif // PVA_KMD_H
|
||||
19
drivers/video/tegra/host/pva/src/kmd/linux/Kbuild
Normal file
@@ -0,0 +1,19 @@
|
||||
################################### tell Emacs this is a -*- makefile-gmake -*-
|
||||
#
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
# and proprietary rights in and to this software, related documentation
|
||||
# and any modifications thereto. Any use, reproduction, disclosure or
|
||||
# distribution of this software and related documentation without an express
|
||||
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
###############################################################################
|
||||
|
||||
obj-m := pva_kmd_linux.o
|
||||
|
||||
pva_kmd_linux-objs += ${PVA_KMD_LINUX_SRC}
|
||||
|
||||
ccflags-y += ${PVA_KMD_LINUX_INC}
|
||||
ccflags-y += ${PVA_KMD_LINUX_DEF}
|
||||
ccflags-y += ${PVA_KMD_LINUX_CFLAGS}
|
||||
ccflags-y += -std=gnu11
|
||||
@@ -0,0 +1,39 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
|
||||
#ifndef PVA_KMD_LINUX_H
|
||||
#define PVA_KMD_LINUX_H
|
||||
|
||||
#include "pva_kmd.h"
|
||||
|
||||
#define PVA_LINUX_DEV_PATH_PREFIX "/dev/nvhost-ctrl-pva"
|
||||
|
||||
#define NVPVA_IOCTL_MAGIC 'Q'
|
||||
|
||||
#define PVA_KMD_IOCTL_GENERIC \
|
||||
_IOWR(NVPVA_IOCTL_MAGIC, 1, struct pva_kmd_linux_ioctl_header)
|
||||
|
||||
#define NVPVA_IOCTL_MAX_SIZE 256 // Temporary value which can be updated later
|
||||
|
||||
struct nvpva_ioctl_part {
|
||||
void *addr;
|
||||
uint64_t size;
|
||||
};
|
||||
|
||||
/**
|
||||
* The header of a request to KMD
|
||||
*/
|
||||
struct pva_kmd_linux_ioctl_header {
|
||||
struct nvpva_ioctl_part request;
|
||||
struct nvpva_ioctl_part response;
|
||||
};
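/*
 * Illustrative user-space sketch only: issuing the generic ioctl with a
 * request/response pair. The device-node suffix ("0"), the buffers and the
 * error handling are placeholders; the real UMD plumbing is not shown here.
 */
#if !defined(__KERNEL__)
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

static inline int pva_kmd_example_submit_ops(void *req, uint64_t req_size,
					     void *resp, uint64_t resp_size)
{
	struct pva_kmd_linux_ioctl_header hdr;
	int fd, err;

	fd = open(PVA_LINUX_DEV_PATH_PREFIX "0", O_RDWR);
	if (fd < 0)
		return -1;

	hdr.request.addr = req;
	hdr.request.size = req_size;
	hdr.response.addr = resp;
	hdr.response.size = resp_size;

	err = ioctl(fd, PVA_KMD_IOCTL_GENERIC, &hdr);
	close(fd);
	return err;
}
#endif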
|
||||
|
||||
#endif // PVA_KMD_LINUX_H
|
||||
@@ -0,0 +1,145 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/nvhost.h>
|
||||
|
||||
#include <uapi/linux/tegra-soc-hwpm-uapi.h>
|
||||
#include "pva_kmd_linux.h"
|
||||
#include "pva_kmd_linux_device.h"
|
||||
#include "pva_kmd_debugfs.h"
|
||||
|
||||
static int pva_handle_fops(struct seq_file *s, void *data)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int debugfs_node_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
int retval;
|
||||
struct pva_kmd_file_ops *fops = file_inode(file)->i_private;
|
||||
retval = single_open(file, pva_handle_fops, inode->i_private);
|
||||
if (retval != 0) {
|
||||
pva_kmd_log_err("debugfs_node_open single_open failed");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (fops->open != NULL) {
|
||||
retval = fops->open(fops->pdev);
|
||||
}
|
||||
|
||||
out:
|
||||
return retval;
|
||||
}
|
||||
|
||||
static int debugfs_node_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
int retval;
|
||||
struct pva_kmd_file_ops *fops = file_inode(file)->i_private;
|
||||
|
||||
if (fops->release != NULL) {
|
||||
retval = fops->release(fops->pdev);
|
||||
if (retval != 0) {
|
||||
pva_kmd_log_err("debugfs_node_release release failed");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
retval = single_release(inode, file);
|
||||
|
||||
out:
|
||||
return retval;
|
||||
}
|
||||
|
||||
static long int debugfs_node_read(struct file *file, char *data,
|
||||
long unsigned int size, long long int *offset)
|
||||
{
|
||||
int64_t retval;
|
||||
struct pva_kmd_file_ops *fops = file_inode(file)->i_private;
|
||||
retval = fops->read(fops->pdev, fops->file_data, data, *offset, size);
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
static long int debugfs_node_write(struct file *file, const char *data,
|
||||
long unsigned int size,
|
||||
long long int *offset)
|
||||
{
|
||||
long int retval;
|
||||
struct pva_kmd_file_ops *fops = file_inode(file)->i_private;
|
||||
retval = fops->write(fops->pdev, fops->file_data, data, *offset, size);
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
static const struct file_operations pva_linux_debugfs_fops = {
|
||||
.open = debugfs_node_open,
|
||||
.read = debugfs_node_read,
|
||||
.write = debugfs_node_write,
|
||||
.release = debugfs_node_release,
|
||||
};
|
||||
|
||||
void pva_kmd_debugfs_create_bool(struct pva_kmd_device *pva, const char *name,
|
||||
bool *pdata)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
struct dentry *de = props->debugfs;
|
||||
|
||||
debugfs_create_bool(name, 0644, de, pdata);
|
||||
}
|
||||
void pva_kmd_debugfs_create_u32(struct pva_kmd_device *pva, const char *name,
|
||||
uint32_t *pdata)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
struct dentry *de = props->debugfs;
|
||||
|
||||
debugfs_create_u32(name, 0644, de, pdata);
|
||||
}
|
||||
|
||||
void pva_kmd_debugfs_create_file(struct pva_kmd_device *pva, const char *name,
|
||||
struct pva_kmd_file_ops *pvafops)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
struct dentry *de = props->debugfs;
|
||||
struct file_operations *fops =
|
||||
(struct file_operations *)&pva_linux_debugfs_fops;
|
||||
struct dentry *file;
|
||||
|
||||
file = debugfs_create_file(name, 0644, de, pvafops, fops);
|
||||
ASSERT(file != NULL);
|
||||
}
|
||||
|
||||
void pva_kmd_debugfs_remove_nodes(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
struct dentry *de = props->debugfs;
|
||||
|
||||
debugfs_lookup_and_remove("stats_enable", de);
|
||||
debugfs_lookup_and_remove("vpu_debug", de);
|
||||
debugfs_lookup_and_remove("profile_level", de);
|
||||
debugfs_lookup_and_remove("vpu_stats", de);
|
||||
}
|
||||
@@ -0,0 +1,390 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA Corporation. All Rights Reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation
|
||||
* is strictly prohibited.
|
||||
*/
|
||||
#include <linux/of.h>
|
||||
#include <linux/clk.h>
|
||||
#include <linux/reset.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/firmware.h>
|
||||
#include <linux/version.h>
|
||||
#include <linux/nvhost.h>
|
||||
#include <linux/nvhost_t194.h>
|
||||
#include <linux/iommu.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <soc/tegra/virt/syscalls.h>
|
||||
#include <asm/io.h>
|
||||
|
||||
#include "pva_kmd_device.h"
|
||||
#include "pva_kmd_linux_device.h"
|
||||
#include "pva_kmd_device_memory.h"
|
||||
#include "pva_kmd_constants.h"
|
||||
#include "pva_kmd_silicon_utils.h"
|
||||
#include "pva_kmd_silicon_boot.h"
|
||||
|
||||
struct nvhost_device_data *
|
||||
pva_kmd_linux_device_get_properties(struct platform_device *pdev)
|
||||
{
|
||||
struct nvhost_device_data *props = platform_get_drvdata(pdev);
|
||||
return props;
|
||||
}
|
||||
|
||||
struct pva_kmd_linux_device_data *
|
||||
pva_kmd_linux_device_get_data(struct pva_kmd_device *device)
|
||||
{
|
||||
return (struct pva_kmd_linux_device_data *)device->plat_data;
|
||||
}
|
||||
|
||||
void pva_kmd_linux_device_set_data(struct pva_kmd_device *device,
|
||||
struct pva_kmd_linux_device_data *data)
|
||||
{
|
||||
device->plat_data = (void *)data;
|
||||
}
|
||||
|
||||
void pva_kmd_read_syncpt_val(struct pva_kmd_device *pva, uint32_t syncpt_id,
|
||||
uint32_t *syncpt_value)
|
||||
{
|
||||
int err = 0;
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
err = nvhost_syncpt_read_ext_check(props->pdev, syncpt_id,
|
||||
syncpt_value);
|
||||
if (err < 0) {
|
||||
FAULT("Failed to read syncpoint value\n");
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_get_syncpt_iova(struct pva_kmd_device *pva, uint32_t syncpt_id,
|
||||
uint64_t *syncpt_iova)
|
||||
{
|
||||
uint32_t offset = 0;
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
|
||||
struct platform_device *host_pdev =
|
||||
to_platform_device(props->pdev->dev.parent);
|
||||
|
||||
offset = nvhost_syncpt_unit_interface_get_byte_offset_ext(host_pdev,
|
||||
syncpt_id);
|
||||
*syncpt_iova = safe_addu64(pva->syncpt_ro_iova, (uint64_t)offset);
|
||||
}
|
||||
|
||||
void pva_kmd_linux_host1x_init(struct pva_kmd_device *pva)
|
||||
{
|
||||
phys_addr_t base;
|
||||
size_t size;
|
||||
int err = 0;
|
||||
uint32_t syncpt_page_size;
|
||||
uint32_t syncpt_offset[PVA_NUM_RW_SYNCPTS];
|
||||
dma_addr_t sp_start;
|
||||
struct platform_device *host_pdev;
|
||||
struct device *dev;
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
nvhost_syncpt_unit_interface_init(props->pdev);
|
||||
|
||||
host_pdev = to_platform_device(props->pdev->dev.parent);
|
||||
err = nvhost_syncpt_unit_interface_get_aperture(host_pdev, &base,
|
||||
&size);
|
||||
if (err < 0) {
|
||||
FAULT("Failed to get syncpt aperture\n");
|
||||
}
|
||||
/** Get page size of a syncpoint */
|
||||
syncpt_page_size =
|
||||
nvhost_syncpt_unit_interface_get_byte_offset_ext(host_pdev, 1);
|
||||
dev = &device_data->smmu_contexts[PVA_R5_SMMU_CONTEXT_ID]->dev;
|
||||
if (iommu_get_domain_for_dev(dev)) {
|
||||
sp_start = dma_map_resource(dev, base, size, DMA_TO_DEVICE,
|
||||
DMA_ATTR_SKIP_CPU_SYNC);
|
||||
if (dma_mapping_error(dev, sp_start)) {
|
||||
FAULT("Failed to pin RO syncpoints\n");
|
||||
}
|
||||
} else {
|
||||
FAULT("Failed to pin RO syncpoints\n");
|
||||
}
|
||||
pva->syncpt_ro_iova = sp_start;
|
||||
pva->syncpt_offset = syncpt_page_size;
|
||||
pva->num_syncpts = (size / syncpt_page_size);
|
||||
|
||||
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS; i++) {
|
||||
pva->syncpt_rw[i].syncpt_id = nvhost_get_syncpt_client_managed(
|
||||
props->pdev, "pva_syncpt");
|
||||
if (pva->syncpt_rw[i].syncpt_id == 0) {
|
||||
FAULT("Failed to get syncpt\n");
|
||||
}
|
||||
syncpt_offset[i] =
|
||||
nvhost_syncpt_unit_interface_get_byte_offset_ext(
|
||||
host_pdev, pva->syncpt_rw[i].syncpt_id);
|
||||
err = nvhost_syncpt_read_ext_check(
|
||||
props->pdev, pva->syncpt_rw[i].syncpt_id,
|
||||
&pva->syncpt_rw[i].syncpt_value);
|
||||
if (err < 0) {
|
||||
FAULT("Failed to read syncpoint value\n");
|
||||
}
|
||||
}
|
||||
|
||||
pva->syncpt_rw_iova =
|
||||
dma_map_resource(dev,
|
||||
safe_addu64(base, (uint64_t)syncpt_offset[0]),
|
||||
safe_mulu64((uint64_t)pva->syncpt_offset,
|
||||
(uint64_t)PVA_NUM_RW_SYNCPTS),
|
||||
DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
|
||||
if (dma_mapping_error(dev, pva->syncpt_rw_iova)) {
|
||||
FAULT("Failed to pin RW syncpoints\n");
|
||||
}
|
||||
pva->syncpt_rw[0].syncpt_iova = pva->syncpt_rw_iova;
|
||||
for (uint32_t i = 1; i < PVA_NUM_RW_SYNCPTS; i++) {
|
||||
if (safe_addu32(syncpt_offset[i - 1], pva->syncpt_offset) !=
|
||||
syncpt_offset[i]) {
|
||||
FAULT("RW syncpts are not contiguous\n");
|
||||
}
|
||||
pva->syncpt_rw[i].syncpt_iova = safe_addu64(
|
||||
pva->syncpt_rw_iova,
|
||||
safe_mulu64((uint64_t)pva->syncpt_offset, (uint64_t)i));
|
||||
}
|
||||
}
|
||||
|
||||
void pva_kmd_allocate_syncpts(struct pva_kmd_device *pva)
|
||||
{
|
||||
}
|
||||
|
||||
void pva_kmd_linux_host1x_deinit(struct pva_kmd_device *pva)
|
||||
{
|
||||
int err = 0;
|
||||
phys_addr_t base;
|
||||
size_t size;
|
||||
struct device *dev;
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
struct platform_device *host_pdev =
|
||||
to_platform_device(props->pdev->dev.parent);
|
||||
|
||||
err = nvhost_syncpt_unit_interface_get_aperture(host_pdev, &base,
|
||||
&size);
|
||||
if (err < 0) {
|
||||
FAULT("Failed to get syncpt aperture\n");
|
||||
}
|
||||
|
||||
dev = &device_data->smmu_contexts[PVA_R5_SMMU_CONTEXT_ID]->dev;
|
||||
if (iommu_get_domain_for_dev(dev)) {
|
||||
dma_unmap_resource(dev, pva->syncpt_ro_iova, size,
|
||||
DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
|
||||
dma_unmap_resource(dev, pva->syncpt_rw_iova,
|
||||
safe_mulu64((uint64_t)pva->syncpt_offset,
|
||||
(uint64_t)PVA_NUM_RW_SYNCPTS),
|
||||
DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
|
||||
} else {
|
||||
FAULT("Failed to unmap syncpts\n");
|
||||
}
|
||||
for (uint32_t i = 0; i < PVA_NUM_RW_SYNCPTS; i++) {
|
||||
nvhost_syncpt_put_ref_ext(props->pdev,
|
||||
pva->syncpt_rw[i].syncpt_id);
|
||||
pva->syncpt_rw[i].syncpt_id = 0;
|
||||
pva->syncpt_rw[i].syncpt_iova = 0;
|
||||
pva->syncpt_rw[i].syncpt_value = 0;
|
||||
}
|
||||
pva->syncpt_ro_iova = 0;
|
||||
pva->syncpt_rw_iova = 0;
|
||||
pva->syncpt_offset = 0;
|
||||
nvhost_syncpt_unit_interface_deinit(props->pdev);
|
||||
}
|
||||
|
||||
void pva_kmd_device_plat_init(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *plat_data =
|
||||
pva_kmd_zalloc_nofail(sizeof(struct pva_kmd_linux_device_data));
|
||||
|
||||
pva_kmd_linux_device_set_data(pva, plat_data);
|
||||
|
||||
/* Get SMMU context devices that were probed earlier and their SIDs */
|
||||
pva_kmd_linux_device_smmu_contexts_init(pva);
|
||||
}
|
||||
|
||||
void pva_kmd_device_plat_deinit(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_linux_host1x_deinit(pva);
|
||||
pva_kmd_free(pva_kmd_linux_device_get_data(pva));
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_power_on(struct pva_kmd_device *pva)
|
||||
{
|
||||
int err = 0;
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
|
||||
err = pm_runtime_get_sync(&props->pdev->dev);
|
||||
if (err < 0) {
|
||||
pm_runtime_put_noidle(&props->pdev->dev);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Power management operations are asynchronous. PVA may not have been
|
||||
* power-cycled between a power_off -> power_on call. Therefore, we need to
|
||||
* reset it here to make sure it is in a clean state. */
|
||||
reset_control_acquire(props->reset_control);
|
||||
reset_control_reset(props->reset_control);
|
||||
reset_control_release(props->reset_control);
|
||||
|
||||
out:
|
||||
return kernel_err2pva_err(err);
|
||||
}
|
||||
|
||||
void pva_kmd_power_off(struct pva_kmd_device *pva)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *props = device_data->pva_device_properties;
|
||||
|
||||
pm_runtime_mark_last_busy(&props->pdev->dev);
|
||||
pm_runtime_put(&props->pdev->dev);
|
||||
|
||||
/* Power management operations are asynchronous. We don't control when PVA
|
||||
* will really be powered down. However, we need to free memory after
|
||||
* this call. Therefore, we assert the reset line to stop PVA from any
|
||||
* further activity. */
|
||||
reset_control_acquire(props->reset_control);
|
||||
reset_control_assert(props->reset_control);
|
||||
reset_control_release(props->reset_control);
|
||||
}
|
||||
|
||||
uint32_t pva_kmd_get_syncpt_ro_offset(struct pva_kmd_device *pva)
|
||||
{
|
||||
return safe_subu64(pva->syncpt_ro_iova, FW_SHARED_MEMORY_START);
|
||||
}
|
||||
uint32_t pva_kmd_get_syncpt_rw_offset(struct pva_kmd_device *pva)
|
||||
{
|
||||
return safe_subu64(pva->syncpt_rw_iova, FW_SHARED_MEMORY_START);
|
||||
}
|
||||
|
||||
enum pva_error pva_kmd_read_fw_bin(struct pva_kmd_device *pva)
|
||||
{
|
||||
enum pva_error err = PVA_SUCCESS;
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *device_props =
|
||||
device_data->pva_device_properties;
|
||||
struct pva_kmd_device_memory *fw_bin_mem;
|
||||
|
||||
const struct firmware *fw_ucode;
|
||||
int kerr = request_firmware(&fw_ucode, device_props->firmware_name,
|
||||
&device_props->pdev->dev);
|
||||
if (kerr < 0) {
|
||||
err = kernel_err2pva_err(kerr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
fw_bin_mem = pva_kmd_device_memory_alloc_map(
|
||||
safe_pow2_roundup_u64(fw_ucode->size, SIZE_4KB), pva,
|
||||
PVA_ACCESS_RW, PVA_R5_SMMU_CONTEXT_ID);
|
||||
if (fw_bin_mem == NULL) {
|
||||
err = PVA_NOMEM;
|
||||
goto release;
|
||||
}
|
||||
|
||||
memcpy(fw_bin_mem->va, fw_ucode->data, fw_ucode->size);
|
||||
|
||||
pva->fw_bin_mem = fw_bin_mem;
|
||||
release:
|
||||
release_firmware(fw_ucode);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
void pva_kmd_aperture_write(struct pva_kmd_device *pva,
|
||||
enum pva_kmd_reg_aperture aperture, uint32_t reg,
|
||||
uint32_t val)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *device_props =
|
||||
device_data->pva_device_properties;
|
||||
|
||||
void __iomem *addr = device_props->aperture[aperture] + reg;
|
||||
|
||||
writel(val, addr);
|
||||
}
|
||||
|
||||
uint32_t pva_kmd_aperture_read(struct pva_kmd_device *pva,
|
||||
enum pva_kmd_reg_aperture aperture, uint32_t reg)
|
||||
{
|
||||
struct pva_kmd_linux_device_data *device_data =
|
||||
pva_kmd_linux_device_get_data(pva);
|
||||
struct nvhost_device_data *device_props =
|
||||
device_data->pva_device_properties;
|
||||
|
||||
void __iomem *addr = device_props->aperture[aperture] + reg;
|
||||
|
||||
return readl(addr);
|
||||
}
|
||||
|
||||
enum pva_error kernel_err2pva_err(int err)
|
||||
{
|
||||
if (err >= 0) {
|
||||
return PVA_SUCCESS;
|
||||
}
|
||||
|
||||
switch (err) {
|
||||
case -EINVAL:
|
||||
return PVA_INVAL;
|
||||
case -EINTR:
|
||||
return PVA_EINTR;
|
||||
default:
|
||||
return PVA_UNKNOWN_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned long pva_kmd_copy_data_from_user(void *dst, const void *src,
|
||||
uint64_t size)
|
||||
{
|
||||
return copy_from_user(dst, src, size);
|
||||
}
|
||||
|
||||
unsigned long pva_kmd_copy_data_to_user(void __user *to, const void *from,
|
||||
unsigned long size)
|
||||
{
|
||||
return copy_to_user(to, from, size);
|
||||
}
|
||||
|
||||
unsigned long pva_kmd_strtol(const char *str, int base)
|
||||
{
|
||||
unsigned long val;
|
||||
int ret;
|
||||
|
||||
ret = kstrtoul(str, base, &val);
|
||||
if (ret < 0)
|
||||
return 0;
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
/* TODO: Enable HVC call once HVC fix is available on dev-main */
|
||||
//static void pva_kmd_config_regs(void)
|
||||
//{
|
||||
//bool hv_err = true;
|
||||
//hv_err = hyp_pva_config_regs();
|
||||
//ASSERT(hv_err == true);
|
||||
//ASSERT(false);
|
||||
//}
|
||||
|
||||
void pva_kmd_config_evp_seg_scr_regs(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_config_evp_seg_regs(pva);
|
||||
pva_kmd_config_scr_regs(pva);
|
||||
}
|
||||
|
||||
void pva_kmd_config_sid_regs(struct pva_kmd_device *pva)
|
||||
{
|
||||
pva_kmd_config_sid(pva);
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff