From a3f1b7ea3396b05883518ef262da7d5e77d803f9 Mon Sep 17 00:00:00 2001 From: svcmobrel-release Date: Fri, 19 Sep 2025 10:10:49 -0700 Subject: [PATCH] Updating prebuilts and/or headers 2eba699906039d6615aae4967f6ea79bfe44a40a - event_sample_app/block_pool.c f3abb0a884f0647204ad32ff51255c4712e52120 - event_sample_app/Makefile 9ee49033e077ac5c8bf458a04c91dd3dbed9633d - event_sample_app/event_loop.h b33adce6eb1bbc7af23f6c37b6a635479e18a66a - event_sample_app/block_returnsync.c a56041c06b6bc1d3812b72b399d7d78dd7895485 - event_sample_app/block_limiter.c ca34c957759f7a010f0cbbbf9bedc03a2c98092b - event_sample_app/block_c2c.c 8d6d0ec3aa8e374a1d2a5fedc9dd24ff7bbdb731 - event_sample_app/block_multicast.c a76149a2531899e35843d939f60ad8979d8cf65f - event_sample_app/block_consumer_uc1.c 9da8763e4af4b4b7278507a3ebfe2c68a7a24585 - event_sample_app/util.h 2bf7e1383d6e8913c9b0a6a8bdd48fe63d8098d0 - event_sample_app/block_producer_uc1.c a54abf82eaa2d888e379ab4596ba68ce264e80b5 - event_sample_app/block_info.h 080a6efe263be076c7046e70e31098c2bbed0f6d - event_sample_app/block_presentsync.c 7dd10e5ea71f0c4a09bbe1f9f148f67a13ee098c - event_sample_app/util.c bc1a6f9017b28e5707c166a658a35e6b3986fdf4 - event_sample_app/usecase1.h 317f43efc59638bf1eae8303f0c79eafb059241a - event_sample_app/block_ipc.c 40361c8f0b68f7d5207db2466ce08c19c0bf1c90 - event_sample_app/event_loop_service.c efad113d0107e5d8f90146f3102a7c0ed22f1a35 - event_sample_app/event_loop_threads.c 2908615cebcf36330b9850c57e8745bf324867b2 - event_sample_app/block_queue.c 36ed68eca1a7800cf0d94e763c9fc352ee8cda1e - event_sample_app/block_common.c 675f75d61bd0226625a8eaaf0e503c9e976c8d61 - event_sample_app/main.c c3b26619dd07d221e953fc5dc29a50dcb95a8b97 - rawstream/Makefile 1fbb82e2281bb2e168c87fd20903bbed898ca160 - rawstream/rawstream_cuda.c 1d96498fe3c922f143f7e50e0a32b099714060ad - rawstream/rawstream_consumer.c d077dafc9176686f6d081026225325c2a303a60e - rawstream/rawstream_producer.c 54ae655edddda7dcabe22fbf0b27c3f617978851 - rawstream/rawstream.h d5ffeef3c7ad2af6f6f31385db7917b5ef9a7438 - rawstream/rawstream_ipc_linux.c 81e3d6f8ff5252797a7e9e170b74df6255f54f1b - rawstream/rawstream_main.c Change-Id: I0f4e671693eb0addfe8d0e6532cc8f240cb6c778 --- commitFile.txt | 29 + event_sample_app/00README.txt | 258 +++++ event_sample_app/Makefile | 146 +++ event_sample_app/block_c2c.c | 170 ++++ event_sample_app/block_common.c | 279 ++++++ event_sample_app/block_consumer_uc1.c | 1200 ++++++++++++++++++++++ event_sample_app/block_info.h | 284 ++++++ event_sample_app/block_ipc.c | 199 ++++ event_sample_app/block_limiter.c | 56 ++ event_sample_app/block_multicast.c | 57 ++ event_sample_app/block_pool.c | 743 ++++++++++++++ event_sample_app/block_presentsync.c | 55 + event_sample_app/block_producer_uc1.c | 1339 +++++++++++++++++++++++++ event_sample_app/block_queue.c | 57 ++ event_sample_app/block_returnsync.c | 55 + event_sample_app/event_loop.h | 51 + event_sample_app/event_loop_service.c | 339 +++++++ event_sample_app/event_loop_threads.c | 129 +++ event_sample_app/main.c | 1287 ++++++++++++++++++++++++ event_sample_app/usecase1.h | 55 + event_sample_app/util.c | 66 ++ event_sample_app/util.h | 33 + push_info.txt | 1 + rawstream/Makefile | 126 +++ rawstream/README.txt | 41 + rawstream/rawstream.h | 152 +++ rawstream/rawstream_consumer.c | 670 +++++++++++++ rawstream/rawstream_cuda.c | 234 +++++ rawstream/rawstream_ipc_linux.c | 172 ++++ rawstream/rawstream_main.c | 230 +++++ rawstream/rawstream_producer.c | 752 ++++++++++++++ 31 files changed, 9265 insertions(+) 
create mode 100644 commitFile.txt create mode 100644 event_sample_app/00README.txt create mode 100644 event_sample_app/Makefile create mode 100644 event_sample_app/block_c2c.c create mode 100644 event_sample_app/block_common.c create mode 100644 event_sample_app/block_consumer_uc1.c create mode 100644 event_sample_app/block_info.h create mode 100644 event_sample_app/block_ipc.c create mode 100644 event_sample_app/block_limiter.c create mode 100644 event_sample_app/block_multicast.c create mode 100644 event_sample_app/block_pool.c create mode 100644 event_sample_app/block_presentsync.c create mode 100644 event_sample_app/block_producer_uc1.c create mode 100644 event_sample_app/block_queue.c create mode 100644 event_sample_app/block_returnsync.c create mode 100644 event_sample_app/event_loop.h create mode 100644 event_sample_app/event_loop_service.c create mode 100644 event_sample_app/event_loop_threads.c create mode 100644 event_sample_app/main.c create mode 100644 event_sample_app/usecase1.h create mode 100644 event_sample_app/util.c create mode 100644 event_sample_app/util.h create mode 100644 push_info.txt create mode 100644 rawstream/Makefile create mode 100644 rawstream/README.txt create mode 100644 rawstream/rawstream.h create mode 100644 rawstream/rawstream_consumer.c create mode 100644 rawstream/rawstream_cuda.c create mode 100644 rawstream/rawstream_ipc_linux.c create mode 100644 rawstream/rawstream_main.c create mode 100644 rawstream/rawstream_producer.c diff --git a/commitFile.txt b/commitFile.txt new file mode 100644 index 0000000..fcc85d9 --- /dev/null +++ b/commitFile.txt @@ -0,0 +1,29 @@ +Updating prebuilts and/or headers + +2eba699906039d6615aae4967f6ea79bfe44a40a - event_sample_app/block_pool.c +f3abb0a884f0647204ad32ff51255c4712e52120 - event_sample_app/Makefile +9ee49033e077ac5c8bf458a04c91dd3dbed9633d - event_sample_app/event_loop.h +b33adce6eb1bbc7af23f6c37b6a635479e18a66a - event_sample_app/block_returnsync.c +a56041c06b6bc1d3812b72b399d7d78dd7895485 - event_sample_app/block_limiter.c +ca34c957759f7a010f0cbbbf9bedc03a2c98092b - event_sample_app/block_c2c.c +8d6d0ec3aa8e374a1d2a5fedc9dd24ff7bbdb731 - event_sample_app/block_multicast.c +a76149a2531899e35843d939f60ad8979d8cf65f - event_sample_app/block_consumer_uc1.c +9da8763e4af4b4b7278507a3ebfe2c68a7a24585 - event_sample_app/util.h +2bf7e1383d6e8913c9b0a6a8bdd48fe63d8098d0 - event_sample_app/block_producer_uc1.c +a54abf82eaa2d888e379ab4596ba68ce264e80b5 - event_sample_app/block_info.h +080a6efe263be076c7046e70e31098c2bbed0f6d - event_sample_app/block_presentsync.c +7dd10e5ea71f0c4a09bbe1f9f148f67a13ee098c - event_sample_app/util.c +bc1a6f9017b28e5707c166a658a35e6b3986fdf4 - event_sample_app/usecase1.h +317f43efc59638bf1eae8303f0c79eafb059241a - event_sample_app/block_ipc.c +40361c8f0b68f7d5207db2466ce08c19c0bf1c90 - event_sample_app/event_loop_service.c +efad113d0107e5d8f90146f3102a7c0ed22f1a35 - event_sample_app/event_loop_threads.c +2908615cebcf36330b9850c57e8745bf324867b2 - event_sample_app/block_queue.c +36ed68eca1a7800cf0d94e763c9fc352ee8cda1e - event_sample_app/block_common.c +675f75d61bd0226625a8eaaf0e503c9e976c8d61 - event_sample_app/main.c +c3b26619dd07d221e953fc5dc29a50dcb95a8b97 - rawstream/Makefile +1fbb82e2281bb2e168c87fd20903bbed898ca160 - rawstream/rawstream_cuda.c +1d96498fe3c922f143f7e50e0a32b099714060ad - rawstream/rawstream_consumer.c +d077dafc9176686f6d081026225325c2a303a60e - rawstream/rawstream_producer.c +54ae655edddda7dcabe22fbf0b27c3f617978851 - rawstream/rawstream.h 
+d5ffeef3c7ad2af6f6f31385db7917b5ef9a7438 - rawstream/rawstream_ipc_linux.c +81e3d6f8ff5252797a7e9e170b74df6255f54f1b - rawstream/rawstream_main.c diff --git a/event_sample_app/00README.txt b/event_sample_app/00README.txt new file mode 100644 index 0000000..bb4b6e7 --- /dev/null +++ b/event_sample_app/00README.txt @@ -0,0 +1,258 @@ +SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: LicenseRef-NvidiaProprietary + +NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +property and proprietary rights in and to this material, related +documentation and any modifications thereto. Any use, reproduction, +disclosure or distribution of this material and related documentation +without an express license agreement from NVIDIA CORPORATION or +its affiliates is strictly prohibited. + +NvSciStream Event Loop Driven Sample App - README + +--- +# nvscistream_event_sample - NvSciStream Sample App + +## Description + +This directory contains an NvSciStream sample application that +supports a variety of use cases, using an event-loop driven model. +Once the stream is fully connected, all further setup and streaming +operations are triggered by events, processed either by a single +NvSciEvent-driven thread or separate threads which wait for events +on each block. The former is the preferred approach for implementing +NvSciStream applications. In addition to those events which NvSci +itself generates, any other event which can be bound to an NvSciEvent +can be added to the event loop. This allows for robust applications +which can handle events regardless of the order in which they occur. + +To use this sample for writing your own applications: + +* See main.c for examples of how to do top level application setup and + how to select the blocks needed for your use case and connect them + all together. +* See the descriptions in the usecase*.h files to determine which use cases + involve the producer and consumer engines that you are interested in. +* See the appropriate block_*.c files for examples of creating the + necessary blocks and handling the events that they encounter. + See the block_producer_*.c and block_consumer_*.c files for examples of how + to map the relevant engines to and from NvSci. +* See the appropriate event_loop_*.c file for your chosen event handling + method. + +## Build the application + +The NvSciStream sample includes source code and a Makefile. +Navigate to the sample application directory to build the application: + + make clean + make + +## Examples of how to run the sample application: + +* NOTE: +* Inter-process and inter-chip test cases must be run with sudo. +* NvMedia/CUDA stream (use case 2) of the sample application is not supported + on x86 and Jetson Linux devices. +* Inter-chip use cases are not supported on Jetson Linux devices. +* Update the NvIpc/PCIe endpoint accordingly. 
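+* Options from the examples below can be combined; for instance, a
+  single-process stream with two consumers, where the second consumer
+  uses the mailbox mode (an illustrative combination of documented options):
+
+      ./nvscistream_event_sample -m 2 -q 1 m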
+
+Single-process, single-consumer CUDA/CUDA stream that uses the default event
+service:
+
+    ./nvscistream_event_sample
+
+Single-process, single-consumer stream that uses the threaded event handling:
+
+    ./nvscistream_event_sample -e t
+
+Single-process NvMedia/CUDA stream with yuv format:
+    ./nvscistream_event_sample -u 2 -s y
+
+Single-process NvMedia/CUDA stream with three consumers, where the second uses
+the mailbox mode:
+
+    ./nvscistream_event_sample -u 2 -m 3 -q 1 m
+
+Multi-process CUDA/CUDA stream with three consumers, one in the same
+process as the producer, and the other two in separate processes. The
+first and the third consumers use the mailbox mode:
+
+    ./nvscistream_event_sample -m 3 -p -c 0 -q 0 m &
+    ./nvscistream_event_sample -c 1 -c 2 -q 2 m
+
+Multi-process CUDA/CUDA stream with three consumers, one in the same
+process as the producer, and the other two in separate processes.
+To simulate the case with a less trusted consumer, one of the consumer
+processes is set with lower priority. A limiter block is used to restrict
+this consumer to hold at most one packet. The total number of packets is
+increased to five.
+
+Linux example:
+
+    ./nvscistream_event_sample -m 3 -f 5 -p -c 0 -l 2 1 &
+    ./nvscistream_event_sample -c 1 &
+    nice -n 19 ./nvscistream_event_sample -c 2 &
+    # Makes the third process as nice as possible.
+
+QNX example:
+
+    ./nvscistream_event_sample -m 3 -f 5 -p -c 0 -l 2 1 &
+    ./nvscistream_event_sample -c 1 &
+    nice -n 1 ./nvscistream_event_sample -c 2 &
+    # Reduces the priority level of the third process by 1.
+
+Multi-process CUDA/CUDA stream with two consumers, one in the same
+process as the producer, and the other in a separate process. Both
+processes enable the endpoint information option:
+
+    ./nvscistream_event_sample -m 2 -p -c 0 -i &
+    ./nvscistream_event_sample -c 1 -i
+
+Multi-process CUDA/CUDA stream with extra validation steps for an ASIL-D
+process (not supported on x86 or Jetson Linux devices):
+    ./nvscistream_event_sample -u 3 -p &
+    ./nvscistream_event_sample -u 3 -c 0
+
+Multi-process CUDA/CUDA stream using an external event service to handle
+internal I/O messages across the process boundary:
+    ./nvscistream_event_sample -p -E &
+    ./nvscistream_event_sample -c 0 -E
+
+Multi-process CUDA/CUDA stream with one consumer on another SoC.
+The consumer has the FIFO queue attached to the C2C IpcSrc block, and
+a three-packet pool attached to the C2C IpcDst block. It uses IPC channel
+nvscic2c_pcie_s0_c5_1 <-> nvscic2c_pcie_s0_c6_1 for C2C communication.
+
+    ./nvscistream_event_sample -P 0 nvscic2c_pcie_s0_c5_1 -Q 0 f
+    # Run the below command on another OS running on the peer SoC.
+    ./nvscistream_event_sample -C 0 nvscic2c_pcie_s0_c6_1 -F 0 3
+
+Multi-process CUDA/CUDA stream with four consumers, one in the same
+process as the producer, one in another process but in the same OS as the
+producer, and two in another process on another OS running on a peer SoC.
+The third and fourth consumers have a mailbox queue attached to the C2C
+IpcSrc block, and a five-packet pool attached to the C2C IpcDst block.
+The third consumer uses nvscic2c_pcie_s0_c5_1 <-> nvscic2c_pcie_s0_c6_1 for
+C2C communication. The fourth consumer uses nvscic2c_pcie_s0_c5_2 <->
+nvscic2c_pcie_s0_c6_2 for C2C communication.
+
+    ./nvscistream_event_sample -m 4 -c 0 -q 0 m -Q 2 m -Q 3 m -P 2 nvscic2c_pcie_s0_c5_1 -P 3 nvscic2c_pcie_s0_c5_2 &
+    ./nvscistream_event_sample -c 1 -q 1 m
+    # Run the below command on another OS running on the peer SoC.
+    ./nvscistream_event_sample -C 2 nvscic2c_pcie_s0_c6_1 -q 2 f -F 2 5 -C 3 nvscic2c_pcie_s0_c6_2 -q 3 m -F 3 5
+
+#Example commands for inter-process late attach usecase
+Multi-process CUDA/CUDA stream with one early consumer and one late-attached
+consumer. The producer and early consumer processes are configured to stream
+100000 frames, whereas the late-attached consumer process is configured to
+receive 10000 frames.
+    # Run the below commands to launch the producer and early consumer processes.
+    ./nvscistream_event_sample -m 2 -r 1 -p &
+    ./nvscistream_event_sample -c 0 -k 0 100000 &
+    # Run the below command after some delay to launch the late-attached consumer process.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -c 1 -k 1 10000 &
+
+Multi-process CUDA/CUDA stream with one early consumer and two late-attached
+consumers. The producer and early consumer processes are configured to stream
+100000 frames, whereas late-attached consumer process one is configured to
+receive 10000 frames and late-attached consumer process two is configured to
+receive 50000 frames.
+    # Run the below commands to launch the producer and early consumer processes.
+    ./nvscistream_event_sample -m 3 -r 2 -p &
+    ./nvscistream_event_sample -c 0 -k 0 100000 &
+    # Run the below command after some delay to launch late-attached consumer process one.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -c 1 -k 1 10000 &
+    # Run the below command after some delay to launch late-attached consumer process two.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -c 2 -k 2 50000 &
+
+#Example commands for inter-process re-attach usecase
+Multi-process CUDA/CUDA stream with one early consumer and two late-attached
+consumers. The producer and early consumer processes are configured to stream
+100000 frames, whereas late-attached consumer process one is configured to
+receive 10000 frames and late-attached consumer process two is configured to
+receive 50000 frames.
+Once late-attached consumer process one completes streaming, re-attach it to
+receive 5000 frames.
+    # Run the below commands to launch the producer and early consumer processes.
+    ./nvscistream_event_sample -m 3 -r 2 -p &
+    ./nvscistream_event_sample -c 0 -k 0 100000 &
+    # Run the below command after some delay to launch late-attached consumer process one.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -c 1 -k 1 10000 &
+    # Run the below command after some delay to launch late-attached consumer process two.
+    sleep 1;
+    ./nvscistream_event_sample -L -c 2 -k 2 50000 &
+    # After late-attached consumer process one completes, re-attach it.
+    ./nvscistream_event_sample -L -c 1 -k 1 5000 &
+
+Limitations with C2C late/re-attach:
+This sample app has the following limitations.
+1. For C2C late/re-attach, this sample app does not support the IPC consumer
+being the only early consumer with all remaining consumers as C2C
+late-attached. This is because the logic for setting static attributes for
+late attach has not been added.
+2. A C2C consumer can act as an IPC consumer during late-/re-attach, but an
+IPC consumer cannot be made a C2C consumer during late-/re-attach.
+
+#Example commands for inter-chip late attach usecase
+Multi-process CUDA/CUDA stream with one early C2C consumer and one C2C
+late-attached consumer. The producer and early C2C consumer processes are
+configured to stream 100000 frames, whereas the late-attached C2C consumer
+process is configured to receive 10000 frames.
+The early consumer uses nvscic2c_pcie_s0_c5_1 <-> nvscic2c_pcie_s0_c6_1 for
+C2C communication. The late-attached consumer uses nvscic2c_pcie_s0_c5_2 <->
+nvscic2c_pcie_s0_c6_2 for C2C communication.
+
+    # Run the below command to launch the producer on SOC1.
+    ./nvscistream_event_sample -m 2 -r 1 -P 0 nvscic2c_pcie_s0_c5_1 -P 1 nvscic2c_pcie_s0_c5_2 &
+    # Run the below command to launch the early consumer process on SOC2.
+    ./nvscistream_event_sample -C 0 nvscic2c_pcie_s0_c6_1 -k 0 100000 &
+    # Run the below command after some delay to launch the late-attached consumer process on SOC2.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -C 1 nvscic2c_pcie_s0_c6_2 -k 1 10000 &
+
+Multi-process CUDA/CUDA stream with one early C2C consumer and two C2C
+late-attached consumers. The producer and early C2C consumer processes are
+configured to stream 100000 frames, whereas late-attached C2C consumer
+process one is configured to receive 10000 frames and late-attached C2C
+consumer process two is configured to receive 10000 frames.
+The early consumer uses nvscic2c_pcie_s0_c5_1 <-> nvscic2c_pcie_s0_c6_1 for
+C2C communication. Late-attached consumer one uses nvscic2c_pcie_s0_c5_2 <->
+nvscic2c_pcie_s0_c6_2 for C2C communication and late-attached consumer two
+uses nvscic2c_pcie_s0_c5_3 <-> nvscic2c_pcie_s0_c6_3 for C2C communication.
+
+    # Run the below command to launch the producer on SOC1.
+    ./nvscistream_event_sample -m 3 -r 2 -P 0 nvscic2c_pcie_s0_c5_1 -P 1 nvscic2c_pcie_s0_c5_2 -P 2 nvscic2c_pcie_s0_c5_3 &
+    # Run the below command to launch the early consumer process on SOC2.
+    ./nvscistream_event_sample -C 0 nvscic2c_pcie_s0_c6_1 -k 0 100000 &
+    # Run the below command after some delay to launch late-attached consumer process one.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -C 1 nvscic2c_pcie_s0_c6_2 -k 1 10000 &
+    # Run the below command after some delay to launch late-attached consumer process two.
+    sleep 1;
+    ./nvscistream_event_sample -L -C 2 nvscic2c_pcie_s0_c6_3 -k 2 10000 &
+
+#Example commands for inter-chip/process re-attach usecase
+Multi-process CUDA/CUDA stream with one early consumer and two late-attached
+consumers. The producer and early consumer processes are configured to stream
+100000 frames, whereas late-attached consumer process one is configured to
+receive 10000 frames and late-attached consumer process two is configured to
+receive 50000 frames.
+Once late-attached consumer process one completes streaming, re-attach it to
+receive 5000 frames.
+Once late-attached consumer process two completes streaming, re-attach it as
+an IPC consumer to receive 5000 frames.
+
+    # Run the below command to launch the producer on SOC1.
+    ./nvscistream_event_sample -m 3 -r 2 -P 0 nvscic2c_pcie_s0_c5_1 -P 1 nvscic2c_pcie_s0_c5_2 -P 2 nvscic2c_pcie_s0_c5_3 &
+    # Run the below command to launch the early consumer process on SOC2.
+    ./nvscistream_event_sample -C 0 nvscic2c_pcie_s0_c6_1 -k 0 100000 &
+    # Run the below command after some delay to launch late-attached consumer process one.
+    sleep 1; # This 1s delay will let the producer and consumer enter the streaming phase.
+    ./nvscistream_event_sample -L -C 1 nvscic2c_pcie_s0_c6_2 -k 1 10000 &
+    # Run the below command after some delay to launch late-attached consumer process two.
+    sleep 1;
+    ./nvscistream_event_sample -L -C 2 nvscic2c_pcie_s0_c6_3 -k 2 50000 &
+    # Once late-attached consumer process one completes streaming,
+    # re-attach it to receive 5000 frames.
+    ./nvscistream_event_sample -L -C 1 nvscic2c_pcie_s0_c6_2 -k 1 5000 &
+    # Once late-attached consumer process two completes streaming,
+    # re-attach it as an IPC consumer on SOC1 to receive 5000 frames.
+    ./nvscistream_event_sample -L -c 2 -k 2 5000 &
diff --git a/event_sample_app/Makefile b/event_sample_app/Makefile
new file mode 100644
index 0000000..869e21c
--- /dev/null
+++ b/event_sample_app/Makefile
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Location of common libraries
+LIB_DIR = /usr/lib/aarch64-linux-gnu
+# NOTE: This directory PATH will be moved from "tegra" to "nvidia".
+TEGRA_LIB_DIR ?= /usr/lib/aarch64-linux-gnu/tegra
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+# Location of NVSCI header
+NVSCI_HEADER_DIR ?= /usr/include/nvsci_headers
+NVSCI_LIB_DIR = $(TEGRA_LIB_DIR)
+
+GCC ?= g++
+NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(GCC)
+
+# internal flags
+NVCCFLAGS :=
+CCFLAGS :=
+LDFLAGS :=
+
+# Extra user flags
+EXTRA_NVCCFLAGS ?=
+EXTRA_LDFLAGS ?=
+EXTRA_CCFLAGS ?=
+
+override abi := aarch64
+LDFLAGS += --dynamic-linker=/lib/ld-linux-aarch64.so.1
+
+# Debug build flags
+dbg = 0
+ifeq ($(dbg),1)
+    NVCCFLAGS += -g -G
+    TARGET := debug
+else
+    TARGET := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+################################################################################
+
+# Common includes and paths
+INCLUDES := -I./
+LIBRARIES := -L$(LIB_DIR)
+
+CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
+CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+
+CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+ifeq ("$(CUDALIB)","")
+    $(error ERROR - libcuda.so not found, CUDA Driver is not installed or CUDA_PATH is not correctly set.)
+else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda -lrt +endif + +# Includes and paths for NVSCI libraries +NVSCIBUFHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvscibuf.h -print 2>/dev/null) +NVSCISYNCHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvscisync.h -print 2>/dev/null) +NVSCISTREAMHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvscistream.h -print 2>/dev/null) +NVSCIEVENTHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvscievent.h -print 2>/dev/null) +NVSCIIPCHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvsciipc.h -print 2>/dev/null) + +ifeq ("$(NVSCIBUFHEADER)","") + $(error ERROR - nvscibuf.h not found in $(NVSCI_HEADER_DIR)) +endif +ifeq ("$(NVSCISYNCHEADER)","") + $(error ERROR - nvscisync.h not found in $(NVSCI_HEADER_DIR)) +endif +ifeq ("$(NVSCISTREAMHEADER)","") + $(error ERROR - nvscistream.h not found in $(NVSCI_HEADER_DIR)) +endif +ifeq ("$(NVSCIEVENTHEADER)","") + $(error ERROR - nvscievent.h not found in $(NVSCI_HEADER_DIR)) +endif +ifeq ("$(NVSCIIPCHEADER)","") + $(error ERROR - nvsciipc.h not found in $(NVSCI_HEADER_DIR)) +endif + +INCLUDES += -I$(NVSCI_HEADER_DIR) +LIBRARIES += -L$(NVSCI_LIB_DIR) -lnvscibuf -lnvscisync -lnvscievent -lnvsciipc -lnvscistream + +ALL_CCFLAGS += --std=c++11 --threads 0 + +################################################################################ + +# Target rules +OUTPUT := nvscistream_event_sample + +all: build + +build: $(OUTPUT) + +OBJ := main.o +OBJ += block_common.o +OBJ += block_c2c.o +OBJ += block_consumer_uc1.o +OBJ += block_ipc.o +OBJ += block_limiter.o +OBJ += block_multicast.o +OBJ += block_pool.o +OBJ += block_presentsync.o +OBJ += block_producer_uc1.o +OBJ += block_queue.o +OBJ += block_returnsync.o +OBJ += event_loop_service.o +OBJ += event_loop_threads.o +OBJ += util.o + + +%.o: %.c + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) -o $@ -c $< + +$(OUTPUT): $(OBJ) + $(NVCC) $(ALL_LDFLAGS) -o $@ $+ $(LIBRARIES) + +run: build + $(OUTPUT) + +testrun: build + +clean: + rm -f $(OBJ) $(OUTPUT) + +clobber: clean diff --git a/event_sample_app/block_c2c.c b/event_sample_app/block_c2c.c new file mode 100644 index 0000000..2ce75cc --- /dev/null +++ b/event_sample_app/block_c2c.c @@ -0,0 +1,170 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */
+
+/*
+ * NvSciStream Event Loop Driven Sample App - C2C block
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "nvsciipc.h"
+#include "nvscistream.h"
+#include "block_info.h"
+#include "event_loop.h"
+
+/* Create and register a new C2C src block */
+int32_t createC2cSrc(
+    NvSciStreamBlock* c2cSrc,
+    const char* channel,
+    NvSciStreamBlock queue)
+{
+    NvSciError err;
+    uint32_t i;
+    uint32_t slot = 0;
+
+    /* Create a data structure to track the block's status */
+    BlockData* blockData = createCommon("C2cSrc", 0);
+    if (NULL == blockData) {
+        return 0;
+    }
+
+    for (i=0; i< MAX_CONSUMERS; i++) {
+        if (!strcmp(ipcEP[i].c2cChannel, channel)) {
+            slot = i;
+            break;
+        }
+    }
+
+    /* Open the named channel */
+    err = NvSciIpcOpenEndpoint(channel, &ipcEP[slot].c2cEndpoint);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to open channel (%s) for C2C src\n",
+               err, channel);
+        deleteCommon(blockData);
+        return 0;
+    }
+    err = NvSciIpcResetEndpointSafe(ipcEP[slot].c2cEndpoint);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to reset IPC endpoint\n", err);
+    }
+
+    /* Create a C2C src block */
+    err = NvSciStreamIpcSrcCreate2(ipcEP[slot].c2cEndpoint,
+                                   sciSyncModule,
+                                   sciBufModule,
+                                   queue,
+                                   &blockData->block);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to create C2C src block\n", err);
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    /* Register block with event handling mechanism */
+    if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) {
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    *c2cSrc = blockData->block;
+    ipcEP[slot].ipcBlock = blockData->block;
+    ipcEP[slot].c2cOpened = true;
+    ipcEP[slot].c2cConnected = true;
+
+    return 1;
+}
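+
+/*
+ * Usage sketch (illustrative only; the real block wiring for each use
+ * case is done in main.c). "queue" is assumed here to be a queue block
+ * that was created beforehand:
+ *
+ *   NvSciStreamBlock c2cSrc;
+ *   if (!createC2cSrc(&c2cSrc, "nvscic2c_pcie_s0_c5_1", queue)) {
+ *       // handle failure; createC2cSrc returns 0 on error
+ *   }
+ */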
+
+/* Create and register a new C2C src block,
+ * using an endpoint that has already been opened
+ */
+int32_t createC2cSrc2(
+    NvSciStreamBlock* c2cSrc,
+    NvSciIpcEndpoint endpoint,
+    NvSciStreamBlock queue)
+{
+    NvSciError err;
+
+    /* Create a data structure to track the block's status */
+    BlockData* blockData = createCommon("C2cSrc", 0);
+    if (NULL == blockData) {
+        return 0;
+    }
+
+    /* Create a C2C src block */
+    err = NvSciStreamIpcSrcCreate2(endpoint,
+                                   sciSyncModule,
+                                   sciBufModule,
+                                   queue,
+                                   &blockData->block);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to create C2C src block\n", err);
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    /* Register block with event handling mechanism */
+    if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) {
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    *c2cSrc = blockData->block;
+    return 1;
+}
+
+/* Create and register a new C2C dst block */
+int32_t createC2cDst(
+    NvSciStreamBlock* c2cDst,
+    const char* channel,
+    NvSciStreamBlock pool)
+{
+    NvSciError err;
+
+    /* Create a data structure to track the block's status */
+    BlockData* blockData = createCommon("C2cDst", 0);
+    if (NULL == blockData) {
+        return 0;
+    }
+
+    /* Open the named channel */
+    err = NvSciIpcOpenEndpoint(channel, &ipcEP[0].ipcEndpoint);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to open channel (%s) for C2C dst\n",
+               err, channel);
+        deleteCommon(blockData);
+        return 0;
+    }
+    err = NvSciIpcResetEndpointSafe(ipcEP[0].ipcEndpoint);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to reset IPC endpoint\n", err);
+    }
+
+    /* Create a C2C dst block */
+    err = NvSciStreamIpcDstCreate2(ipcEP[0].ipcEndpoint,
+                                   sciSyncModule,
+                                   sciBufModule,
+                                   pool,
+                                   &blockData->block);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to create C2C dst block\n", err);
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    /* Register block with event handling mechanism */
+    if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) {
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    *c2cDst = blockData->block;
+    return 1;
+}
diff --git a/event_sample_app/block_common.c b/event_sample_app/block_common.c
new file mode 100644
index 0000000..3922068
--- /dev/null
+++ b/event_sample_app/block_common.c
@@ -0,0 +1,279 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+/*
+ * NvSciStream Event Loop Driven Sample App - common block event handling
+ *
+ * Block types which do not require type-specific interactions make use of
+ * this common code.
+ */
+
+#include <stdlib.h>
+#if (QNX == 1)
+#include <sys/neutrino.h>
+#endif
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "nvscistream.h"
+#include "block_info.h"
+#include "event_loop.h"
+
+/* Indicates whether the thread handling the
+ * late consumer connections has been started
+ */
+static bool threadStarted = false;
+
+/* Delete common block */
+void deleteCommon(
+    void* data)
+{
+    BlockData* blockData = (BlockData*)data;
+
+    /* Destroy block */
+    if (blockData->block != 0) {
+        (void)NvSciStreamBlockDelete(blockData->block);
+    }
+
+    /* Check if it is late/re-attach usecase */
+    if (opts.numLateConsumer > 0U) {
+        /* Close the endpoints used by the IpcSrc/C2CSrc
+         * blocks for next late-/re-attach consumer connection
+         */
+        pthread_mutex_lock(&mutex);
+        if ((!strcmp(blockData->name, "IpcSrc")) ||
+            (!strcmp(blockData->name, "C2cSrc"))) {
+            for (uint32_t i=0; i< MAX_CONSUMERS; i++) {
+                if (ipcEP[i].ipcBlock == blockData->block) {
+                    /* close the Ipc endpoint */
+                    if (ipcEP[i].ipcEndpoint) {
+#if (QNX == 1)
+                        if (ipcEP[i].coid != 0) {
+                            (void)ConnectDetach_r(ipcEP[i].coid);
+                            ipcEP[i].coid = 0;
+                        }
+                        if (ipcEP[i].chid != 0) {
+                            (void)ChannelDestroy_r(ipcEP[i].chid);
+                            ipcEP[i].chid = 0;
+                        }
+#endif
+                        if (NvSciError_Success !=
+                            NvSciIpcCloseEndpointSafe(ipcEP[i].ipcEndpoint, false)) {
+                            printf("Failed to close ipc endpoint\n");
+                        }
+                        sleep(2);
+                        ipcEP[i].ipcEndpoint = 0U;
+                    }
+                    /* close the C2C endpoint */
+                    if (ipcEP[i].c2cEndpoint) {
+                        if (NvSciError_Success !=
+                            NvSciIpcCloseEndpointSafe(ipcEP[i].c2cEndpoint, false)) {
+                            printf("Failed to close C2C endpoint\n");
+                        }
+                        ipcEP[i].c2cEndpoint = 0U;
+                    }
+
+                    /* clear the information as this is needed
+                     * for the next late-/re-attach connection
+                     */
+                    ipcEP[i].ipcBlock = 0U;
+                    ipcEP[i].ipcConnected = false;
+                    ipcEP[i].c2cConnected = false;
+                    ipcEP[i].ipcOpened = false;
+                    ipcEP[i].c2cOpened = false;
+                    break;
+                }
+            }
+            /* Wakeup the thread to handle the next set of
+             * late-/re-attach consumer connections
+             */
+            pthread_cond_signal(&cond);
+        }
+        pthread_mutex_unlock(&mutex);
+    }
+
+    /* Free data */
+    free(blockData);
+}
+
+/* Handle events on a common block
+ *
+ * Blocks that don't require interaction after connection usually just
+ * receive notification of connection and disconnection.
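+ *
+ * Return convention used by this handler (matching the code below):
+ * 0 = no event within the timeout, 1 = event handled, 2 = final event
+ * (the block is cleaned up), negative = fatal error (the block is
+ * cleaned up).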
+ */ +int32_t handleCommon( + void* data, + uint32_t wait) +{ + /* Cast to common data */ + BlockData* blockData = (BlockData*)data; + + /* Get time to wait */ + int64_t waitTime = wait ? blockData->waitTime : 0; + + /* Query/wait for an event on the block */ + + NvSciStreamEventType event; + NvSciError err; + err = NvSciStreamBlockEventQuery(blockData->block, waitTime, &event); + + /* Handle errors */ + if (NvSciError_Success != err) { + + /* If not asked to wait, a timeout is not an error */ + if (!waitTime && (NvSciError_Timeout == err)) { + return 0; + } + + /* Otherwise, any error is considered fatal. A timeout probably + * indicates a failure to connect and complete setup in a timely + * fashion, so we specifically call out this case. + */ + if (NvSciError_Timeout == err) { + printf("%s timed out waiting for setup instructions\n", + blockData->name); + } else { + printf("%s event query failed with error %x\n", + blockData->name, err); + } + blockData->deleteFunc(blockData); + return -1; + } + + /* If we received an event, handle it based on its type */ + int32_t rv = 1; + NvSciError status; + switch (event) { + + /* + * Any event we don't explicitly handle is a fatal error + */ + default: + printf("%s received unknown event %x\n", + blockData->name, event); + + rv = -1; + break; + + /* + * Error events should never occur with safety-certified drivers, + * and are provided only in non-safety builds for debugging + * purposes. Even then, they should only occur when something + * fundamental goes wrong, like the system running out of memory, + * or stack/heap corruption, or a bug in NvSci which should be + * reported to NVIDIA. + */ + case NvSciStreamEventType_Error: + err = NvSciStreamBlockErrorGet(blockData->block, &status); + if (NvSciError_Success != err) { + printf("%s Failed to query the error event code %x\n", + blockData->name, err); + rv = -1; + } else { + if ((opts.numLateConsumer > 0U) && + (status == NvSciError_StreamNotConnected)) { + printf("[WARN] %s received error event: %x\n", + blockData->name, status); + rv = 2; + } else { + printf("%s received error event: %x\n", + blockData->name, status); + rv = -1; + } + } + break; + + /* + * If told to disconnect, it means either the stream finished its + * business or some other block had a failure. We'll just do a + * clean up and return without an error. + */ + case NvSciStreamEventType_Disconnected: + rv = 2; + break; + + /* + * The block doesn't have to do anything on connection, but now we may + * wait forever for any further events, so the timeout becomes infinite. + */ + case NvSciStreamEventType_Connected: + /* Query producer and consumer(s) endpoint info if needed */ + blockData->waitTime = -1; + break; + + /* All setup complete. Transition to runtime phase */ + case NvSciStreamEventType_SetupComplete: + if (opts.numLateConsumer > 0U) { + /* Check if it is multicast block */ + if (!strcmp(blockData->name, "Multicast")) { + /* Wakeup the thread to handle the next set + * of late-/re-attach consumer connections + */ + pthread_cond_signal(&cond); + if (!threadStarted) { + threadStarted = true; + /* Spawn a thread to handle the late attach connections */ + int32_t status = pthread_create(&dispatchThread, + NULL, + handleLateConsumerThreadFunc, + NULL); + if (status != 0) { + printf("Failed to spawn thread to monitor late consumer connections\n"); + /* Abort the process as this thread is important + * to process the late-/re-attach consumer connections. 
+ * Failed to create this thread makes the late/re-attach usecase + * unusable. + */ + abort(); + } + } + } + } + break; + } + + /* On failure or final event, clean up the block */ + if ((rv < 0) || (1 < rv)) { + blockData->deleteFunc(blockData); + } + + return rv; +} + +/* Create and register a new common block */ +BlockData* createCommon( + char const* name, + size_t size) +{ + /* If no size specified, just use BlockData */ + if (0 == size) { + size = sizeof(BlockData); + } + + /* Create a data structure to track the block's status */ + BlockData* commonData = (BlockData*)calloc(1, size); + if (NULL == commonData) { + printf("Failed to allocate data structure for %s\n", name); + return NULL; + } + + /* Save the name for debugging purposes */ + strcpy(commonData->name, name); + + /* Wait time for initial connection event will be 60 seconds */ + commonData->waitTime = 60 * 1000000; + + /* Use the common delete function */ + commonData->deleteFunc = deleteCommon; + + return commonData; +} diff --git a/event_sample_app/block_consumer_uc1.c b/event_sample_app/block_consumer_uc1.c new file mode 100644 index 0000000..0abf37b --- /dev/null +++ b/event_sample_app/block_consumer_uc1.c @@ -0,0 +1,1200 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */
+
+/*
+ * NvSciStream Event Loop Driven Sample App - consumer block for use case 1
+ *
+ * This implements the consumer for use case 1: cuda to cuda streaming
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "nvscibuf.h"
+#include "nvscisync.h"
+#include "nvscistream.h"
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "block_info.h"
+#include "event_loop.h"
+#include "usecase1.h"
+#include "util.h"
+
+/* Internal data structure used to track packets */
+typedef struct {
+    /* The packet handle used for NvSciStream functions */
+    NvSciStreamPacket handle;
+    /* NvSci buffer object for the packet's data buffer */
+    NvSciBufObj dataObj;
+    /* CUDA external memory handle for the data buffer */
+    cudaExternalMemory_t dataExtMem;
+    /* CUDA device memory pointer for the data buffer */
+    void* dataDevMem;
+    /* Local system memory buffer used as the target for CUDA operations */
+    uint8_t* dataDstMem;
+    /* NvSci buffer object for the packet's CRC buffer */
+    NvSciBufObj crcObj;
+    /* Virtual address for the CRC buffer */
+    uint8_t const* crcPtr;
+} ConsPacket;
+
+/* Internal data used by the consumer block */
+typedef struct {
+    /* Common block info */
+    BlockData common;
+    /* CUDA device ID and UUID */
+    int32_t cudaDeviceId;
+    CUuuid cudaUuid;
+
+    /* CUDA consumer stream */
+    cudaStream_t cudaStream;
+
+    /* CUDA sync attributes required for signaling */
+    NvSciSyncAttrList signalAttr;
+    /* CUDA sync attributes required for waiting */
+    NvSciSyncAttrList waiterAttr;
+    /* Sync object for CUDA to signal after processing data */
+    NvSciSyncObj signalObj;
+    /* CUDA semaphore mapped to sync object */
+    cudaExternalSemaphore_t signalSem;
+
+    /* Sync object to wait for before processing data */
+    NvSciSyncObj waiterObj;
+    /* CUDA semaphore mapped to sync object */
+    cudaExternalSemaphore_t waiterSem;
+
+    /* Element index chosen by pool for the CRC buffer */
+    uint32_t crcIndex;
+    /* Element index chosen by pool for the data buffer */
+    uint32_t dataIndex;
+    /* Size for data buffer after reconciling all requirements */
+    uint64_t dataSize;
+    /* Number of packets provided by pool */
+    uint32_t numPacket;
+    /* Information about each packet */
+    ConsPacket packets[MAX_PACKETS];
+
+    /* Number of payloads processed so far */
+    uint32_t counter;
+    /* Number of frames that the consumer is interested in receiving.
+     * This is mainly used for the late-/re-attach usecase, where a consumer
+     * can decide to receive a certain number of frames before disconnecting.
+     */
+    uint32_t frames;
+
+    /* Flag indicating consumer has finished receiving desired number of payloads */
+    bool finished;
+} ConsData;
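+
+/*
+ * Note on packet cookies (illustrative; matches the code below): the
+ * consumer reports a cookie for each packet to the pool, and every
+ * later event referring to that packet hands the cookie back. This app
+ * simply uses the ConsPacket pointer itself as the cookie:
+ *
+ *   NvSciStreamBlockPacketStatusSet(block, handle,
+ *                                   (NvSciStreamCookie)packet,
+ *                                   NvSciError_Success);
+ *   ...
+ *   ConsPacket* packet = (ConsPacket*)cookie;
+ */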
+
+/* Free up the packet resources */
+static void deletePacket(
+    ConsPacket* packet)
+{
+    if (packet != NULL) {
+        if (packet->handle != NvSciStreamPacket_Invalid) {
+            /* Free CUDA memory mapping */
+            (void)cudaFree(packet->dataDevMem);
+            if (packet->dataExtMem) {
+                (void)cudaDestroyExternalMemory(packet->dataExtMem);
+                packet->dataExtMem = 0;
+            }
+
+            if (packet->dataDstMem) {
+                free(packet->dataDstMem);
+                packet->dataDstMem = NULL;
+            }
+
+            /* Free buffer objects */
+            if (packet->dataObj) {
+                NvSciBufObjFree(packet->dataObj);
+                packet->dataObj = NULL;
+            }
+
+            if (packet->crcObj) {
+                NvSciBufObjFree(packet->crcObj);
+                packet->crcObj = NULL;
+            }
+        }
+
+        /* Clear out packet information */
+        memset(packet, 0, sizeof(ConsPacket));
+    }
+}
+
+/* Free up consumer block resources */
+static void deleteConsumer(
+    ConsData* consData)
+{
+    /* Destroy block */
+    if (consData->common.block != 0) {
+        (void)NvSciStreamBlockDelete(consData->common.block);
+        consData->common.block = 0;
+    }
+
+    /* Free the packet resources */
+    for (uint32_t i=0; i<consData->numPacket; i++) {
+        deletePacket(&consData->packets[i]);
+    }
+
+    /* Free the sync objects */
+    if (consData->waiterObj != NULL) {
+        (void)cudaDestroyExternalSemaphore(consData->waiterSem);
+        consData->waiterSem = 0;
+        NvSciSyncObjFree(consData->waiterObj);
+        consData->waiterObj = NULL;
+    }
+
+    if (consData->signalObj != NULL) {
+        (void)cudaDestroyExternalSemaphore(consData->signalSem);
+        consData->signalSem = 0;
+        NvSciSyncObjFree(consData->signalObj);
+        consData->signalObj = NULL;
+    }
+
+    /* Destroy CUDA stream */
+    (void)cudaStreamDestroy(consData->cudaStream);
+
+    /* Free data */
+    free(consData);
+}
+
+/* Handle query of basic stream info */
+static int32_t handleStreamInit(
+    ConsData* consData)
+{
+    if (opts.endInfo) {
+        /* Query endpoint info from producer */
+        uint32_t size = INFO_SIZE;
+        char info[INFO_SIZE] = {0};
+        NvSciError err = NvSciStreamBlockUserInfoGet(
+                            consData->common.block,
+                            NvSciStreamBlockType_Producer, 0U,
+                            ENDINFO_NAME_PROC,
+                            &size, &info);
+        if (NvSciError_Success == err) {
+            printf("Producer info: %s\n", info);
+        } else if (NvSciError_StreamInfoNotProvided == err) {
+            printf("Info not provided by the producer\n");
+        } else {
+            printf("Failed (%x) to query the producer info\n", err);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Handle initialization of CUDA resources for consumer */
+static int32_t handleConsumerInit(
+    ConsData* consData)
+{
+    int32_t cudaRtErr;
+    CUresult cudaErr;
+
+    /* Get stack limit */
+    size_t unused;
+    cudaRtErr = cudaDeviceGetLimit(&unused, cudaLimitStackSize);
+    if (cudaSuccess != cudaRtErr) {
+        printf("Failed (%d) to get CUDA device limit\n", cudaRtErr);
+        return 0;
+    }
+
+    /* Set CUDA device */
+    consData->cudaDeviceId = 0;
+    cudaRtErr = cudaSetDevice(consData->cudaDeviceId);
+    if (cudaSuccess != cudaRtErr) {
+        printf("Failed (%d) to set CUDA device\n", cudaRtErr);
+        return 0;
+    }
+
+    /* Get UUID for CUDA device */
+    cudaErr = cuDeviceGetUuid(&consData->cudaUuid, consData->cudaDeviceId);
+    if (CUDA_SUCCESS != cudaErr) {
+        printf("Failed (%d) to get CUDA UUID\n", cudaErr);
+        return 0;
+    }
+
+    /* Get CUDA streams to be used for asynchronous operation */
+    cudaRtErr = cudaStreamCreateWithFlags(&consData->cudaStream,
+                                          cudaStreamNonBlocking);
+    if (cudaSuccess != cudaRtErr) {
+        printf("Failed (%d) to create CUDA stream\n", cudaRtErr);
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
Handle setup of supported buffer attributes */ +static int32_t handleConsumerElemSupport( + ConsData* consData) +{ + /* + * Note: To illustrate that NvSciStream producer and consumer do + * not need to specify the same set of element types, or use + * the same order for element types, the producer for this + * use case sends the CRC attributes first, followed by the + * primary data, while the consumer uses the opposite order. + * Our pool implementation will end up using the producer + * ordering, but that is not required either. + */ + + NvSciError sciErr; + uint32_t bufName[2]; + NvSciBufAttrList bufAttrs[2]; + + /* + * Data buffer requires read access by CPU and the GPU of the cuda + * device, and uses a raw data buffer. (Size is specified by producer.) + */ + NvSciBufAttrValAccessPerm dataPerm = NvSciBufAccessPerm_Readonly; + uint8_t dataCpu = 1U; + NvSciRmGpuId dataGpu = { 0 }; + NvSciBufType dataBufType = NvSciBufType_RawBuffer; + memcpy(&dataGpu.bytes, &consData->cudaUuid.bytes, sizeof(dataGpu.bytes)); + NvSciBufAttrKeyValuePair dataKeyVals[] = { + { NvSciBufGeneralAttrKey_GpuId, &dataGpu, sizeof(dataGpu) }, + { NvSciBufGeneralAttrKey_Types, &dataBufType, sizeof(dataBufType) }, + { NvSciBufGeneralAttrKey_RequiredPerm, &dataPerm, sizeof(dataPerm) }, + { NvSciBufGeneralAttrKey_NeedCpuAccess, &dataCpu, sizeof(dataCpu) } + }; + + /* Create and fill attribute list for data buffer */ + bufName[0] = ELEMENT_NAME_DATA; + sciErr = NvSciBufAttrListCreate(sciBufModule, &bufAttrs[0]); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to create data attribute list\n", sciErr); + return 0; + } + sciErr = NvSciBufAttrListSetAttrs(bufAttrs[0], + dataKeyVals, + sizeof(dataKeyVals) / + sizeof(NvSciBufAttrKeyValuePair)); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to fill data attribute list\n", sciErr); + return 0; + } + + /* + * CRC buffer requires read access by CPU, and uses a raw 64 byte + * data buffer with 1 byte alignment. 
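+     * The producer writes its checksum here; after copying the data
+     * element, the consumer computes its own checksum and compares the
+     * two (see handleConsumerPayload below).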
+ */ + NvSciBufAttrValAccessPerm crcPerm = NvSciBufAccessPerm_Readonly; + uint8_t crcCpu = 1U; + NvSciBufType crcBufType = NvSciBufType_RawBuffer; + uint64_t crcSize = 64U; + uint64_t crcAlign = 1U; + NvSciBufAttrKeyValuePair crcKeyVals[] = { + { NvSciBufGeneralAttrKey_Types, &crcBufType, sizeof(crcBufType) }, + { NvSciBufRawBufferAttrKey_Size, &crcSize, sizeof(crcSize) }, + { NvSciBufRawBufferAttrKey_Align, &crcAlign, sizeof(crcAlign) }, + { NvSciBufGeneralAttrKey_RequiredPerm, &crcPerm, sizeof(crcPerm) }, + { NvSciBufGeneralAttrKey_NeedCpuAccess, &crcCpu, sizeof(crcCpu) } + }; + + /* Create and fill attribute list for CRC checksum buffer */ + bufName[1] = ELEMENT_NAME_CRC; + sciErr = NvSciBufAttrListCreate(sciBufModule, &bufAttrs[1]); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to create CRC attribute list\n", sciErr); + return 0; + } + sciErr = NvSciBufAttrListSetAttrs(bufAttrs[1], + crcKeyVals, + sizeof(crcKeyVals) / + sizeof(NvSciBufAttrKeyValuePair)); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to fill CRC attribute list\n", sciErr); + return 0; + } + + /* + * Inform stream of the attributes + * Once sent, the attribute lists are no longer needed + */ + for (uint32_t i=0; i<2U; ++i) { + sciErr = NvSciStreamBlockElementAttrSet(consData->common.block, + bufName[i], bufAttrs[i]); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to send element %d attribute\n", + sciErr, i); + return 0; + } + NvSciBufAttrListFree(bufAttrs[i]); + } + + /* Indicate that all element information has been exported */ + sciErr = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_ElementExport, + true); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to complete element export\n", sciErr); + return 0; + } + + return 1; +} + +/* Handle receipt of chosen element attributes */ +static int32_t handleConsumerElemSetting( + ConsData* consData) +{ + NvSciError err; + + /* + * This application does not need to query the element count, because we + * know it is always 2. But we do so anyways to show how it is done. + */ + uint32_t count; + err = NvSciStreamBlockElementCountGet(consData->common.block, + NvSciStreamBlockType_Pool, + &count); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to query element count\n", err); + return 0; + } + if (2U != count) { + printf("Consumer received unexpected element count (%d)\n", count); + return 0; + } + + /* Process all elements */ + for (uint32_t i=0U; i<2U; ++i) { + + /* Query element type and attributes */ + uint32_t type; + NvSciBufAttrList bufAttr; + err = NvSciStreamBlockElementAttrGet(consData->common.block, + NvSciStreamBlockType_Pool, i, + &type, &bufAttr); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to query element attr %d\n", err, i); + return 0; + } + + /* For data element, need to extract size and save index */ + if (ELEMENT_NAME_DATA == type) { + consData->dataIndex = i; + NvSciBufAttrKeyValuePair keyVals[] = { + { NvSciBufRawBufferAttrKey_Size, NULL, 0 } + }; + err = NvSciBufAttrListGetAttrs(bufAttr, keyVals, 1); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to obtain buffer size\n", err); + return 0; + } + consData->dataSize = *((const uint64_t*)(keyVals[0].value)); + + /* Set waiter attributes for the asynchronous element. 
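+             * The CRC element is synchronous and keeps the default NULL
+             * waiter attribute, so only the data element will carry fences.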
*/ + err = NvSciStreamBlockElementWaiterAttrSet(consData->common.block, + i, + consData->waiterAttr); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to send waiter attr for elem %d\n", + err, i); + return 0; + } + + /* Once sent, the waiting attributes are no longer needed */ + NvSciSyncAttrListFree(consData->waiterAttr); + consData->waiterAttr = NULL; + } + + /* For CRC element, just need to save the index */ + else if (ELEMENT_NAME_CRC == type) { + consData->crcIndex = i; + + /* CRC element is a synchronous element. + * No need to set the waiter attr, which is NULL by default. */ + } + + /* Report any unknown element */ + else { + printf("Consumer received unknown element type (%x)\n", type); + return 0; + } + + /* Don't need to keep attribute list */ + NvSciBufAttrListFree(bufAttr); + + /* + * Indicate element will be used. + * This is the default, and we can omit this call in most applications, + * but we illustrate its use for applications that only use some + * of the buffers. + */ + err = NvSciStreamBlockElementUsageSet(consData->common.block, i, true); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to indicate element %d is used\n", + err, i); + return 0; + } + } + + /* Indicate that element import is complete */ + err = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_ElementImport, + true); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to complete element import\n", err); + return 0; + } + + /* Indicate that waiter attribute export is done. */ + err = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_WaiterAttrExport, + true); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to complete waiter attr export\n", err); + return 0; + } + + return 1; +} + +/* Handle creation of a new packet */ +static int32_t handleConsumerPacketCreate( + ConsData* consData) +{ + NvSciError err; + + /* Retrieve handle for packet pending creation */ + NvSciStreamPacket handle; + err = NvSciStreamBlockPacketNewHandleGet(consData->common.block, + &handle); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to retrieve handle for the new packet\n", + err); + return 0; + } + + /* Make sure there is room for more packets */ + if (MAX_PACKETS <= consData->numPacket) { + printf("Consumer exceeded max packets\n"); + err = NvSciStreamBlockPacketStatusSet(consData->common.block, + handle, + NvSciStreamCookie_Invalid, + NvSciError_Overflow); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to send packet status\n", err); + } + return 0; + } + + /* + * Allocate the next entry in the array for the new packet. + * Use the array entry for the cookie + */ + ConsPacket* packet = &consData->packets[consData->numPacket++]; + packet->handle = handle; + + /* Retrieve all buffers and map into application + * Consumers can skip querying elements that they don't use. + * This use case has 2 elements. 
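+     * Each NvSciBufObj received here is owned by this application and is
+     * freed in deletePacket() when the packet is deleted.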
+ */ + for (uint32_t index = 0U; index < 2U; index++) { + NvSciBufObj bufObj; + err = NvSciStreamBlockPacketBufferGet(consData->common.block, + handle, + index, + &bufObj); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to retrieve buffer (%lx/%d)\n", + err, handle, index); + return 0; + } + + /* Handle mapping of data buffer */ + NvSciError sciErr; + int32_t cudaRtErr; + + if (index == consData->dataIndex) { + + /* Save buffer object */ + packet->dataObj = bufObj; + + /* Map in the buffer as CUDA external memory */ + struct cudaExternalMemoryHandleDesc memHandleDesc; + memset(&memHandleDesc, 0, sizeof(memHandleDesc)); + memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; + memHandleDesc.handle.nvSciBufObject = bufObj; + memHandleDesc.size = consData->dataSize; + cudaRtErr = cudaImportExternalMemory(&packet->dataExtMem, + &memHandleDesc); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to map buffer as external mem\n", + cudaRtErr); + return 0; + } + + /* Map in the buffer as CUDA device memory */ + struct cudaExternalMemoryBufferDesc memBufferDesc; + memset(&memBufferDesc, 0, sizeof(memBufferDesc)); + memBufferDesc.size = consData->dataSize; + memBufferDesc.offset = 0; + cudaRtErr = cudaExternalMemoryGetMappedBuffer(&packet->dataDevMem, + packet->dataExtMem, + &memBufferDesc); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to map buffer as device mem\n", + cudaRtErr); + return 0; + } + + /* Allocate normal memory to use as the target for the CUDA op */ + packet->dataDstMem = (uint8_t*)malloc(consData->dataSize); + if (NULL == packet->dataDstMem) { + printf("Consumer failed to allocate target buffer\n"); + return 0; + } + + /* Fill in with initial values */ + memset(packet->dataDstMem, 0xD0, consData->dataSize); + + } + + /* Handle mapping of CRC buffer */ + else if (index == consData->crcIndex) { + + /* Save buffer object */ + packet->crcObj = bufObj; + + /* Get a CPU pointer for the buffer from NvSci */ + sciErr = NvSciBufObjGetConstCpuPtr(bufObj, + (void const**)&packet->crcPtr); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to map CRC buffer\n", sciErr); + return 0; + } + + } + + /* Shouldn't be any other index */ + else { + printf("Consumer received buffer for unknown element (%d)\n", + index); + return 0; + } + } + + /* Inform pool of success. + * Note: Could inform the pool of any of the failures above. + */ + err = NvSciStreamBlockPacketStatusSet(consData->common.block, + handle, + (NvSciStreamCookie)packet, + NvSciError_Success); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to inform pool of packet status\n", err); + return 0; + } + + return 1; +} + +/* Handle deletion of packet */ +static void handleConsumerPacketDelete( + ConsData* consData) +{ + /* Get the deleted packet cookie*/ + NvSciStreamCookie cookie; + NvSciError err = + NvSciStreamBlockPacketOldCookieGet(consData->common.block, + &cookie); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to retrieve the deleted packet cookie\n", + err); + } + + /* Get packet pointer */ + ConsPacket* packet = (ConsPacket*)cookie; + + /* Free up the packet resources */ + deletePacket(packet); +} + +/* Handle setup of supported sync attributes */ +static int32_t handleConsumerSyncSupport( + ConsData* consData) +{ + NvSciError sciErr; + int32_t cudaRtErr; + + /* + * Create sync attribute list for signaling. 
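+     * (CUDA fills in the actual attribute values via
+     * cudaDeviceGetNvSciSyncAttributes below.)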
+ * This will be saved until we receive the producer's attributes + */ + sciErr = NvSciSyncAttrListCreate(sciSyncModule, &consData->signalAttr); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to allocate signal sync attrs\n", sciErr); + return 0; + } + + /* Have CUDA fill the signaling attribute list */ + cudaRtErr = cudaDeviceGetNvSciSyncAttributes(consData->signalAttr, + consData->cudaDeviceId, + cudaNvSciSyncAttrSignal); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to fill signal sync attrs\n", cudaRtErr); + return 0; + } + + /* Create sync attribute list for waiting. */ + sciErr = NvSciSyncAttrListCreate(sciSyncModule, &consData->waiterAttr); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to allocate waiter sync attrs\n", sciErr); + return 0; + } + + /* Have CUDA fill the waiting attribute list */ + cudaRtErr = cudaDeviceGetNvSciSyncAttributes(consData->waiterAttr, + consData->cudaDeviceId, + cudaNvSciSyncAttrWait); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to fill waiter sync attrs\n", cudaRtErr); + return 0; + } + + return 1; +} + +/* Handle creation and export of consumer sync object */ +static int32_t handleConsumerSyncExport( + ConsData* consData) +{ + NvSciError sciErr; + uint32_t cudaRtErr; + + /* Process waiter attrs from all elements. + * As CRC element is a synchronous element, + * no need to query the sync object for it. + */ + NvSciSyncAttrList waiterAttr = NULL; + sciErr = NvSciStreamBlockElementWaiterAttrGet(consData->common.block, + consData->dataIndex, + &waiterAttr); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to query waiter attr\n", + sciErr); + return 0; + } + if (NULL == waiterAttr) { + printf("Consumer received NULL waiter attr for data elem\n"); + return 0; + } + + /* Indicate that waiter attribute import is done. */ + sciErr = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_WaiterAttrImport, + true); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to complete waiter attr import\n", + sciErr); + return 0; + } + + /* + * Merge and reconcile producer sync attrs with ours. + */ + NvSciSyncAttrList unreconciled[2] = { + consData->signalAttr, + waiterAttr }; + NvSciSyncAttrList reconciled = NULL; + NvSciSyncAttrList conflicts = NULL; + sciErr = NvSciSyncAttrListReconcile(unreconciled, 2, + &reconciled, &conflicts); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to reconcile sync attributes\n", sciErr); + return 0; + } + + /* Allocate sync object */ + sciErr = NvSciSyncObjAlloc(reconciled, &consData->signalObj); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to allocate sync object\n", sciErr); + return 0; + } + + /* Free the attribute lists */ + NvSciSyncAttrListFree(consData->signalAttr); + consData->signalAttr = NULL; + NvSciSyncAttrListFree(waiterAttr); + NvSciSyncAttrListFree(reconciled); + + /* Create CUDA semaphore for sync object */ + struct cudaExternalSemaphoreHandleDesc extSemDesc; + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = consData->signalObj; + cudaRtErr = cudaImportExternalSemaphore(&consData->signalSem, + &extSemDesc); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to map signal object to semaphore\n", + cudaRtErr); + return 0; + } + + /* Only send the sync object for the asynchronous element. 
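+     * Here that is the data element, identified by the dataIndex saved
+     * during element import.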
+ * If this function is not called for an element, + * the sync object is assumed to be NULL. + * In this use case, CRC element doesn't use sync object. + */ + sciErr = NvSciStreamBlockElementSignalObjSet(consData->common.block, + consData->dataIndex, + consData->signalObj); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to send sync object\n", sciErr); + return 0; + } + + /* Indicate that sync object export is complete */ + sciErr = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_SignalObjExport, + true); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to complete signal obj export\n", + sciErr); + return 0; + } + + return 1; +} + +/* Handle import of producer sync object */ +static int32_t handleConsumerSyncImport( + ConsData* consData) +{ + uint32_t cudaRtErr; + NvSciError sciErr; + NvSciSyncObj waiterObj = NULL; + /* Query sync object for asynchronous elements. */ + sciErr = NvSciStreamBlockElementSignalObjGet(consData->common.block, + 0U, consData->dataIndex, + &waiterObj); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to query sync object\n", sciErr); + return 0; + } + + /* Save object */ + consData->waiterObj = waiterObj; + + /* If the waiter sync obj is NULL, + * it means this element is ready to use when received. + */ + if (NULL != waiterObj) { + /* Create CUDA semaphore for sync object */ + struct cudaExternalSemaphoreHandleDesc extSemDesc; + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = waiterObj; + cudaRtErr = cudaImportExternalSemaphore(&consData->waiterSem, + &extSemDesc); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to map waiter object to semaphore\n", + cudaRtErr); + return 0; + } + } + + /* Indicate that element import is complete */ + sciErr = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_SignalObjImport, + true); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to complete signal obj import\n", + sciErr); + return 0; + } + + return 1; +} + +/* Handle processing of payloads */ +static int32_t handleConsumerPayload( + ConsData* consData) +{ + NvSciError sciErr; + int32_t cudaRtErr; + + /* Obtain packet with the new payload */ + NvSciStreamCookie cookie; + sciErr = NvSciStreamConsumerPacketAcquire(consData->common.block, + &cookie); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to obtain packet for payload\n", sciErr); + return 0; + } + ConsPacket* packet = (ConsPacket*)cookie; + consData->counter++; + + /* Extract CRC value from the packet */ + uint32_t crc = *((uint32_t const*)(packet->crcPtr)); + + /* If the received waiter obj if NULL, + * the producer is done writing data into this element, + * skip waiting on pre-fence. 
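+     * (A consumer that processed data on the CPU rather than with CUDA
+     * would wait on the fence directly instead; a minimal sketch,
+     * assuming a CPU wait context was allocated during setup as the
+     * producer in this app does:
+     *
+     *     NvSciSyncFenceWait(&prefence, cpuWaitContext, -1);
+     *     NvSciSyncFenceClear(&prefence);
+     *
+     * This consumer maps the fence to a CUDA semaphore below.)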
+ */ + if (NULL != consData->waiterObj) { + + /* Query fences from producer for data element */ + NvSciSyncFence prefence = NvSciSyncFenceInitializer; + sciErr = NvSciStreamBlockPacketFenceGet( + consData->common.block, + packet->handle, + 0U, consData->dataIndex, + &prefence); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to query fence from producer\n", + sciErr); + return 0; + } + + /* Instruct CUDA to wait for the producer fence */ + struct cudaExternalSemaphoreWaitParams waitParams; + memset(&waitParams, 0, sizeof(waitParams)); + waitParams.params.nvSciSync.fence = &prefence; + waitParams.flags = 0; + cudaRtErr = cudaWaitExternalSemaphoresAsync(&consData->waiterSem, + &waitParams, 1, + consData->cudaStream); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to wait for prefence\n", cudaRtErr); + return 0; + } + NvSciSyncFenceClear(&prefence); + } + + /* Instruct CUDA to copy the packet data buffer to the target buffer */ + cudaRtErr = cudaMemcpy2DAsync(packet->dataDstMem, + consData->dataSize, + packet->dataDevMem, + consData->dataSize, + consData->dataSize, + 1, + cudaMemcpyDeviceToHost, + consData->cudaStream); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to issue copy command\n", cudaRtErr); + return 0; + } + + /* Inform CUDA to signal a fence when the copy completes */ + NvSciSyncFence postfence = NvSciSyncFenceInitializer; + struct cudaExternalSemaphoreSignalParams signalParams; + memset(&signalParams, 0, sizeof(signalParams)); + signalParams.params.nvSciSync.fence = &postfence; + signalParams.flags = 0; + cudaRtErr = cudaSignalExternalSemaphoresAsync(&consData->signalSem, + &signalParams, + 1, + consData->cudaStream); + if (cudaSuccess != cudaRtErr) { + printf("Consumer failed (%d) to signal postfence\n", cudaRtErr); + return 0; + } + + /* Update postfence for this element */ + sciErr = NvSciStreamBlockPacketFenceSet(consData->common.block, + packet->handle, + consData->dataIndex, + &postfence); + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to set fence\n", sciErr); + return 0; + } + + /* Release the packet back to the producer */ + sciErr = NvSciStreamConsumerPacketRelease(consData->common.block, + packet->handle); + + if (NvSciError_Success != sciErr) { + printf("Consumer failed (%x) to release packet\n", sciErr); + return 0; + } + NvSciSyncFenceClear(&postfence); + + /* + * Wait for the copy to finish, generate a checkum and, compare + * with the value from the packet + */ + cudaStreamSynchronize(consData->cudaStream); + if (crc != generateCRC(packet->dataDstMem, + 1, + consData->dataSize, + consData->dataSize)) { + printf("Consumer CRC does not match the one from producer\n"); + } + + if (consData->frames == consData->counter) { + consData->finished = 1; + } + return 1; +} + +/* Handle events on a consumer block + * + * The consumer block informs the stream of the consumers buffer and + * synchronization requirements and capabilities, creates signaling + * synchronization objects and receives synchronization objects to + * wait for, maps buffers and synchronization objects to the consumer + * engine(s), and processes data. + */ +static int32_t handleConsumer( + void* data, + uint32_t wait) +{ + /* Cast to consumer data */ + ConsData* consData = (ConsData*)data; + + /* Get time to wait */ + int64_t waitTime = wait ? 
consData->common.waitTime : 0; + + /* Query/wait for an event on the block */ + NvSciStreamEventType event; + NvSciError err; + err = NvSciStreamBlockEventQuery(consData->common.block, waitTime, &event); + + /* Handle errors */ + if (NvSciError_Success != err) { + + /* If not asked to wait, a timeout is not an error */ + if (!waitTime && (NvSciError_Timeout == err)) { + return 0; + } + + /* Otherwise, any error is considered fatal. A timeout probably + * indicates a failure to connect and complete setup in a timely + * fashion, so we specifically call out this case. + */ + if (NvSciError_Timeout == err) { + printf("Consumer timed out waiting for setup instructions\n"); + } else { + printf("Consumer event query failed with error %x\n", err); + } + deleteConsumer(consData); + return -1; + } + + /* If we received an event, handle it based on its type + * + * Note that there's a lot of error checking we could choose to do for + * some of these events, like making sure that we only receive each + * event once for a given entry. But NvSciStream is expected to take + * care of all of that, even when the application makes a mistake. + * So we only check for things that don't trigger NvSciStream errors. + */ + int32_t rv = 1; + NvSciError status; + switch (event) { + + /* + * Any event we don't explicitly handle is a fatal error + */ + default: + printf("Consumer received unknown event %x\n", event); + + rv = -1; + break; + + /* + * Error events should never occur with safety-certified drivers, + * and are provided only in non-safety builds for debugging + * purposes. Even then, they should only occur when something + * fundamental goes wrong, like the system running out of memory, + * or stack/heap corruption, or a bug in NvSci which should be + * reported to NVIDIA. + */ + case NvSciStreamEventType_Error: + err = NvSciStreamBlockErrorGet(consData->common.block, &status); + if (NvSciError_Success != err) { + printf("%s Failed to query the error event code %x\n", + consData->common.name, err); + } else { + printf("%s received error event: %x\n", + consData->common.name, status); + } + + rv = -1; + break; + + /* + * If told to disconnect, it means either the stream finished its + * business or some other block had a failure. We'll just do a + * clean up and return without an error. + */ + case NvSciStreamEventType_Disconnected: + printf("Consumer disconnected after receiving %d payloads\n", + consData->counter); + rv = 2; + break; + + /* + * On connection, the consumer should initialize the appopriate engine(s) + * and obtain the necessary buffer and synchronization attribute lists + * for the desired use case. 
+ */ + case NvSciStreamEventType_Connected: + /* Initialize basic stream info */ + if (!handleStreamInit(consData)) { + rv = -1; + } + /* Initialize CUDA access */ + if (!handleConsumerInit(consData)) { + rv = -1; + } + /* Determine supported buffer attributes */ + else if (!handleConsumerElemSupport(consData)) { + rv = -1; + } + /* Determined supported sync attributes */ + else if (!handleConsumerSyncSupport(consData)) { + rv = -1; + } + + /* Now that we're fully connected, set the wait time to infinite */ + consData->common.waitTime = -1; + break; + + /* Retrieve all element information from pool */ + case NvSciStreamEventType_Elements: + if (!handleConsumerElemSetting(consData)) { + rv = -1; + } + break; + + /* For a packet, set up an entry in the array */ + case NvSciStreamEventType_PacketCreate: + if (!handleConsumerPacketCreate(consData)) { + rv = -1; + } + break; + + /* Finish any setup related to packet resources */ + case NvSciStreamEventType_PacketsComplete: + /* For this use case, nothing else to setup. + * Inform the NvSciStream that the consumer has imported all packets. + */ + err = NvSciStreamBlockSetupStatusSet(consData->common.block, + NvSciStreamSetup_PacketImport, + true); + if (NvSciError_Success != err) { + printf("Consumer failed (%x) to complete packet import\n", err); + rv = -1; + } + break; + + /* Delete a packet - usually only relevant for non-safety applications */ + case NvSciStreamEventType_PacketDelete: + handleConsumerPacketDelete(consData); + break; + + case NvSciStreamEventType_WaiterAttr: + if (!handleConsumerSyncExport(consData)) { + rv = -1; + } + break; + + /* Import producer sync objects for all elements */ + case NvSciStreamEventType_SignalObj: + if (!handleConsumerSyncImport(consData)) { + rv = -1; + } + break; + + /* All setup complete. Transition to runtime phase */ + case NvSciStreamEventType_SetupComplete: + printf("Consumer setup completed\n"); + break; + + /* Processs payloads when packets arrive */ + case NvSciStreamEventType_PacketReady: + if (!handleConsumerPayload(consData)) { + rv = -1; + } else if (consData->finished) { + printf("Consumer finished receiving %d payloads\n", consData->counter); + rv = 2; + } + break; + } + + /* On failure or final event, clean up the block */ + if ((rv < 0) || (1 < rv)) { + deleteConsumer(consData); + } + + return rv; +} + +/* Create and register a new consumer block */ +int32_t createConsumer_Usecase1( + NvSciStreamBlock* consumer, + NvSciStreamBlock pool, + uint32_t index, + uint32_t frames) +{ + /* + * The index is ignored. It is provided to support use cases where + * there are multiple consumers that don't all do the same thing. + */ + (void)index; + + /* Create a data structure to track the block's status */ + ConsData* consData = (ConsData*)calloc(1, sizeof(ConsData)); + if (NULL == consData) { + printf("Failed to allocate data structure for consumer\n"); + return 0; + } + + /* Save the name for debugging purposes */ + strcpy(consData->common.name, "Consumer"); + + /* Wait time for initial connection event will be 60 seconds */ + consData->common.waitTime = 60 * 1000000; + consData->frames = frames; + + /* Create a pool block */ + NvSciError err = + NvSciStreamConsumerCreate(pool, &consData->common.block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create consumer block\n", err); + deleteConsumer(consData); + return 0; + } + + if (opts.endInfo) { + /* Add endpoint information on consumer side. 
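+         * (The peer can query this info after connection; a sketch of
+         * the query side, as the producer in this app does in its
+         * handleStreamInit():
+         *
+         *     uint32_t size = INFO_SIZE;
+         *     char info[INFO_SIZE] = {0};
+         *     NvSciStreamBlockUserInfoGet(block,
+         *                                 NvSciStreamBlockType_Consumer,
+         *                                 0U, ENDINFO_NAME_PROC,
+         *                                 &size, info);)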
+ * Application can specify user-defined info to help set up stream, + * which can be queried by other blocks after stream connection. + */ + char info[INFO_SIZE] = {0}; + size_t infoSize = + snprintf(info, INFO_SIZE, "%s%d", "Consumer proc: ", getpid()); + err = NvSciStreamBlockUserInfoSet(consData->common.block, + ENDINFO_NAME_PROC, + infoSize, info); + if (NvSciError_Success != err) { + printf("Failed (%x) to setup the consumer info\n", err); + deleteConsumer(consData); + return 0; + } + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(consData->common.block, consData, handleConsumer)) { + deleteConsumer(consData); + return 0; + } + + *consumer = consData->common.block; + return 1; +} diff --git a/event_sample_app/block_info.h b/event_sample_app/block_info.h new file mode 100644 index 0000000..323725a --- /dev/null +++ b/event_sample_app/block_info.h @@ -0,0 +1,284 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - block abstraction + */ + +#ifndef _BLOCK_INFO_H +#define _BLOCK_INFO_H 1 +#include +#include +#include "nvscistream.h" + +/* Maximum number of consumers */ +#define MAX_CONSUMERS 4 + +/* Maximum number of blocks */ +#define MAX_BLOCKS 100 + +/* Maximum number of packets supported */ +#define MAX_PACKETS 32 + +/* Maximum number of elements supported */ +#define MAX_ELEMS 8 + +/* Memory size of endpoint inforamtion */ +#define INFO_SIZE 50 + +/* NvSci modules for all objects */ +extern NvSciSyncModule sciSyncModule; +extern NvSciBufModule sciBufModule; + +/* Flag used to terminate the thread that +* was spawned to handle the late/re-attached +* consumer connections upon stream disconnect +*/ +extern atomic_int streamDone; + +/* Number of registered blocks for streaming */ +extern int32_t numBlocks; + +/* Number of active blocks */ +extern uint32_t numAlive; + +/* variables for synchronization */ +extern pthread_mutex_t mutex; +extern pthread_cond_t cond; + +/* Thread for handling late/re-attached consumer +* connections +*/ +extern pthread_t dispatchThread; + +/* Common options for all blocks */ +typedef struct { + /* Indicate whether the producer/consumer sets endpoint info */ + bool endInfo; + /* Indicate whether the producer uses yuv format */ + bool yuv; + /* Indicate whether the extern event service is used */ + bool useExtEventService; + /* Indicates the number of late consumers for late/re-attach usecase */ + uint32_t numLateConsumer; + /* Total number of consumers */ + uint32_t numConsumer; + /* Indicates c2c usecase */ + bool c2cMode; + /* Indicates consumer connection is late/reattach*/ + bool lateAttach; +} CommonOptions; + +extern CommonOptions opts; + +/* Endpoint data structure for tracking the +* IPC/C2C channels +*/ +typedef struct { + /* Holds the IPC endpoint corresponding to an IPC channel */ + NvSciIpcEndpoint ipcEndpoint; + /* Holds the C2C endpoint corresponding to an C2C channel */ + NvSciIpcEndpoint c2cEndpoint; + /* named IPC channel */ + char 
ipcChannel[32]; + /* named IPC channel used for handsking between + * producer and late/re-attached consumer connection + */ + char ipcChannelForHandshake[32]; + /* named c2c channel */ + char c2cChannel[32]; + /* IPC/C2C block created for handling the late/re-attached + * consumer connections*/ + NvSciStreamBlock ipcBlock; + /* Queue block that is needed for c2c usecase for a c2c + * consumer late/reattach connections + */ + NvSciStreamBlock queue; + /* ReturnSync block that is needed for c2c usecase for a c2c + * consumer late/reattach connections + */ + NvSciStreamBlock returnSync; + + /* Indicates the connect state of IPC channel */ + bool ipcConnected; + /* Indicates the connect state of C2C channel */ + bool c2cConnected; + /* Indicates the Open state of IPC channel */ + bool ipcOpened; + /* Indicates the Open state of C2C channel */ + bool c2cOpened; + /* QNX channel ID for communication */ + int32_t chid; + /* QNX channel connection ID */ + int32_t coid; +} Endpoint; + +extern Endpoint ipcEP[MAX_CONSUMERS]; + +typedef struct { + /* The application's name for the element */ + uint32_t userName; + /* Attribute list for element */ + NvSciBufAttrList attrList; +} ElemAttr; + +/* + * Some block types that do not require direct interaction will share a + * common private data structure and event handling functon. + */ + +/* Common block private data */ +typedef struct { + NvSciStreamBlock block; + int64_t waitTime; + char name[32]; + void (*deleteFunc)(void*); +} BlockData; + +typedef int32_t (*BlockFunc)(void* data, uint32_t wait); + +/* Structure to track block info */ +typedef struct { + NvSciStreamBlock handle; + void* data; + BlockFunc func; + NvSciEventNotifier* notifier; + bool isAlive; + bool retry; +} BlockEventData; + +extern BlockEventData blocks[MAX_BLOCKS]; +extern BlockEventData* blocksAlive[MAX_BLOCKS]; + +/* Create data structure for common blocks */ +extern BlockData* createCommon( + char const* name, + size_t size); + +/* Handle event for common block */ +extern int32_t handleCommon( + void* data, + uint32_t wait); + +/* Delete common block */ +extern void deleteCommon( + void* data); + +/* + * Functions for setting up each kind of block + */ + +extern int32_t createIpcDst( + NvSciStreamBlock* ipcDst, + const char* channel, + bool useExternalEventService); + +extern int32_t createIpcSrc( + NvSciStreamBlock* ipcSrc, + const char* channel, + bool useExternalEventService); + +extern int32_t createIpcSrc2( + NvSciStreamBlock* ipcsrc, + NvSciIpcEndpoint endpoint, + bool useExtEventService); + +extern int32_t createC2cSrc2( + NvSciStreamBlock* c2cSrc, + NvSciIpcEndpoint endpoint, + NvSciStreamBlock queue); + +extern int32_t createC2cDst( + NvSciStreamBlock* c2cDst, + const char* channel, + NvSciStreamBlock pool); + +extern int32_t createC2cSrc( + NvSciStreamBlock* c2cSrc, + const char* channel, + NvSciStreamBlock queue); + +extern int32_t createLimiter( + NvSciStreamBlock* limiter, + uint32_t limit); + +extern int32_t createPresentSync( + NvSciStreamBlock* presentSync); + +extern int32_t createReturnSync( + NvSciStreamBlock* returnSync); + +extern int32_t createMulticast( + NvSciStreamBlock* multicast, + uint32_t numConsumer); + +extern int32_t createPool_Common( + NvSciStreamBlock* pool, + uint32_t numPacket, + bool isC2cPool); + +extern int32_t createQueue( + NvSciStreamBlock* queue, + uint32_t useMailbox); + +extern int32_t (*createProducer)( + NvSciStreamBlock* producer, + NvSciStreamBlock pool, + uint32_t numFrames); + +extern int32_t (*createConsumer)( + 
NvSciStreamBlock* consumer, + NvSciStreamBlock queue, + uint32_t index, + uint32_t frames); + +extern int32_t (createProducer_Usecase1)( + NvSciStreamBlock* producer, + NvSciStreamBlock pool, + uint32_t numFrames); + +extern int32_t (createConsumer_Usecase1)( + NvSciStreamBlock* consumer, + NvSciStreamBlock queue, + uint32_t index, + uint32_t frames); + +extern int32_t (createProducer_Usecase2)( + NvSciStreamBlock* producer, + NvSciStreamBlock pool, + uint32_t numFrames); + +extern int32_t (createConsumer_Usecase2)( + NvSciStreamBlock* consumer, + NvSciStreamBlock queue, + uint32_t index, + uint32_t frames); + +extern int32_t(createProducer_Usecase3)( + NvSciStreamBlock* producer, + NvSciStreamBlock pool, + uint32_t numFrames); + +extern int32_t(createConsumer_Usecase3)( + NvSciStreamBlock* consumer, + NvSciStreamBlock queue, + uint32_t index, + uint32_t frames); + +extern int32_t createPool_Usecase3( + NvSciStreamBlock* pool, + uint32_t numPacket, + bool isC2cPool); + +extern void* handleLateConsumerThreadFunc(void*); + +#endif // _BLOCK_INFO_H diff --git a/event_sample_app/block_ipc.c b/event_sample_app/block_ipc.c new file mode 100644 index 0000000..4def6e5 --- /dev/null +++ b/event_sample_app/block_ipc.c @@ -0,0 +1,199 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - ipc blocks + */ + +#include +#include +#include "nvsciipc.h" +#include "nvscistream.h" +#include "block_info.h" +#include "event_loop.h" + +/* + * Ipc blocks do not require any block-specific interactions so we + * use the set of common functions to handle its events. However + * they do have an additional data field which needs to be cleaned + * up when the block is destroyed, so we use more than the common + * data structure and delete function. 
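+ *
+ * Typical usage, with a hypothetical NvSciIpc channel name taken from
+ * the platform's IPC configuration:
+ *
+ *     NvSciStreamBlock ipcSrc;
+ *     if (!createIpcSrc(&ipcSrc, "nvscistream_0", false)) {
+ *         /* handle failure */
+ *     }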
+ */ + +/* Create and register a new ipcsrc block */ +int32_t createIpcSrc( + NvSciStreamBlock* ipcsrc, + const char* channel, + bool useExtEventService) +{ + NvSciError err; + uint32_t i; + uint32_t slot = 0; + + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("IpcSrc", 0); + if (NULL == blockData) { + return 0; + } + + for (i=0; i< MAX_CONSUMERS; i++) { + if (!strcmp(ipcEP[i].ipcChannel, channel)) { + slot = i; + break; + } + } + + /* Open the named channel */ + err = NvSciIpcOpenEndpoint(channel, &ipcEP[slot].ipcEndpoint); + if (NvSciError_Success != err) { + printf("Failed (%x) to open channel (%s) for IpcSrc\n", + err, channel); + deleteCommon(blockData); + return 0; + } + + err = NvSciIpcResetEndpointSafe(ipcEP[slot].ipcEndpoint); + if (NvSciError_Success != err) { + printf("Failed (%x) to reset IPC endpoint", err); + } + + /* Create a ipcsrc block */ + err = NvSciStreamIpcSrcCreate(ipcEP[slot].ipcEndpoint, + sciSyncModule, + sciBufModule, + &blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create IpcSrc block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + /* Configuate block to use the external event service for + * internal I/O messages */ + if (useExtEventService && !eventFuncs->regInt(blockData->block)) { + deleteCommon(blockData); + return 0; + } + + *ipcsrc = blockData->block; + ipcEP[slot].ipcBlock = blockData->block; + ipcEP[slot].ipcOpened = true; + ipcEP[slot].ipcConnected = true; + + return 1; +} + + +/* Create and register a new ipcsrc block */ +int32_t createIpcSrc2( + NvSciStreamBlock* ipcsrc, + NvSciIpcEndpoint endpoint, + bool useExtEventService) +{ + NvSciError err; + + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("IpcSrc", 0); + if (NULL == blockData) { + return 0; + } + + /* Create a ipcsrc block */ + err = NvSciStreamIpcSrcCreate(endpoint, + sciSyncModule, + sciBufModule, + &blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create IpcSrc block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + /* Configuate block to use the external event service for + * internal I/O messages */ + if (useExtEventService && !eventFuncs->regInt(blockData->block)) { + deleteCommon(blockData); + return 0; + } + + *ipcsrc = blockData->block; + + return 1; +} + +/* Create and register a new ipcdst block */ +int32_t createIpcDst( + NvSciStreamBlock* ipcdst, + const char* channel, + bool useExtEventService) +{ + NvSciError err; + + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("IpcDst", 0); + if (NULL == blockData) { + return 0; + } + + /* Open the named channel */ + err = NvSciIpcOpenEndpoint(channel, &ipcEP[0].ipcEndpoint); + if (NvSciError_Success != err) { + printf("Failed (%x) to open channel (%s) for IpcDst\n", + err, channel); + deleteCommon(blockData); + return 0; + } + + err = NvSciIpcResetEndpointSafe(ipcEP[0].ipcEndpoint); + if (NvSciError_Success != err) { + printf("Failed (%x) to reset IPC endpoint", err); + } + + /* Create a ipcdst block */ + err = NvSciStreamIpcDstCreate(ipcEP[0].ipcEndpoint, + sciSyncModule, + sciBufModule, + 
&blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create IpcDst block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + /* Configuate block to use the external event service for + * internal I/O messages */ + if (useExtEventService && !eventFuncs->regInt(blockData->block)) { + deleteCommon(blockData); + return 0; + } + + *ipcdst = blockData->block; + return 1; +} diff --git a/event_sample_app/block_limiter.c b/event_sample_app/block_limiter.c new file mode 100644 index 0000000..63d0c5f --- /dev/null +++ b/event_sample_app/block_limiter.c @@ -0,0 +1,56 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - limiter block + */ + +#include +#include +#include "nvscistream.h" +#include "block_info.h" +#include "event_loop.h" + +/* + * Limiter blocks do not require any block-specific interactions so we + * use the set of common functions to handle its events. + */ + +/* Create and register a new limiter block */ +int32_t createLimiter( + NvSciStreamBlock* limiter, + uint32_t limit) +{ + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("Limiter", 0); + if (NULL == blockData) { + return 0; + } + + /* Create a limiter block */ + NvSciError err = + NvSciStreamLimiterCreate(limit, &blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create limiter block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + *limiter = blockData->block; + return 1; +} diff --git a/event_sample_app/block_multicast.c b/event_sample_app/block_multicast.c new file mode 100644 index 0000000..7b59596 --- /dev/null +++ b/event_sample_app/block_multicast.c @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - multicast block + */ + +#include +#include +#include "nvscistream.h" +#include "block_info.h" +#include "event_loop.h" + +/* + * Multicast blocks do not require any block-specific interactions so we + * use the set of common functions to handle its events. 
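+ *
+ * Usage sketch (block handles hypothetical): one producer branch fans
+ * out to two consumer branches:
+ *
+ *     NvSciStreamBlock multicast;
+ *     if (createMulticast(&multicast, 2U)) {
+ *         NvSciStreamBlockConnect(producer, multicast);
+ *         NvSciStreamBlockConnect(multicast, queue0);
+ *         NvSciStreamBlockConnect(multicast, queue1);
+ *     }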
+ */ + +/* Create and register a new multicast block */ +int32_t createMulticast( + NvSciStreamBlock* multicast, + uint32_t numConsumer) +{ + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("Multicast", 0); + if (NULL == blockData) { + return 0; + } + + /* Create a multicast block */ + NvSciError err = + NvSciStreamMulticastCreate(numConsumer, &blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create limiter block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + *multicast = blockData->block; + return 1; +} + diff --git a/event_sample_app/block_pool.c b/event_sample_app/block_pool.c new file mode 100644 index 0000000..dd680bb --- /dev/null +++ b/event_sample_app/block_pool.c @@ -0,0 +1,743 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - pool block + */ + +#include +#include +#include "nvscistream.h" +#include "block_info.h" +#include "event_loop.h" + +/* Internal data used by the pool block */ +typedef struct { + BlockData common; + bool isC2cPool; + uint32_t numConsumers; + uint32_t numProdElem; + uint32_t numConsElem; + bool elementsDone; + ElemAttr prodElem[MAX_ELEMS]; + ElemAttr consElem[MAX_ELEMS]; + uint32_t numPacket; + uint32_t numPacketReady; + bool packetsDone; + NvSciStreamPacket packet[MAX_PACKETS]; +} PoolData; + +/* Free up pool block resources */ +static void deletePool( + PoolData* poolData) +{ + /* Destroy block */ + if (poolData->common.block != 0) { + (void)NvSciStreamBlockDelete(poolData->common.block); + } + + /* Free data */ + free(poolData); +} + +/* Handle query of basic stream info */ +static int32_t handleStreamInit( + PoolData* poolData) +{ + /* Query number of consumers */ + NvSciError err = + NvSciStreamBlockConsumerCountGet(poolData->common.block, + &poolData->numConsumers); + if (NvSciError_Success != err) { + printf("Pool failed (%x) to query number of consumers", err); + return 0; + } + + /* Query producer and consumer(s) endpoint info if needed */ + + return 1; +} + +/* + * If ready, handle setup of the pool buffers. + * + * Most of the work the pool application has to do resides in this function. 
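+ *
+ * The sequence is: import element counts and attributes from both
+ * endpoints, match producer elements to consumer elements by name,
+ * reconcile each matched pair with NvSciBufAttrListReconcile(), export
+ * the chosen elements, then allocate each packet's buffers with
+ * NvSciBufObjAlloc() and send them downstream.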
+ */
+static int32_t handlePoolBufferSetup(
+    PoolData* poolData)
+{
+    NvSciError err;
+
+    /* Query producer element count */
+    err = NvSciStreamBlockElementCountGet(poolData->common.block,
+                                          NvSciStreamBlockType_Producer,
+                                          &poolData->numProdElem);
+    if (NvSciError_Success != err) {
+        printf("Pool failed (%x) to query producer element count\n", err);
+        return 0;
+    }
+
+    /* Query consumer element count */
+    err = NvSciStreamBlockElementCountGet(poolData->common.block,
+                                          NvSciStreamBlockType_Consumer,
+                                          &poolData->numConsElem);
+    if (NvSciError_Success != err) {
+        printf("Pool failed (%x) to query consumer element count\n", err);
+        return 0;
+    }
+
+    /* Query all producer elements */
+    for (uint32_t i=0U; i<poolData->numProdElem; ++i) {
+        err = NvSciStreamBlockElementAttrGet(poolData->common.block,
+                                             NvSciStreamBlockType_Producer, i,
+                                             &poolData->prodElem[i].userName,
+                                             &poolData->prodElem[i].attrList);
+        if (NvSciError_Success != err) {
+            printf("Pool failed (%x) to query producer element %d\n", err, i);
+            return 0;
+        }
+    }
+
+    /* Query all consumer elements */
+    for (uint32_t i=0U; i<poolData->numConsElem; ++i) {
+        err = NvSciStreamBlockElementAttrGet(poolData->common.block,
+                                             NvSciStreamBlockType_Consumer, i,
+                                             &poolData->consElem[i].userName,
+                                             &poolData->consElem[i].attrList);
+        if (NvSciError_Success != err) {
+            printf("Pool failed (%x) to query consumer element %d\n", err, i);
+            return 0;
+        }
+    }
+
+    /* Indicate that all element information has been imported */
+    poolData->elementsDone = true;
+    err = NvSciStreamBlockSetupStatusSet(poolData->common.block,
+                                         NvSciStreamSetup_ElementImport,
+                                         true);
+    if (NvSciError_Success != err) {
+        printf("Pool failed (%x) to complete element import\n", err);
+        return 0;
+    }
+
+    /*
+     * Go through requested elements from producer and consumer and line
+     * them up. A general streaming application might not have a
+     * one-to-one correspondence, and the pool may have to decide what
+     * subset of elements to select based on knowledge of the data types
+     * that the application suite supports. This sample application is
+     * much simpler, but we still go through the process rather than
+     * assuming producer and consumer have requested the same things in
+     * the same order.
+     */
+    uint32_t numElem = 0, p, c, e, i;
+    ElemAttr elem[MAX_ELEMS];
+    for (p=0; p<poolData->numProdElem; ++p) {
+        ElemAttr* prodElem = &poolData->prodElem[p];
+        for (c=0; c<poolData->numConsElem; ++c) {
+            ElemAttr* consElem = &poolData->consElem[c];
+
+            /* If requested element types match, combine the entries */
+            if (prodElem->userName == consElem->userName) {
+                ElemAttr* poolElem = &elem[numElem++];
+                poolElem->userName = prodElem->userName;
+                poolElem->attrList = NULL;
+
+                /* Combine and reconcile the attribute lists */
+                NvSciBufAttrList oldAttrList[2] = { prodElem->attrList,
+                                                    consElem->attrList };
+                NvSciBufAttrList conflicts = NULL;
+                err = NvSciBufAttrListReconcile(oldAttrList, 2,
+                                                &poolElem->attrList,
+                                                &conflicts);
+
+                /* Discard any conflict list.
+                 * (Could report its contents for additional debug info)
+                 */
+                if (NULL != conflicts) {
+                    NvSciBufAttrListFree(conflicts);
+                }
+
+                /* Abort on error */
+                if (NvSciError_Success != err) {
+                    printf("Failed to reconcile element %x attrs (%x)\n",
+                           poolElem->userName, err);
+                    return 0;
+                }
+
+                /* Found a match for this producer element so move on */
+                break;
+            } /* if match */
+        } /* for all requested consumer elements */
+    } /* for all requested producer elements */
+
+    /* Should be at least one element */
+    if (0 == numElem) {
+        printf("Pool didn't find any common elements\n");
+        return 0;
+    }
+
+    /* The requested attribute lists are no longer needed, so discard them */
+    for (p=0; p<poolData->numProdElem; ++p) {
+        ElemAttr* prodElem = &poolData->prodElem[p];
+        if (NULL != prodElem->attrList) {
+            NvSciBufAttrListFree(prodElem->attrList);
+            prodElem->attrList = NULL;
+        }
+    }
+    for (c=0; c<poolData->numConsElem; ++c) {
+        ElemAttr* consElem = &poolData->consElem[c];
+        if (NULL != consElem->attrList) {
+            NvSciBufAttrListFree(consElem->attrList);
+            consElem->attrList = NULL;
+        }
+    }
+
+    /* Inform the stream of the chosen elements */
+    for (e=0; e<numElem; ++e) {
+        ElemAttr* poolElem = &elem[e];
+        err = NvSciStreamBlockElementAttrSet(poolData->common.block,
+                                             poolElem->userName,
+                                             poolElem->attrList);
+        if (NvSciError_Success != err) {
+            printf("Pool failed (%x) to send element %d info\n", err, e);
+            return 0;
+        }
+    }
+
+    /* Indicate that all element information has been exported */
+    err = NvSciStreamBlockSetupStatusSet(poolData->common.block,
+                                         NvSciStreamSetup_ElementExport,
+                                         true);
+    if (NvSciError_Success != err) {
+        printf("Pool failed (%x) to complete element export\n", err);
+        return 0;
+    }
+
+    /*
+     * Create and send all the packets and their buffers
+     * Note: Packets and buffers are not guaranteed to be received by
+     *       producer and consumer in the same order sent, nor are the
+     *       status messages sent back guaranteed to preserve ordering.
+     *       This is one reason why an event driven model is more robust.
+     */
+    for (i=0; i<poolData->numPacket; ++i) {
+
+        /*
+         * Create a new packet
+         * Our pool implementation doesn't need to save any packet-specific
+         * data, but we do need to provide unique cookies, so we just
+         * use the pointer to the location we save the handle. For other
+         * blocks, this will be a pointer to the structure where the
+         * packet information is kept.
+         */
+        NvSciStreamCookie cookie = (NvSciStreamCookie)&poolData->packet[i];
+        err = NvSciStreamPoolPacketCreate(poolData->common.block,
+                                          cookie,
+                                          &poolData->packet[i]);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to create packet %d\n", err, i);
+            return 0;
+        }
+
+        /* Create buffers for the packet */
+        for (e=0; e<numElem; ++e) {
+            /* Allocate a buffer object for the element */
+            NvSciBufObj obj;
+            err = NvSciBufObjAlloc(elem[e].attrList, &obj);
+            if (NvSciError_Success != err) {
+                printf("Failed (%x) to allocate buffer %d of packet %d\n",
+                       err, e, i);
+                return 0;
+            }
+
+            /* Insert the buffer in the packet */
+            err = NvSciStreamPoolPacketInsertBuffer(poolData->common.block,
+                                                    poolData->packet[i],
+                                                    e, obj);
+            if (NvSciError_Success != err) {
+                printf("Failed (%x) to insert buffer %d of packet %d\n",
+                       err, e, i);
+                return 0;
+            }
+
+            /* The pool doesn't need to keep a copy of the object handle */
+            NvSciBufObjFree(obj);
+        }
+
+        /* Indicate packet setup is complete */
+        err = NvSciStreamPoolPacketComplete(poolData->common.block,
+                                            poolData->packet[i]);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to complete packet %d setup\n",
+                   err, i);
+            return 0;
+        }
+    }
+
+    /*
+     * Indicate that all packets have been sent.
+     * Note: An application could choose to wait to send this until
+     *       the status has been received, in order to try to make any
+     *       corrections for rejected packets.
+     */
+    err = NvSciStreamBlockSetupStatusSet(poolData->common.block,
+                                         NvSciStreamSetup_PacketExport,
+                                         true);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to complete packet export\n",
+               err);
+        return 0;
+    }
+
+    /* Once all packets are set up, no longer need to keep the attributes */
+    for (e=0; e<numElem; ++e) {
+        ElemAttr* poolElem = &elem[e];
+        if (NULL != poolElem->attrList) {
+            NvSciBufAttrListFree(poolElem->attrList);
+            poolElem->attrList = NULL;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * If ready, handle setup of the C2C pool buffers.
+ *
+ * Most of the work the pool application has to do resides in this function.
+ */
+static int32_t handleC2cPoolBufferSetup(
+    PoolData* poolData)
+{
+    NvSciError err;
+
+    /* Query allocated element count from the primary pool */
+    uint32_t numElem;
+    err = NvSciStreamBlockElementCountGet(poolData->common.block,
+                                          NvSciStreamBlockType_Producer,
+                                          &numElem);
+    if (NvSciError_Success != err) {
+        printf("C2C pool failed (%x) to query allocated element count\n",
+               err);
+        return 0;
+    }
+
+    /* Query all allocated elements from the primary pool */
+    ElemAttr elem[MAX_ELEMS];
+    for (uint32_t i = 0U; i < numElem; ++i) {
+        err = NvSciStreamBlockElementAttrGet(poolData->common.block,
+                                             NvSciStreamBlockType_Producer, i,
+                                             &elem[i].userName,
+                                             &elem[i].attrList);
+        if (NvSciError_Success != err) {
+            printf("C2C pool failed (%x) to query allocated element %d\n",
+                   err, i);
+            return 0;
+        }
+    }
+
+    /* If necessary, query the consumer elements for validation */
+
+    /* Indicate that all element information has been imported */
+    poolData->elementsDone = true;
+    err = NvSciStreamBlockSetupStatusSet(poolData->common.block,
+                                         NvSciStreamSetup_ElementImport,
+                                         true);
+    if (NvSciError_Success != err) {
+        printf("C2C pool failed (%x) to complete element import\n", err);
+        return 0;
+    }
+
+    /*
+     * Create and send all the packets and their buffers
+     */
+    for (uint32_t i = 0; i < poolData->numPacket; ++i) {
+
+        /*
+         * Create a new packet
+         * Our pool implementation doesn't need to save any packet-specific
+         * data, but we do need to provide unique cookies, so we just
+         * use the pointer to the location we save the handle. For other
+         * blocks, this will be a pointer to the structure where the
+         * packet information is kept.
+         */
+        NvSciStreamCookie cookie = (NvSciStreamCookie)&poolData->packet[i];
+        err = NvSciStreamPoolPacketCreate(poolData->common.block,
+                                          cookie,
+                                          &poolData->packet[i]);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to create packet %d\n", err, i);
+            return 0;
+        }
+
+        /* Create buffers for the packet */
+        for (uint32_t e = 0; e < numElem; ++e) {
+            /* Allocate a buffer object for the element */
+            NvSciBufObj obj;
+            err = NvSciBufObjAlloc(elem[e].attrList, &obj);
+            if (NvSciError_Success != err) {
+                printf("Failed (%x) to allocate buffer %d of packet %d\n",
+                       err, e, i);
+                return 0;
+            }
+
+            /* Insert the buffer in the packet */
+            err = NvSciStreamPoolPacketInsertBuffer(poolData->common.block,
+                                                    poolData->packet[i],
+                                                    e, obj);
+            if (NvSciError_Success != err) {
+                printf("Failed (%x) to insert buffer %d of packet %d\n",
+                       err, e, i);
+                return 0;
+            }
+
+            /* The pool doesn't need to keep a copy of the object handle */
+            NvSciBufObjFree(obj);
+        }
+
+        /* Indicate packet setup is complete */
+        err = NvSciStreamPoolPacketComplete(poolData->common.block,
+                                            poolData->packet[i]);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to complete packet %d setup\n",
+                   err, i);
+            return 0;
+        }
+    }
+
+    /*
+     * Indicate that all packets have been sent.
+     * Note: An application could choose to wait to send this until
+     *       the status has been received, in order to try to make any
+     *       corrections for rejected packets.
+     */
+    err = NvSciStreamBlockSetupStatusSet(poolData->common.block,
+                                         NvSciStreamSetup_PacketExport,
+                                         true);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to complete packet export\n",
+               err);
+        return 0;
+    }
+
+    /* Once all packets are set up, no longer need to keep the attributes */
+    for (uint32_t e = 0; e < numElem; ++e) {
+        ElemAttr* poolElem = &elem[e];
+        if (NULL != poolElem->attrList) {
+            NvSciBufAttrListFree(poolElem->attrList);
+            poolElem->attrList = NULL;
+        }
+    }
+
+    return 1;
+}
+
+/* Check packet status */
+static int32_t handlePacketsStatus(
+    PoolData* poolData)
+{
+    bool packetFailure = false;
+    NvSciError err;
+
+    /* Check each packet */
+    for (uint32_t p = 0; p < poolData->numPacket; ++p) {
+        /* Check packet acceptance */
+        bool accept;
+        err = NvSciStreamPoolPacketStatusAcceptGet(poolData->common.block,
+                                                   poolData->packet[p],
+                                                   &accept);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to retrieve packet %d's acceptance status\n",
+                   err, p);
+            return 0;
+        }
+        if (accept) {
+            continue;
+        }
+
+        /* On rejection, query and report details */
+        packetFailure = true;
+        NvSciError status;
+
+        /* Check packet status from producer */
+        err = NvSciStreamPoolPacketStatusValueGet(
+                poolData->common.block,
+                poolData->packet[p],
+                NvSciStreamBlockType_Producer, 0U,
+                &status);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to retrieve packet %d's status from producer\n",
+                   err, p);
+            return 0;
+        }
+        if (status != NvSciError_Success) {
+            printf("Producer rejected packet %d with error %x\n", p, status);
+        }
+
+        /* Check packet status from consumers */
+        for (uint32_t c = 0; c < poolData->numConsumers; ++c) {
+            err = NvSciStreamPoolPacketStatusValueGet(
+                    poolData->common.block,
+                    poolData->packet[p],
+                    NvSciStreamBlockType_Consumer, c,
+                    &status);
+            if (NvSciError_Success != err) {
+                printf("Failed (%x) to retrieve packet %d's status from consumer %d\n",
+                       err, p, c);
+                return 0;
+            }
+            if (status != NvSciError_Success) {
+                printf("Consumer %d rejected packet %d with error %x\n",
+                       c, p, status);
+            }
+        }
+    }
+
+    /* Indicate that status for all packets has been received. */
+    poolData->packetsDone = true;
+    err = NvSciStreamBlockSetupStatusSet(poolData->common.block,
+                                         NvSciStreamSetup_PacketImport,
+                                         true);
+    if (NvSciError_Success != err) {
+        printf("Pool failed (%x) to complete packet import\n", err);
+        return 0;
+    }
+
+    return packetFailure ? 0 : 1;
+}
+
+/* Handle events on a pool block
+ *
+ * The pool block coordinates allocation of packets based on producer
+ * and consumer requirements during setup. After that, no further
+ * events should be received until the stream is torn down.
+ */
+static int32_t handlePool(
+    void* data,
+    uint32_t wait)
+{
+    /* Cast to pool data */
+    PoolData* poolData = (PoolData*)data;
+
+    /* Get time to wait */
+    int64_t waitTime = wait ? poolData->common.waitTime : 0;
+
+    /* Query/wait for an event on the block */
+    NvSciStreamEventType event;
+    NvSciError err;
+    err = NvSciStreamBlockEventQuery(poolData->common.block, waitTime, &event);
+
+    /* Handle errors */
+    if (NvSciError_Success != err) {
+
+        /* If not asked to wait, a timeout is not an error */
+        if (!waitTime && (NvSciError_Timeout == err)) {
+            return 0;
+        }
+
+        /* Otherwise, any error is considered fatal. A timeout probably
+         * indicates a failure to connect and complete setup in a timely
+         * fashion, so we specifically call out this case.
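+         * (Block wait times in this app are expressed in microseconds:
+         * the initial connection allows 60 * 1000000 us = 60 s, and
+         * later setup events 10 * 1000000 us = 10 s.)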
+ */ + if (NvSciError_Timeout == err) { + printf("Pool timed out waiting for setup instructions\n"); + } else { + printf("Pool event query failed with error %x\n", err); + } + deletePool(poolData); + return -1; + } + + /* If we received an event, handle it based on its type + * + * Note that there's a lot of error checking we could choose to do for + * some of these events, like making sure that we only receive each + * event once for a given entry. But NvSciStream is expected to take + * care of all of that, even when the application makes a mistake. + * So we only check for things that don't trigger NvSciStream errors. + */ + int32_t rv = 1; + NvSciError status; + switch (event) { + /* + * Any event we don't explicitly handle is a fatal error + */ + default: + printf("Pool received unknown event %x\n", event); + + rv = -1; + break; + + /* + * Error events should never occur with safety-certified drivers, + * and are provided only in non-safety builds for debugging + * purposes. Even then, they should only occur when something + * fundamental goes wrong, like the system running out of memory, + * or stack/heap corruption, or a bug in NvSci which should be + * reported to NVIDIA. + */ + case NvSciStreamEventType_Error: + err = NvSciStreamBlockErrorGet(poolData->common.block, &status); + if (NvSciError_Success != err) { + printf("%s Failed to query the error event code %x\n", + poolData->common.name, err); + rv = -1; + } else { + if ((opts.numLateConsumer > 0U) && + (status == NvSciError_StreamNotConnected)) { + printf("[WARN] %s received error event: %x\n", + poolData->common.name, status); + rv = 2; + } else { + printf("%s received error event: %x\n", + poolData->common.name, status); + rv = -1; + } + } + + break; + + /* + * If told to disconnect, it means either the stream finished its + * business or some other block had a failure. We'll just do a + * clean up and return without an error. But if it happened before + * all the pool setup operations finished, we'll report it for + * debugging purposes. + */ + case NvSciStreamEventType_Disconnected: + if (!poolData->elementsDone) { + printf("Warning: Pool disconnect before element support\n"); + } else if (!poolData->packetsDone) { + printf("Warning: Pool disconnect before packet setup\n"); + } + rv = 2; + break; + + /* + * The pool doesn't have to do anything immediately on connection, but + * now that the stream is complete we can reduce the timeout to wait + * for the producer and consumer events to arrive. + */ + case NvSciStreamEventType_Connected: + /* Initialize basic stream info */ + if (!handleStreamInit(poolData)) { + rv = -1; + } + + poolData->common.waitTime = 10 * 1000000; + break; + + /* Process all element support from producer and consumer(s) */ + case NvSciStreamEventType_Elements: + if (poolData->isC2cPool) { + if (!handleC2cPoolBufferSetup(poolData)) { + rv = -1; + } + } else { + if (!handlePoolBufferSetup(poolData)) { + rv = -1; + } + } + break; + + /* + * Check packet/buffer status returned from producer/consumer + * A more sophisticated application might have the means to recover + * from any failures. But in general we expect that in a production + * application, any failures are due to something fundamental going + * wrong like lack of memory/resources, which hopefully has been + * designed out. So these status checks are more useful during + * development, where we just report the issue for debugging purposes. 
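+     * (A development-time recovery sketch: an application that delayed
+     * its PacketExport status until packet status arrived, as the note
+     * above suggests, could withdraw a rejected packet with
+     *
+     *     NvSciStreamPoolPacketDelete(poolData->common.block,
+     *                                 poolData->packet[p]);
+     *
+     * and then create and send a replacement. This app treats any
+     * rejection as fatal instead.)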
+ * + * Once all the status events have been received for all packets + * and buffers, the pool should require no further interaction + * until the time comes to shut down the application. We set the + * wait time to infinite. + */ + case NvSciStreamEventType_PacketStatus: + /* There are multiple ways the status handling could be organized. + * In particular, waiting for status could be interleaved with + * sending the packets. This example waits for status from all + * packets before checking each packet's status. + */ + if (++poolData->numPacketReady < poolData->numPacket) { + break; + } + + if (!handlePacketsStatus(poolData)) { + rv = -1; + } + + poolData->common.waitTime = -1; + break; + + /* All setup complete. Transition to runtime phase */ + case NvSciStreamEventType_SetupComplete: + break; + } + + /* On failure or final event, clean up the block */ + if ((rv < 0) || (1 < rv)) { + deletePool(poolData); + } + + return rv; +} + + +/* Create and register a new pool block */ +int32_t createPool_Common( + NvSciStreamBlock* pool, + uint32_t numPacket, + bool isC2cPool) +{ + /* Create a data structure to track the block's status */ + PoolData* poolData = (PoolData*)calloc(1, sizeof(PoolData)); + if (NULL == poolData) { + printf("Failed to allocate data structure for pool\n"); + return 0; + } + + /* Save the name for debugging purposes */ + strcpy(poolData->common.name, "Pool"); + + /* Save the c2c pool flag */ + poolData->isC2cPool = isC2cPool; + + /* Save the packet count */ + poolData->numPacket = numPacket; + + /* Wait time for initial connection event will be 60 seconds */ + poolData->common.waitTime = 60 * 1000000; + + /* Create a pool block */ + NvSciError err = + NvSciStreamStaticPoolCreate(poolData->numPacket, + &poolData->common.block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create pool block\n", err); + deletePool(poolData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(poolData->common.block, poolData, handlePool)) { + deletePool(poolData); + return 0; + } + + *pool = poolData->common.block; + return 1; +} diff --git a/event_sample_app/block_presentsync.c b/event_sample_app/block_presentsync.c new file mode 100644 index 0000000..1e4c5b7 --- /dev/null +++ b/event_sample_app/block_presentsync.c @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - PresentSync block + */ + +#include +#include +#include "nvscistream.h" +#include "block_info.h" +#include "event_loop.h" + +/* + * PresentSync blocks do not require any block-specific interactions so we + * use the set of common functions to handle its events. 
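+ *
+ * Usage sketch (block handles hypothetical): placed downstream of the
+ * producer so payloads are forwarded once their prefences are signaled:
+ *
+ *     NvSciStreamBlock presentSync;
+ *     if (createPresentSync(&presentSync)) {
+ *         NvSciStreamBlockConnect(producer, presentSync);
+ *         NvSciStreamBlockConnect(presentSync, ipcSrc);
+ *     }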
+ */ + +/* Create and register a new presentSync block */ +int32_t createPresentSync( + NvSciStreamBlock* presentSync) +{ + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("PresentSync", 0); + if (NULL == blockData) { + return 0; + } + + /* Create a PresentSync block */ + NvSciError err = + NvSciStreamPresentSyncCreate(sciSyncModule, &blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create PresentSync block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + *presentSync = blockData->block; + return 1; +} diff --git a/event_sample_app/block_producer_uc1.c b/event_sample_app/block_producer_uc1.c new file mode 100644 index 0000000..62ccb7b --- /dev/null +++ b/event_sample_app/block_producer_uc1.c @@ -0,0 +1,1339 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - producer block for use case 1 + * + * This implements the producer for use case 1: cuda to cuda streaming + */ + +#include +#include +#include +#include "nvscibuf.h" +#include "nvscisync.h" +#include "nvscistream.h" +#include "cuda.h" +#include "cuda_runtime_api.h" +#include "block_info.h" +#include "event_loop.h" +#include "usecase1.h" +#include "util.h" + +/* Internal data structure used to track packets */ +typedef struct { + /* The packet handle use for NvSciStream functions */ + NvSciStreamPacket handle; + /* NvSci buffer object for the packet's data buffer */ + NvSciBufObj dataObj; + /* CUDA external memory handle for the data buffer */ + cudaExternalMemory_t dataExtMem; + /* CUDA device memory pointer for the data buffer */ + void* dataDevMem; + /* Local system memory buffer used as the source for CUDA operations */ + uint8_t* dataSrcMem; + /* NvSci buffer object for the packet's CRC buffer */ + NvSciBufObj crcObj; + /* Virtual address for the CRC buffer */ + uint8_t* crcPtr; + /* Fence for the latest payload using this packet */ + NvSciSyncFence fence; +} ProdPacket; + +/* Internal data used by the producer block */ +typedef struct { + /* Common block info */ + BlockData common; + + /* Number of consumers */ + uint32_t numConsumers; + + /* CUDA device ID and UUID */ + int32_t cudaDeviceId; + CUuuid cudaUuid; + + /* CUDA producer stream */ + cudaStream_t cudaStream; + + /* NvSciSync context to do CPU waiting for fences */ + NvSciSyncCpuWaitContext cpuWaitContext; + /* Sync attributes for CPU waiting */ + NvSciSyncAttrList cpuWaitAttr; + + /* CUDA sync attributes required for signaling */ + NvSciSyncAttrList signalAttr; + /* CUDA sync attributes required for waiting */ + NvSciSyncAttrList waiterAttr; + /* Sync object for CUDA to signal after generating data */ + NvSciSyncObj signalObj; + /* CUDA semaphore mapped to sync object */ + cudaExternalSemaphore_t signalSem; + /* Sync objects to wait for before 
generating data */ + NvSciSyncObj waiterObj[MAX_CONSUMERS]; + /* CUDA semaphores mapped to sync objects */ + cudaExternalSemaphore_t waiterSem[MAX_CONSUMERS]; + + /* Element index chosen by pool for the CRC buffer */ + uint32_t crcIndex; + /* Element index chosen by pool for the data buffer */ + uint32_t dataIndex; + /* Size for data buffer after reconciling all requirements */ + uint64_t dataSize; + /* Number of packets provided by pool */ + uint32_t numPacket; + /* Information about each packet */ + ProdPacket packets[MAX_PACKETS]; + + /* Number of payloads generated so far */ + uint32_t counter; + /* Index of dataSrcMem array */ + uint32_t idxSrcMem; + /* Flag indicating producer has finished generating all payloads */ + uint32_t finished; + /* Number of frames that producer is configured to send */ + uint32_t framesDesired; +} ProdData; + +/* Free up the packet resources */ +static void deletePacket( + ProdPacket* packet) +{ + if (packet != NULL) { + if (packet->handle != NvSciStreamPacket_Invalid) { + /* Free CUDA memory mapping */ + (void)cudaFree(packet->dataDevMem); + if (packet->dataExtMem) { + (void)cudaDestroyExternalMemory(packet->dataExtMem); + packet->dataExtMem = 0; + } + + if (packet->dataSrcMem) { + free(packet->dataSrcMem); + packet->dataSrcMem = NULL; + } + + /* Free buffer objects */ + if (packet->dataObj) { + NvSciBufObjFree(packet->dataObj); + packet->dataObj = NULL; + } + + if (packet->crcObj) { + NvSciBufObjFree(packet->crcObj); + packet->crcObj = NULL; + } + + /* Clear the fences */ + NvSciSyncFenceClear(&packet->fence); + } + + /* Clear out packet information */ + memset(packet, 0, sizeof(ProdPacket)); + } +} + +/* Free up producer block resources */ +static void deleteProducer( + ProdData* prodData) +{ + /* Destroy block */ + if (prodData->common.block != 0) { + (void)NvSciStreamBlockDelete(prodData->common.block); + prodData->common.block = 0; + } + + /* Free the packet resources */ + for (uint32_t i=0;inumPacket; i++) { + deletePacket(&prodData->packets[i]); + } + + /* Free the sync objects */ + for (uint32_t i=0; i< prodData->numConsumers; i++) { + if (prodData->waiterObj[i] != NULL) { + (void)cudaDestroyExternalSemaphore(prodData->waiterSem[i]); + prodData->waiterSem[i] = 0; + NvSciSyncObjFree(prodData->waiterObj[i]); + prodData->waiterObj[i] = NULL; + } + } + + if (prodData->signalObj != NULL) { + (void)cudaDestroyExternalSemaphore(prodData->signalSem); + prodData->signalSem = 0; + NvSciSyncObjFree(prodData->signalObj); + prodData->signalObj = NULL; + } + + /* Free the cpu waiters */ + if (prodData->cpuWaitAttr != NULL) { + NvSciSyncAttrListFree(prodData->cpuWaitAttr); + prodData->cpuWaitAttr = NULL; + } + + /* Free the CPU wait contetxt */ + if (prodData->cpuWaitContext != NULL) { + NvSciSyncCpuWaitContextFree(prodData->cpuWaitContext); + prodData->cpuWaitContext = NULL; + } + + /* Destroy CUDA stream */ + (void)cudaStreamDestroy(prodData->cudaStream); + + /* Free data */ + free(prodData); +} + +/* Handle query of basic stream info */ +static int32_t handleStreamInit( + ProdData* prodData) +{ + /* Query number of consumers */ + NvSciError err = + NvSciStreamBlockConsumerCountGet(prodData->common.block, + &prodData->numConsumers); + + if (NvSciError_Success != err) { + printf("Failed (%x) to query the number of consumers\n", err); + return 0; + } + + if (opts.endInfo) { + /* Query endpoint info from all consumers */ + for (uint32_t i = 0U; i < prodData->numConsumers; i++) { + uint32_t size = INFO_SIZE; + char info[INFO_SIZE] = {0}; + err = 
NvSciStreamBlockUserInfoGet( + prodData->common.block, + NvSciStreamBlockType_Consumer, i, + ENDINFO_NAME_PROC, + &size, &info); + if (NvSciError_Success == err) { + printf("Consumer %i info: %s\n", i, info); + } else if (NvSciError_StreamInfoNotProvided == err) { + printf("Info not provided by the consumer %d\n", i); + } else { + printf("Failed (%x) to query the consumer %d info\n", err, i); + return 0; + } + } + } + + return 1; +} + +/* Handle initialization of CUDA resources for producer */ +static int32_t handleProducerInit( + ProdData* prodData) +{ + int32_t cudaRtErr; + CUresult cudaErr; + + /* Get stack limit */ + size_t unused; + cudaRtErr = cudaDeviceGetLimit(&unused, cudaLimitStackSize); + if (cudaSuccess != cudaRtErr) { + printf("Failed (%d) to get CUDA device limit\n", cudaRtErr); + return 0; + } + + /* Set CUDA device */ + prodData->cudaDeviceId = 0; + cudaRtErr = cudaSetDevice(prodData->cudaDeviceId); + if (cudaSuccess != cudaRtErr) { + printf("Failed (%d) to set CUDA device\n", cudaRtErr); + return 0; + } + + /* Get UUID for CUDA device */ + cudaErr = cuDeviceGetUuid(&prodData->cudaUuid, prodData->cudaDeviceId); + if (CUDA_SUCCESS != cudaErr) { + printf("Failed (%d) to get CUDA UUID\n", cudaErr); + return 0; + } + + /* Get CUDA stream for asynchronous operation */ + cudaRtErr = cudaStreamCreateWithFlags(&prodData->cudaStream, + cudaStreamNonBlocking); + if (cudaSuccess != cudaRtErr) { + printf("Failed (%d) to create CUDA stream\n", cudaRtErr); + return 0; + } + + return 1; +} + +/* Handle setup of supported buffer attributes */ +static int32_t handleProducerElemSupport( + ProdData* prodData) +{ + /* + * Note: To illustrate that NvSciStream producer and consumer do + * not need to specify the same set of element types, or use + * the same order for element types, the producer for this + * use case sends the CRC attributes first, followed by the + * primary data, while the consumer uses the opposite order. + * Our pool implementation will end up using the producer + * ordering, but that is not required either. + */ + + NvSciError sciErr; + uint32_t bufName[2]; + NvSciBufAttrList bufAttrs[2]; + + /* + * CRC buffer requires write access by CPU, and uses a raw 64 byte + * data buffer with 1 byte alignment. + */ + NvSciBufAttrValAccessPerm crcPerm = NvSciBufAccessPerm_ReadWrite; + uint8_t crcCpu = 1U; + NvSciBufType crcBufType = NvSciBufType_RawBuffer; + uint64_t crcSize = 64U; + uint64_t crcAlign = 1U; + NvSciBufAttrKeyValuePair crcKeyVals[] = { + { NvSciBufGeneralAttrKey_Types, &crcBufType, sizeof(crcBufType) }, + { NvSciBufRawBufferAttrKey_Size, &crcSize, sizeof(crcSize) }, + { NvSciBufRawBufferAttrKey_Align, &crcAlign, sizeof(crcAlign) }, + { NvSciBufGeneralAttrKey_RequiredPerm, &crcPerm, sizeof(crcPerm) }, + { NvSciBufGeneralAttrKey_NeedCpuAccess, &crcCpu, sizeof(crcCpu) } + }; + + /* Create and fill attribute list for CRC checksum buffer */ + bufName[0] = ELEMENT_NAME_CRC; + sciErr = NvSciBufAttrListCreate(sciBufModule, &bufAttrs[0]); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to create CRC attribute list\n", sciErr); + return 0; + } + sciErr = NvSciBufAttrListSetAttrs(bufAttrs[0], + crcKeyVals, + sizeof(crcKeyVals) / + sizeof(NvSciBufAttrKeyValuePair)); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to fill CRC attribute list\n", sciErr); + return 0; + } + + /* + * Data buffer requires write access by CPU and the GPU of the cuda + * device, and uses a raw 128KB data buffer with 4KB alignment. 
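+     * (If endpoints request different sizes or alignments for this raw
+     * buffer, NvSciBuf reconciliation is expected to settle on values
+     * that satisfy all of them; the reconciled size is queried back
+     * from the pool's chosen attributes in handleProducerElemSetting.)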
+ */ + NvSciBufAttrValAccessPerm dataPerm = NvSciBufAccessPerm_ReadWrite; + uint8_t dataCpu = 1U; + NvSciRmGpuId dataGpu = { 0 }; + NvSciBufType dataBufType = NvSciBufType_RawBuffer; + uint64_t dataSize = 128U * 1024U; + uint64_t dataAlign = 4U * 1024U; + memcpy(&dataGpu.bytes, &prodData->cudaUuid.bytes, sizeof(dataGpu.bytes)); + NvSciBufAttrKeyValuePair dataKeyVals[] = { + { NvSciBufGeneralAttrKey_GpuId, &dataGpu, sizeof(dataGpu) }, + { NvSciBufGeneralAttrKey_Types, &dataBufType, sizeof(dataBufType) }, + { NvSciBufRawBufferAttrKey_Size, &dataSize, sizeof(dataSize) }, + { NvSciBufRawBufferAttrKey_Align, &dataAlign, sizeof(dataAlign) }, + { NvSciBufGeneralAttrKey_RequiredPerm, &dataPerm, sizeof(dataPerm) }, + { NvSciBufGeneralAttrKey_NeedCpuAccess, &dataCpu, sizeof(dataCpu) } + }; + + /* Create and fill attribute list for data buffer */ + bufName[1] = ELEMENT_NAME_DATA; + sciErr = NvSciBufAttrListCreate(sciBufModule, &bufAttrs[1]); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to create data attribute list\n", sciErr); + return 0; + } + sciErr = NvSciBufAttrListSetAttrs(bufAttrs[1], + dataKeyVals, + sizeof(dataKeyVals) / + sizeof(NvSciBufAttrKeyValuePair)); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to fill data attribute list\n", sciErr); + return 0; + } + + /* + * Inform stream of the attributes + * Once sent, the attribute lists are no longer needed + */ + for (uint32_t i=0; i<2U; ++i) { + sciErr = NvSciStreamBlockElementAttrSet(prodData->common.block, + bufName[i], bufAttrs[i]); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to send element %d attribute\n", + sciErr, i); + return 0; + } + NvSciBufAttrListFree(bufAttrs[i]); + } + + /* Indicate that all element information has been exported */ + sciErr = NvSciStreamBlockSetupStatusSet(prodData->common.block, + NvSciStreamSetup_ElementExport, + true); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to complete element export\n", sciErr); + return 0; + } + + return 1; +} + +/* Handle receipt of chosen element attributes */ +static int32_t handleProducerElemSetting( + ProdData* prodData) +{ + NvSciError err; + + /* + * This application does not need to query the element count, because we + * know it is always 2. But we do so anyways to show how it is done. 
+ */ + uint32_t count; + err = NvSciStreamBlockElementCountGet(prodData->common.block, + NvSciStreamBlockType_Pool, + &count); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to query element count\n", err); + return 0; + } + if (2U != count) { + printf("Producer received unexpected element count (%d)\n", count); + return 0; + } + + /* Process all elements */ + for (uint32_t i=0U; i<2U; ++i) { + + /* Query element type and attributes */ + uint32_t type; + NvSciBufAttrList bufAttr; + err = NvSciStreamBlockElementAttrGet(prodData->common.block, + NvSciStreamBlockType_Pool, i, + &type, &bufAttr); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to query element attr %d\n", err, i); + return 0; + } + + /* For data element, need to extract size and save index */ + if (ELEMENT_NAME_DATA == type) { + prodData->dataIndex = i; + NvSciBufAttrKeyValuePair keyVals[] = { + { NvSciBufRawBufferAttrKey_Size, NULL, 0 } + }; + err = NvSciBufAttrListGetAttrs(bufAttr, keyVals, 1); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to obtain buffer size\n", err); + return 0; + } + prodData->dataSize = *((const uint64_t*)(keyVals[0].value)); + + /* Set waiter attributes for the asynchronous element. */ + err = NvSciStreamBlockElementWaiterAttrSet(prodData->common.block, + i, + prodData->waiterAttr); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to send waiter attr for elem %d\n", + err, i); + return 0; + } + + /* Once sent, the waiting attributes are no longer needed */ + NvSciSyncAttrListFree(prodData->waiterAttr); + prodData->waiterAttr = NULL; + } + + /* For CRC element, just need to save the index */ + else if (ELEMENT_NAME_CRC == type) { + prodData->crcIndex = i; + + /* CRC element is a synchronous element. + * Pass NULL for the attr to indicate no sync object is needed. + * This call could be omitted since NULL is the default. */ + err = NvSciStreamBlockElementWaiterAttrSet(prodData->common.block, + i, NULL); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to send waiter attr for elem %d\n", + err, i); + return 0; + } + } + + /* Report any unknown element */ + else { + printf("Producer received unknown element type (%x)\n", type); + return 0; + } + + /* Don't need to keep attribute list */ + NvSciBufAttrListFree(bufAttr); + } + + /* Indicate that element import is complete */ + err = NvSciStreamBlockSetupStatusSet(prodData->common.block, + NvSciStreamSetup_ElementImport, + true); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to complete element import\n", err); + return 0; + } + + /* Indicate that waiter attribute export is done. 
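+     * Both of these statuses can be set here because this app sends its
+     * waiter attributes while walking the element list above.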
*/ + err = NvSciStreamBlockSetupStatusSet(prodData->common.block, + NvSciStreamSetup_WaiterAttrExport, + true); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to complete waiter attr export\n", err); + return 0; + } + + return 1; +} + +/* Handle creation of a new packet */ +static int32_t handleProducerPacketCreate( + ProdData* prodData) +{ + NvSciError err; + + /* Retrieve handle for packet pending creation */ + NvSciStreamPacket handle; + err = NvSciStreamBlockPacketNewHandleGet(prodData->common.block, + &handle); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to retrieve handle for the new packet\n", + err); + return 0; + } + + /* Make sure there is room for more packets */ + if (MAX_PACKETS <= prodData->numPacket) { + printf("Producer exceeded max packets\n"); + err = NvSciStreamBlockPacketStatusSet(prodData->common.block, + handle, + NvSciStreamCookie_Invalid, + NvSciError_Overflow); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to send packet status\n", err); + } + return 0; + } + + /* + * Allocate the next entry in the array for the new packet. + * Use the array entry for the cookie + */ + ProdPacket* packet = &prodData->packets[prodData->numPacket++]; + packet->handle = handle; + packet->fence = NvSciSyncFenceInitializer; + + /* Retrieve all buffers and map into application + * This use case has 2 elements. + */ + for (uint32_t index = 0; index < 2; index++) { + NvSciBufObj bufObj; + err = NvSciStreamBlockPacketBufferGet(prodData->common.block, + handle, + index, + &bufObj); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to retrieve buffer (%lx/%d)\n", + err, handle, index); + return 0; + } + + /* Handle mapping of data buffer */ + NvSciError sciErr; + int32_t cudaRtErr; + + if (index == prodData->dataIndex) { + + /* Save buffer object */ + packet->dataObj = bufObj; + + /* Map in the buffer as CUDA external memory */ + struct cudaExternalMemoryHandleDesc memHandleDesc; + memset(&memHandleDesc, 0, sizeof(memHandleDesc)); + memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; + memHandleDesc.handle.nvSciBufObject = bufObj; + memHandleDesc.size = prodData->dataSize; + cudaRtErr = cudaImportExternalMemory(&packet->dataExtMem, + &memHandleDesc); + if (cudaSuccess != cudaRtErr) { + printf("Producer failed (%d) to map buffer as external mem\n", + cudaRtErr); + return 0; + } + + /* Map in the buffer as CUDA device memory */ + struct cudaExternalMemoryBufferDesc memBufferDesc; + memset(&memBufferDesc, 0, sizeof(memBufferDesc)); + memBufferDesc.size = prodData->dataSize; + memBufferDesc.offset = 0; + cudaRtErr = cudaExternalMemoryGetMappedBuffer(&packet->dataDevMem, + packet->dataExtMem, + &memBufferDesc); + if (cudaSuccess != cudaRtErr) { + printf("Producer failed (%d) to map buffer as device mem\n", + cudaRtErr); + return 0; + } + + /* Allocate normal memory to use as the source for the CUDA op */ + packet->dataSrcMem = (uint8_t*)malloc(prodData->dataSize); + if (NULL == packet->dataSrcMem) { + printf("Producer failed to allocate source buffer\n"); + return 0; + } + + /* Fill in with initial values */ + memset(packet->dataSrcMem, 0x5A, prodData->dataSize); + + } + + /* Handle mapping of CRC buffer */ + else if (index == prodData->crcIndex) { + + /* Save buffer object */ + packet->crcObj = bufObj; + + /* Get a CPU pointer for the buffer from NvSci */ + sciErr = NvSciBufObjGetCpuPtr(bufObj, (void**)&packet->crcPtr); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to map CRC buffer\n", 
sciErr); + return 0; + } + + } + + /* Shouldn't be any other index */ + else { + printf("Producer received buffer for unknown element (%d)\n", + index); + return 0; + } + + } + + /* Inform pool of success. + * Note: Could inform the pool of any of the failures above. + */ + err = NvSciStreamBlockPacketStatusSet(prodData->common.block, + handle, + (NvSciStreamCookie)packet, + NvSciError_Success); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to inform pool of packet status\n", err); + return 0; + } + + return 1; +} + +/* Handle deletion of packet */ +static void handleProducerPacketDelete( + ProdData* prodData) +{ + /* Get the deleted packet cookie*/ + NvSciStreamCookie cookie; + NvSciError err = + NvSciStreamBlockPacketOldCookieGet(prodData->common.block, + &cookie); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to retrieve the deleted packet cookie\n", + err); + } + + /* Get packet pointer */ + ProdPacket* packet = (ProdPacket*)cookie; + + /* Free the packet resources */ + deletePacket(packet); +} + +/* Handle setup of supported sync attributes */ +static int32_t handleProducerSyncSupport( + ProdData* prodData) +{ + NvSciError sciErr; + int32_t cudaRtErr; + + /* + * Create sync attribute list for signaling. + * This will be saved until we receive the consumer's attributes + */ + sciErr = NvSciSyncAttrListCreate(sciSyncModule, &prodData->signalAttr); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to allocate signal sync attrs\n", sciErr); + return 0; + } + + /* Have CUDA fill the signaling attribute list */ + cudaRtErr = cudaDeviceGetNvSciSyncAttributes(prodData->signalAttr, + prodData->cudaDeviceId, + cudaNvSciSyncAttrSignal); + if (cudaSuccess != cudaRtErr) { + printf("Producer failed (%d) to fill signal sync attrs\n", cudaRtErr); + return 0; + } + + /* Create sync attribute list for waiting. */ + sciErr = NvSciSyncAttrListCreate(sciSyncModule, &prodData->waiterAttr); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to allocate waiter sync attrs\n", sciErr); + return 0; + } + + /* Have CUDA fill the waiting attribute list */ + cudaRtErr = cudaDeviceGetNvSciSyncAttributes(prodData->waiterAttr, + prodData->cudaDeviceId, + cudaNvSciSyncAttrWait); + if (cudaSuccess != cudaRtErr) { + printf("Producer failed (%d) to fill waiter sync attrs\n", cudaRtErr); + return 0; + } + + /* + * Most producers will only need to signal their own sync objects and + * wait for the consumer sync object(s). But to protect a local + * data buffer, this producer will also need the ability to do + * CPU waits on the sync objects it signals. 
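+     * At runtime that protection amounts to a single CPU wait on the
+     * fence from the packet's previous payload, roughly:
+     *     NvSciSyncFenceWait(&packet->fence, cpuWaitContext, -1);
+     * (see handleProducerPayload below).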
+ */ + + /* Create attribute list for CPU waiting */ + sciErr = NvSciSyncAttrListCreate(sciSyncModule, &prodData->cpuWaitAttr); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to allocate cpu wait sync attrs\n", + sciErr); + return 0; + } + + /* Fill attribute list for CPU waiting */ + uint8_t cpuSync = 1; + NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; + NvSciSyncAttrKeyValuePair cpuKeyVals[] = { + { NvSciSyncAttrKey_NeedCpuAccess, &cpuSync, sizeof(cpuSync) }, + { NvSciSyncAttrKey_RequiredPerm, &cpuPerm, sizeof(cpuPerm) } + }; + sciErr = NvSciSyncAttrListSetAttrs(prodData->cpuWaitAttr, cpuKeyVals, 2); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to fill cpu wait sync attrs\n", sciErr); + return 0; + } + + /* Create a context for CPU waiting */ + sciErr = NvSciSyncCpuWaitContextAlloc(sciSyncModule, + &prodData->cpuWaitContext); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to allocate cpu wait context\n", sciErr); + return 0; + } + + return 1; +} + +/* Handle creation and export of producer sync object */ +static int32_t handleProducerSyncExport( + ProdData* prodData) +{ + NvSciError sciErr; + uint32_t cudaRtErr; + + /* Process waiter attrs from all elements. + * As CRC element is a synchronous element, + * no need to query the sync object for it. + */ + NvSciSyncAttrList waiterAttr = NULL; + sciErr = NvSciStreamBlockElementWaiterAttrGet(prodData->common.block, + prodData->dataIndex, + &waiterAttr); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to query waiter attr\n", + sciErr); + return 0; + } + if (NULL == waiterAttr) { + printf("Producer received NULL waiter attr for data elem\n"); + return 0; + } + + /* Indicate that waiter attribute import is done. */ + sciErr = NvSciStreamBlockSetupStatusSet(prodData->common.block, + NvSciStreamSetup_WaiterAttrImport, + true); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to complete waiter attr import\n", + sciErr); + return 0; + } + + /* + * Merge and reconcile consumer sync attrs with ours. + * Note: Many producers would only require their signaler attributes + * and the consumer waiter attributes. As noted above, we also + * add in attributes to allow us to CPU wait for the syncs that + * we signal. 
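+     * If the combined requirements cannot be satisfied,
+     * NvSciSyncAttrListReconcile() below fails and returns a separate
+     * conflict list that can help diagnose the mismatch.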
+ */ + NvSciSyncAttrList unreconciled[3] = { + prodData->signalAttr, + waiterAttr, + prodData->cpuWaitAttr }; + NvSciSyncAttrList reconciled = NULL; + NvSciSyncAttrList conflicts = NULL; + sciErr = NvSciSyncAttrListReconcile(unreconciled, 3, + &reconciled, &conflicts); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to reconcile sync attributes\n", sciErr); + return 0; + } + + /* Allocate sync object */ + sciErr = NvSciSyncObjAlloc(reconciled, &prodData->signalObj); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to allocate sync object\n", sciErr); + return 0; + } + + /* Free the attribute lists */ + NvSciSyncAttrListFree(prodData->signalAttr); + prodData->signalAttr = NULL; + NvSciSyncAttrListFree(waiterAttr); + NvSciSyncAttrListFree(reconciled); + + /* Create CUDA semaphore for sync object */ + struct cudaExternalSemaphoreHandleDesc extSemDesc; + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = prodData->signalObj; + cudaRtErr = cudaImportExternalSemaphore(&prodData->signalSem, + &extSemDesc); + if (cudaSuccess != cudaRtErr) { + printf("Producer failed (%d) to map signal object to semaphore\n", + cudaRtErr); + return 0; + } + + /* Only send the sync object for the asynchronous element. + * If this function is not called for an element, + * the sync object is assumed to be NULL. + * In this use case, CRC element doesn't use sync object. + */ + sciErr = NvSciStreamBlockElementSignalObjSet(prodData->common.block, + prodData->dataIndex, + prodData->signalObj); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to send sync object\n", sciErr); + return 0; + } + + /* Indicate that sync object export is complete */ + sciErr = NvSciStreamBlockSetupStatusSet(prodData->common.block, + NvSciStreamSetup_SignalObjExport, + true); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to complete signal obj export\n", + sciErr); + return 0; + } + + return 1; +} + +/* Handle import of consumer sync object */ +static int32_t handleProducerSyncImport( + ProdData* prodData) +{ + uint32_t cudaRtErr; + NvSciError sciErr; + + /* Query sync objects for asynchronous elements + * from all consumers. + */ + for (uint32_t c = 0U; c < prodData->numConsumers; c++) { + NvSciSyncObj waiterObj = NULL; + sciErr = NvSciStreamBlockElementSignalObjGet( + prodData->common.block, + c, prodData->dataIndex, + &waiterObj); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to query sync obj from consumer %d\n", + sciErr, c); + return 0; + } + + /* Save object */ + prodData->waiterObj[c] = waiterObj; + + /* If the waiter sync obj is NULL, + * it means this element is ready to use when received. 
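+         * (A consumer might provide a NULL object for an element it
+         * only reads synchronously, e.g. after its own CPU wait, so it
+         * never signals fences for that element.)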
+         */
+        if (NULL != waiterObj) {
+            /* Create CUDA semaphore for sync object */
+            struct cudaExternalSemaphoreHandleDesc extSemDesc;
+            memset(&extSemDesc, 0, sizeof(extSemDesc));
+            extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
+            extSemDesc.handle.nvSciSyncObj = waiterObj;
+            cudaRtErr = cudaImportExternalSemaphore(&prodData->waiterSem[c],
+                                                    &extSemDesc);
+            if (cudaSuccess != cudaRtErr) {
+                printf("Producer failed (%d) to map waiter obj from cons %d\n",
+                       cudaRtErr, c);
+                return 0;
+            }
+        }
+    }
+
+    /* Indicate that sync object import is complete */
+    sciErr = NvSciStreamBlockSetupStatusSet(prodData->common.block,
+                                            NvSciStreamSetup_SignalObjImport,
+                                            true);
+    if (NvSciError_Success != sciErr) {
+        printf("Producer failed (%x) to complete signal obj import\n",
+               sciErr);
+        return 0;
+    }
+
+    return 1;
+}
+
+/* Handle generation of payloads */
+static int32_t handleProducerPayload(
+    ProdData* prodData)
+{
+    NvSciError sciErr;
+    int32_t cudaErr;
+
+    /* Obtain packet for the new payload */
+    NvSciStreamCookie cookie;
+    sciErr = NvSciStreamProducerPacketGet(prodData->common.block,
+                                          &cookie);
+    if (NvSciError_Success != sciErr) {
+        if ((opts.numLateConsumer > 0U) && (sciErr == NvSciError_StreamNotConnected)) {
+            printf("[WARN] Producer failed (%x) to obtain packet for payload\n", sciErr);
+        } else {
+            printf("Producer failed (%x) to obtain packet for payload\n", sciErr);
+        }
+        return 0;
+    }
+    ProdPacket* packet = (ProdPacket*)cookie;
+
+    /*
+     * Before modifying the contents of the source buffer, make sure the
+     * previous copy from the buffer has completed. Once done, the
+     * fence can be cleared.
+     * Note: This CPU wait on the previously generated payload for this
+     *       packet is only necessary to protect the source buffer contents.
+     *       If this producer were processing data coming in from an external
+     *       source, or generating data without copying from a fixed source,
+     *       this wait would not be necessary. For most producers, it is
+     *       sufficient to have the engine wait for the consumer prefences.
+     *       However, this wait does add some throttling, preventing the
+     *       producer from issuing commands for many payloads in advance,
+     *       which can be valuable in some use cases.
+     */
+    sciErr = NvSciSyncFenceWait(&packet->fence, prodData->cpuWaitContext, -1);
+    if (NvSciError_Success != sciErr) {
+        printf("Producer failed (%x) to CPU wait for previous fence\n",
+               sciErr);
+        return 0;
+    }
+    NvSciSyncFenceClear(&packet->fence);
+
+    /*
+     * Modify 1 byte of the source.
+     * We do this mainly so we're not sending the same thing every time,
+     * but also to have an excuse to illustrate the CPU wait above,
+     * for use cases where the producer needs to wait for more than
+     * just an available buffer to continue.
+     */
+    packet->dataSrcMem[prodData->idxSrcMem] =
+        (uint8_t)(prodData->counter % 256);
+    prodData->counter++;
+    prodData->idxSrcMem = (prodData->idxSrcMem+1) % (prodData->dataSize);
+
+    /* Query fences for the data element from each consumer */
+    for (uint32_t i = 0U; i < prodData->numConsumers; i++) {
+        /* If the received waiter obj is NULL,
+         * the consumer is done using this element;
+         * skip waiting on the pre-fence.
+         */
+        if (NULL == prodData->waiterObj[i]) {
+            continue;
+        }
+
+        NvSciSyncFence prefence = NvSciSyncFenceInitializer;
+        sciErr = NvSciStreamBlockPacketFenceGet(prodData->common.block,
+                                                packet->handle,
+                                                i, prodData->dataIndex,
+                                                &prefence);
+        if (NvSciError_Success != sciErr) {
+            if ((opts.numLateConsumer > 0U) && (sciErr == NvSciError_StreamNotConnected)) {
+                printf("[WARN] Producer failed (%x) to query fence from consumer %d\n",
+                       sciErr, i);
+            } else {
+                printf("Producer failed (%x) to query fence from consumer %d\n",
+                       sciErr, i);
+            }
+            return 0;
+        }
+
+        /* Instruct CUDA to wait for each of the consumer fences */
+        struct cudaExternalSemaphoreWaitParams waitParams;
+        memset(&waitParams, 0, sizeof(waitParams));
+        waitParams.params.nvSciSync.fence = &prefence;
+        waitParams.flags = 0;
+        cudaErr = cudaWaitExternalSemaphoresAsync(
+                      &prodData->waiterSem[i],
+                      &waitParams, 1,
+                      prodData->cudaStream);
+        if (cudaSuccess != cudaErr) {
+            printf("Producer failed (%d) to wait for prefence from cons %d\n",
+                   cudaErr, i);
+            return 0;
+        }
+        NvSciSyncFenceClear(&prefence);
+    }
+
+    /* Instruct CUDA to copy the source buffer to the packet data buffer */
+    cudaErr = cudaMemcpy2DAsync(packet->dataDevMem,
+                                prodData->dataSize,
+                                packet->dataSrcMem,
+                                prodData->dataSize,
+                                prodData->dataSize,
+                                1,
+                                cudaMemcpyHostToDevice,
+                                prodData->cudaStream);
+    if (cudaSuccess != cudaErr) {
+        printf("Producer failed (%d) to issue copy command\n", cudaErr);
+        return 0;
+    }
+
+    /* Inform CUDA to signal a fence when the copy completes */
+    struct cudaExternalSemaphoreSignalParams signalParams;
+    memset(&signalParams, 0, sizeof(signalParams));
+    signalParams.params.nvSciSync.fence = &packet->fence;
+    signalParams.flags = 0;
+    cudaErr = cudaSignalExternalSemaphoresAsync(&prodData->signalSem,
+                                                &signalParams,
+                                                1,
+                                                prodData->cudaStream);
+    if (cudaSuccess != cudaErr) {
+        printf("Producer failed (%d) to signal postfence\n", cudaErr);
+        return 0;
+    }
+
+    /* Generate a checksum and save it to the CRC buffer of the packet */
+    *((uint32_t*)(packet->crcPtr)) = generateCRC(packet->dataSrcMem,
+                                                 1,
+                                                 prodData->dataSize,
+                                                 prodData->dataSize);
+
+    /* Update postfence for data element */
+    sciErr = NvSciStreamBlockPacketFenceSet(prodData->common.block,
+                                            packet->handle,
+                                            prodData->dataIndex,
+                                            &packet->fence);
+    if (NvSciError_Success != sciErr) {
+        if ((opts.numLateConsumer > 0U) && (sciErr == NvSciError_StreamNotConnected)) {
+            printf("[WARN] Producer failed (%x) to set postfence\n", sciErr);
+        } else {
+            printf("Producer failed (%x) to set postfence\n", sciErr);
+        }
+        return 0;
+    }
+
+    /* Send the new payload to the consumer(s) */
+    sciErr = NvSciStreamProducerPacketPresent(prodData->common.block,
+                                              packet->handle);
+    if (NvSciError_Success != sciErr) {
+        if ((opts.numLateConsumer > 0U) && (sciErr == NvSciError_StreamNotConnected)) {
+            printf("[WARN] Producer failed (%x) to present packet\n", sciErr);
+        } else {
+            printf("Producer failed (%x) to present packet\n", sciErr);
+        }
+        return 0;
+    }
+
+    /* If counter has reached the limit, indicate finished */
+    if (prodData->counter == prodData->framesDesired) {
+        /* Make sure all operations have been completed
+         * before resource cleanup.
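+         * (Unlike the infinite (-1) wait used per payload above, this
+         * final wait uses a large finite timeout, so a stalled stream
+         * eventually fails here instead of blocking teardown forever.)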
+ */ + sciErr = NvSciSyncFenceWait(&packet->fence, + prodData->cpuWaitContext, + 0xFFFFFFFF); + if (NvSciError_Success != sciErr) { + printf("Producer failed (%x) to wait for all operations done\n", + sciErr); + return 0; + } + prodData->finished = 1; + } + + return 1; +} + +/* Handle events on a producer block + * + * The producer block informs the stream of the producers buffer and + * synchronization requirements and capabilities, creates signaling + * synchronization objects and receives synchronization objects to + * wait for, maps buffers and synchronization objects to the producer + * engine(s), and generates data. + */ +static int32_t handleProducer( + void* data, + uint32_t wait) +{ + /* Cast to producer data */ + ProdData* prodData = (ProdData*)data; + + /* Get time to wait */ + int64_t waitTime = wait ? prodData->common.waitTime : 0; + + /* Query/wait for an event on the block */ + + NvSciStreamEventType event; + NvSciError err; + err = NvSciStreamBlockEventQuery(prodData->common.block, waitTime, &event); + + /* Handle errors */ + if (NvSciError_Success != err) { + + /* If not asked to wait, a timeout is not an error */ + if (!waitTime && (NvSciError_Timeout == err)) { + return 0; + } + + /* Otherwise, any error is considered fatal. A timeout probably + * indicates a failure to connect and complete setup in a timely + * fashion, so we specifically call out this case. + */ + if (NvSciError_Timeout == err) { + printf("Producer timed out waiting for setup instructions\n"); + } else { + printf("Producer event query failed with error %x\n", err); + } + deleteProducer(prodData); + return -1; + } + + /* If we received an event, handle it based on its type + * + * Note that there's a lot of error checking we could choose to do for + * some of these events, like making sure that we only receive each + * event once for a given entry. But NvSciStream is expected to take + * care of all of that, even when the application makes a mistake. + * So we only check for things that don't trigger NvSciStream errors. + */ + int32_t rv = 1; + NvSciError status; + switch (event) { + /* + * Any event we don't explicitly handle is a fatal error + */ + default: + printf("Producer received unknown event %x\n", event); + + rv = -1; + break; + + /* + * Error events should never occur with safety-certified drivers, + * and are provided only in non-safety builds for debugging + * purposes. Even then, they should only occur when something + * fundamental goes wrong, like the system running out of memory, + * or stack/heap corruption, or a bug in NvSci which should be + * reported to NVIDIA. + */ + case NvSciStreamEventType_Error: + err = NvSciStreamBlockErrorGet(prodData->common.block, &status); + if (NvSciError_Success != err) { + printf("%s Failed to query the error event code %x\n", + prodData->common.name, err); + } else { + printf("%s received error event: %x\n", + prodData->common.name, status); + } + + rv = -1; + break; + + /* + * If told to disconnect, it means either the stream finished its + * business or some other block had a failure. We'll just do a + * clean up and return without an error. + */ + case NvSciStreamEventType_Disconnected: + printf("Producer disconnected after sending %d payloads\n", prodData->counter); + rv = 2; + break; + + /* + * On connection, the producer should initialize the appopriate engine(s) + * and obtain the necessary buffer and synchronization attribute lists + * for the desired use case. 
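+     * Under normal operation the remaining setup events then arrive
+     * roughly in the order handled below: Elements, PacketCreate (per
+     * packet), PacketsComplete, WaiterAttr, SignalObj and
+     * SetupComplete, followed by PacketReady for each payload.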
+ */ + case NvSciStreamEventType_Connected: + + /* Initialize basic stream info */ + if (!handleStreamInit(prodData)) { + rv = -1; + } + /* Initialize CUDA access */ + else if (!handleProducerInit(prodData)) { + rv = -1; + } + /* Determine supported buffer attributes */ + else if (!handleProducerElemSupport(prodData)) { + rv = -1; + } + /* Determined supported sync attributes */ + else if (!handleProducerSyncSupport(prodData)) { + rv = -1; + } + + /* Now that we're fully connected, set the wait time to infinite */ + prodData->common.waitTime = -1; + break; + + /* Retrieve all element information from pool */ + case NvSciStreamEventType_Elements: + if (!handleProducerElemSetting(prodData)) { + rv = -1; + } + break; + + /* For a packet, set up an entry in the array */ + case NvSciStreamEventType_PacketCreate: + if (!handleProducerPacketCreate(prodData)) { + rv = -1; + } + break; + + /* Finish any setup related to packet resources */ + case NvSciStreamEventType_PacketsComplete: + /* For this use case, nothing else to setup. + * Inform the NvSciStream that the producer has imported all packets. + */ + err = NvSciStreamBlockSetupStatusSet(prodData->common.block, + NvSciStreamSetup_PacketImport, + true); + if (NvSciError_Success != err) { + printf("Producer failed (%x) to complete packet import\n", err); + rv = -1; + } + break; + + /* Delete a packet - usually only relevant for non-safety applications */ + case NvSciStreamEventType_PacketDelete: + handleProducerPacketDelete(prodData); + break; + + case NvSciStreamEventType_WaiterAttr: + if (!handleProducerSyncExport(prodData)) { + rv = -1; + } + break; + + /* Import consumer sync objects for all elements */ + case NvSciStreamEventType_SignalObj: + if (!handleProducerSyncImport(prodData)) { + rv = -1; + } + break; + + /* All setup complete. Transition to runtime phase */ + case NvSciStreamEventType_SetupComplete: + printf("Producer setup completed\n"); + break; + + /* Generate payloads when packets are available */ + case NvSciStreamEventType_PacketReady: + if (!handleProducerPayload(prodData)) { + rv = -1; + } else if (prodData->finished) { + printf("Producer finished sending %d payloads\n", prodData->counter); + rv = 2; + } + break; + } + + /* On failure or final event, clean up the block */ + if ((rv < 0) || (1 < rv)) { + if ((rv < 0) && (opts.numLateConsumer > 0U)) { + printf("Producer disconnected after sending %d payloads\n", prodData->counter); + rv = 2; + } + deleteProducer(prodData); + } + + return rv; +} + +/* Create and register a new producer block */ +int32_t createProducer_Usecase1( + NvSciStreamBlock* producer, + NvSciStreamBlock pool, + uint32_t numFrames) +{ + /* Create a data structure to track the block's status */ + ProdData* prodData = (ProdData*)calloc(1, sizeof(ProdData)); + if (NULL == prodData) { + printf("Failed to allocate data structure for producer\n"); + return 0; + } + + /* Save the name for debugging purposes */ + strcpy(prodData->common.name, "Producer"); + + /* Wait time for initial connection event will be 60 seconds */ + prodData->common.waitTime = 60 * 1000000; + prodData->framesDesired = numFrames; + + /* Create a producer block */ + NvSciError err = + NvSciStreamProducerCreate(pool, &prodData->common.block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create producer block\n", err); + deleteProducer(prodData); + return 0; + } + + if (opts.endInfo) { + /* Add endpoint information on producer side. 
+         * Application can specify user-defined info to help set up the
+         * stream, which can be queried by other blocks after stream
+         * connection.
+         */
+        char info[INFO_SIZE] = {0};
+        size_t infoSize =
+            snprintf(info, INFO_SIZE, "%s%d", "Producer proc: ", getpid());
+        err = NvSciStreamBlockUserInfoSet(prodData->common.block,
+                                          ENDINFO_NAME_PROC,
+                                          infoSize, info);
+        if (NvSciError_Success != err) {
+            printf("Failed (%x) to setup the producer info\n", err);
+            deleteProducer(prodData);
+            return 0;
+        }
+    }
+
+    /* Register block with event handling mechanism */
+    if (!eventFuncs->reg(prodData->common.block, prodData, handleProducer)) {
+        deleteProducer(prodData);
+        return 0;
+    }
+
+    *producer = prodData->common.block;
+
+    return 1;
+}
diff --git a/event_sample_app/block_queue.c b/event_sample_app/block_queue.c
new file mode 100644
index 0000000..47f71b2
--- /dev/null
+++ b/event_sample_app/block_queue.c
@@ -0,0 +1,57 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+/*
+ * NvSciStream Event Loop Driven Sample App - queue block
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "nvscistream.h"
+#include "block_info.h"
+#include "event_loop.h"
+
+/*
+ * Queue blocks do not require any block-specific interactions so we
+ * use the set of common functions to handle their events.
+ */
+
+/* Create and register a new queue block */
+int32_t createQueue(
+    NvSciStreamBlock* queue,
+    uint32_t useMailbox)
+{
+    /* Create a data structure to track the block's status */
+    BlockData* blockData = createCommon(useMailbox ? "Mailbox" : "FIFO", 0);
+    if (NULL == blockData) {
+        return 0;
+    }
+
+    /* Create a queue block */
+    NvSciError err = useMailbox
+                   ? NvSciStreamMailboxQueueCreate(&blockData->block)
+                   : NvSciStreamFifoQueueCreate(&blockData->block);
+    if (NvSciError_Success != err) {
+        printf("Failed (%x) to create queue block\n", err);
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    /* Register block with event handling mechanism */
+    if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) {
+        deleteCommon(blockData);
+        return 0;
+    }
+
+    *queue = blockData->block;
+    return 1;
+}
diff --git a/event_sample_app/block_returnsync.c b/event_sample_app/block_returnsync.c
new file mode 100644
index 0000000..3511cb1
--- /dev/null
+++ b/event_sample_app/block_returnsync.c
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */ + +/* + * NvSciStream Event Loop Driven Sample App - ReturnSync block + */ + +#include +#include +#include "nvscistream.h" +#include "block_info.h" +#include "event_loop.h" + +/* + * ReturnSync blocks do not require any block-specific interactions so we + * use the set of common functions to handle its events. + */ + +/* Create and register a new returnSync block */ +int32_t createReturnSync( + NvSciStreamBlock* returnSync) +{ + /* Create a data structure to track the block's status */ + BlockData* blockData = createCommon("ReturnSync", 0); + if (NULL == blockData) { + return 0; + } + + /* Create a ReturnSync block */ + NvSciError err = + NvSciStreamReturnSyncCreate(sciSyncModule, &blockData->block); + if (NvSciError_Success != err) { + printf("Failed (%x) to create ReturnSync block\n", err); + deleteCommon(blockData); + return 0; + } + + /* Register block with event handling mechanism */ + if (!eventFuncs->reg(blockData->block, blockData, handleCommon)) { + deleteCommon(blockData); + return 0; + } + + *returnSync = blockData->block; + return 1; +} diff --git a/event_sample_app/event_loop.h b/event_sample_app/event_loop.h new file mode 100644 index 0000000..7d5e43c --- /dev/null +++ b/event_sample_app/event_loop.h @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - event handler abstraction + */ + +#ifndef _EVENT_LOOP_H +#define _EVENT_LOOP_H 1 + +#include +#include "nvscistream.h" + +/* + * Block event handler function. + * Input: + * data: The block's type-specific private data structure + * wait: Flag indicating whether to wait for an event + * Returns: + * +2: Block has completed its last operation and will be destroyed + * +1: An event was found and processed + * 0: No event was found (not an error) + * -1: Block has encountered a fatal error and will be destroyed + */ +typedef int32_t (*BlockFunc)(void* data, uint32_t wait); + +/* Table of events to abstract the two approaches for event loops */ +typedef struct { + int32_t (*init)(void); + int32_t (*reg)(NvSciStreamBlock, void*, BlockFunc); + int32_t (*regInt)(NvSciStreamBlock); + int32_t (*loop)(void); +} EventFuncs; + +/* Chosen event function table */ +extern EventFuncs const* eventFuncs; + +/* Event tables for the two methods */ +extern EventFuncs const eventFuncs_Service; +extern EventFuncs const eventFuncs_Threads; + +#endif // _EVENT_LOOP_H diff --git a/event_sample_app/event_loop_service.c b/event_sample_app/event_loop_service.c new file mode 100644 index 0000000..48aeee3 --- /dev/null +++ b/event_sample_app/event_loop_service.c @@ -0,0 +1,339 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - service-based event handling + * + * This file implements the option to handle events for all blocks + * through an event service. Each block adds an event notifier to + * a list. That notifier will be signaled when an event is ready + * on the block. A single main loop waits for one or more of the + * notifiers to trigger, processes events on the corresponding + * blocks, and goes back to waiting. When all blocks have been + * destroyed either due to failure or all payloads being processed, + * the loop exits and the function returns. + */ + +#include +#include +#include +#include +#if (QNX == 1) +#include +#endif +#include "nvscievent.h" +#include "block_info.h" +#include "event_loop.h" + +/* Event service */ +static NvSciEventLoopService* service = NULL; + +/* List of blocks */ +#define MAX_INTERNAL_NOTIFIERS 10 +#define MAX_NOTIFIERS MAX_BLOCKS + MAX_INTERNAL_NOTIFIERS + +int32_t numBlocks = 0U; +uint32_t numAlive = 0U; +static uint32_t numIntNotifiers = 0U; +static int32_t numNotifiers = 0U; + +BlockEventData blocks[MAX_BLOCKS]; +BlockEventData* blocksAlive[MAX_BLOCKS]; +static NvSciEventNotifier* intNotifiers[MAX_INTERNAL_NOTIFIERS]; + +static uint32_t success = 1U; + +/* Initialize service-based event handling */ +static int32_t eventServiceInit(void) +{ + /* + * The OS configuration should be NULL for Linux and should + * have a valid configuration for QNX. + * See NvSciEventLoopServiceCreateSafe API Specification for more + * information. + */ + void *osConfig = NULL; + +#if (QNX == 1) + struct nto_channel_config config = {0}; + + /* + * The number of pulses could be calculated based on the + * number of notifiers bind to the event service, number of packets and + * number of events handled by each block. + * (num_of_pulses = num_of_notifiers * 4 + \ + * (num_packets + 5) * num_of_endpoints) + * If experienced pulse pool shortage issue in normal operation, increase + * the number of pulses. + * If there are no available pulses in the pool, SIGKILL is delivered + * by default. You may configure the sigevent that you want to be + * delivered when a pulse can't be obtained from the pool. + * + * See NvSciEventLoopServiceCreateSafe API Specification for more + * information. + */ + + /* The num_pulses set below is just an example number and should be + * adjusted depending on the use case. 
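+     * For example, with 10 notifiers, 3 packets and 2 endpoints, the
+     * formula above gives 10 * 4 + (3 + 5) * 2 = 56 pulses, so the
+     * 1024 configured here leaves a wide margin.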
+ */ + config.num_pulses = 1024U; + config.rearm_threshold = 0; + osConfig = &config; +#endif + + /* Create event loop service */ + NvSciError err = NvSciEventLoopServiceCreateSafe(1U, osConfig, &service); + if (NvSciError_Success != err) { + printf("Failed (%x) to create event service\n", err); + return 0; + } + + return 1; +} + +/* Register a new block with the event management */ +static int32_t eventServiceRegister( + NvSciStreamBlock blockHandle, + void* blockData, + BlockFunc blockFunc) +{ + /* Sanity check to make sure we left room for enough blocks */ + if (numBlocks >= MAX_BLOCKS) { + printf("Exceeded maximum number of blocks\n"); + return 0; + } + + /* Grab the next entry in the list for the new block and fill it in */ + BlockEventData* entry = &blocks[numBlocks++]; + entry->handle = blockHandle; + entry->data = blockData; + entry->func = blockFunc; + entry->isAlive = true; + entry->retry = false; + + /* Create a notifier for events on this block */ + NvSciError err = + NvSciStreamBlockEventServiceSetup(entry->handle, + &service->EventService, + &entry->notifier); + + if (NvSciError_Success != err ) { + printf("Failed (%x) to create event notifier for block\n", err); + return 0; + } + + return 1; +} + +/* Register a new block with the event management to handle internal event. + * + * It's only supported on IpcSrc/IpcDst blocks now. + * + * Without user-provided event service, each IpcSrc/IpcDst block creates + * an internal event service and spawns a dispatch thread to handle the + * internal I/O messages. + * + * With the user-provided event service, no internal thread will be created. + * The application needs to wait for events on these internal notifiers. + * When there's a new notification on the internal notifiers, it will + * trigger the NvSciStream callback function automatically. + * + * The application can bind the internal notifiers and the external + * notifiers, which is used to monitor the NvSciStreamEvent on the block, + * to the same event service or different ones. In this sample app, we + * bind them to the same event service and use one thread to handle all + * the events. + */ +static int32_t eventServiceInternalRegister( + NvSciStreamBlock blockHandle) +{ + /* Gets notifiers for internal events on this block */ + numIntNotifiers = MAX_INTERNAL_NOTIFIERS; + NvSciError err = + NvSciStreamBlockInternalEventServiceSetup( + blockHandle, + &service->EventService, + &numIntNotifiers, + intNotifiers); + + if (NvSciError_Success != err) { + printf("Failed (%x) to setup internal event service for block\n", err); + return 0; + } + + /* Sanity check to make sure we left room for enough internal notifiers */ + if (numIntNotifiers >= MAX_INTERNAL_NOTIFIERS) { + printf("Exceeded maximum number of internal notifiers\n"); + return 0; + } + + return 1; +} + +/* Main service-based event loop */ +static int32_t eventServiceLoop(void) +{ + int32_t i; + int32_t k; + + /* + * Notes on handling notificiations: + * If more than one signal occurs on a notifier in between calls + * to check for events, then NvSciEvent will squash the notifications, + * so only one is received. This means the application must drain + * all pending events on a block after its notifier signals. It won't + * receive new notifications for those pending events. + * A simple implementation might process each block's events in a loop + * until there are no more, and then move on to the next block. But + * this poses a risk of starvation. 
Consider the case of a stream in
+     * mailbox mode, where the mailbox already has a waiting payload.
+     * If the producer receives a PacketReady event, it will obtain
+     * the packet, fill it with data, and present it to the stream.
+     * Because the mailbox is full, the packet will immediately be
+     * returned, resulting in a new PacketReady event. The application
+     * could go into an infinite loop, generating new payloads on the
+     * producer without giving the consumer a chance to process them.
+     * We therefore use an event loop that only processes one event
+     * per block for each iteration, but keeps track of whether there
+     * was an event on a block on the previous pass, and if so retries
+     * it even if no new signal occurred. The event loop waits for
+     * events only when there was no prior event; otherwise it only
+     * polls for new ones.
+     * For internal notifiers, the handler is registered by NvSciStream
+     * when the notifiers are created and is triggered automatically
+     * when a new event arrives. The application only needs to wait
+     * for these events; it does not handle them itself.
+     */
+
+    /* Pack all notifiers into an array */
+    NvSciEventNotifier* notifiers[MAX_NOTIFIERS];
+
+    /* Initialize loop control parameters */
+    int64_t timeout = 1000000;
+    bool event[MAX_NOTIFIERS];
+    uint32_t numAliveBlocks;
+
+    numAlive = numBlocks;
+
+    /* Main loop - Handle events until all blocks report completion or fail */
+    while (numAlive && !atomic_load(&streamDone)) {
+
+        numNotifiers = 0;
+        numAliveBlocks = 0;
+
+        /* Acquire the lock */
+        pthread_mutex_lock(&mutex);
+        /* Pack the external notifiers for the live blocks, so that
+         * notifiers[i] matches blocksAlive[i] in the loop below */
+        for (i=0; i<numBlocks; ++i) {
+            if (blocks[i].isAlive) {
+                blocksAlive[numAliveBlocks] = &blocks[i];
+                notifiers[numAliveBlocks] = blocks[i].notifier;
+                numAliveBlocks++;
+            }
+        }
+        numNotifiers = (int32_t)numAliveBlocks;
+
+        /* Pack the internal notifiers after the external ones */
+        for (k=0; k<(int32_t)numIntNotifiers; ++k) {
+            notifiers[numNotifiers++] = intNotifiers[k];
+        }
+        /* Release the lock */
+        pthread_mutex_unlock(&mutex);
+
+        /* Wait/poll for events on all notifiers */
+        NvSciError err = service->WaitForMultipleEventsExt(
+                             &service->EventService,
+                             notifiers,
+                             numNotifiers,
+                             timeout,
+                             event);
+
+        if ((NvSciError_Success != err) && (NvSciError_Timeout != err)) {
+            printf("Failure (%x) while waiting/polling event service\n", err);
+            return 0;
+        }
+
+        /* Reset the timeout for the next pass; it is shortened to a
+         * poll below if any block needs a retry */
+        timeout = 1000000;
+
+        /*
+         * Check for events on new blocks that signaled or old blocks that
+         * had an event on the previous pass. This is done in reverse
+         * of the order in which blocks were registered. This is because
+         * producers are created before consumers, and for mailbox mode
+         * we want to give the consumer a chance to use payloads before
+         * the producer replaces them.
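+         * As each handler runs, its return value follows the BlockFunc
+         * convention from event_loop.h: -1 fatal error, 0 no event,
+         * +1 an event was handled (worth a retry), +2 block finished.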
+ */ + for (i=numAliveBlocks-1; ((i>=0) && (!atomic_load(&streamDone))); --i) { + /* Get block info */ + BlockEventData* entry = blocksAlive[i]; + if (entry != NULL) { + if (event[i] || entry->retry) { + + /* Reset to no retry for next pass */ + entry->retry = false; + + /* Skip if this block is no longer in use */ + if (entry->data) { + + /* Call the block's event handler function */ + int32_t rv = entry->func(entry->data, 0); + if (rv < 0) { + /* On failure, no longer check block and app failed */ + success = 0U; + entry->data = NULL; + numAlive--; + } else if (rv == 2) { + /* On completion, no longer check block */ + entry->isAlive = false; + entry->data = NULL; + numAlive--; + } else if (rv == 1) { + /* If event found, retry next loop */ + timeout = 0; + entry->retry = true; + } + } + } + } + } + } + + + /* Delete internal notifiers */ + for (uint32_t j=0; jDelete(intNotifiers[j]); + } + + /* Delete notifiers */ + for (i=0; iDelete(blocks[i].notifier); + } + + /* Delete service */ + service->EventService.Delete(&service->EventService); + + return success; +} + +/* Table of functions for service-based event handling */ +EventFuncs const eventFuncs_Service = { + .init = eventServiceInit, + .reg = eventServiceRegister, + .regInt = eventServiceInternalRegister, + .loop = eventServiceLoop +}; diff --git a/event_sample_app/event_loop_threads.c b/event_sample_app/event_loop_threads.c new file mode 100644 index 0000000..29eb0a2 --- /dev/null +++ b/event_sample_app/event_loop_threads.c @@ -0,0 +1,129 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - thread-based event handling + * + * This file implements the option to handle events for each block in + * a separate thread. Each thread just has a loop that waits for a + * block event to occur and handles it appropriately, until the block + * has performed all required actions or receives notification that + * the stream has disconnected. + * + * In practice, only a few block types (producer, consumer, and pool) + * receive any events that need to be handled. So a more streamlined + * application might choose to only monitor them, assuming that the + * other blocks can be left alone until the time comes to tear them + * down. + * + * Note: We use standard pthread functions here because it allows this + * sample to run on all operating systems. QNX has its own thread + * management functions which might be more efficient when using + * this approach. 
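+ * (Each registered block gets its own thread below; the alternative in
+ * event_loop_service.c multiplexes every block onto a single loop.)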
+ */ + +#include +#include +#include +#include "event_loop.h" + +/* Structure to track block info */ +typedef struct { + NvSciStreamBlock handle; + void* data; + BlockFunc func; + pthread_t thread; +} BlockEventData; + +/* List of blocks */ +#define MAX_BLOCKS 100U +static uint32_t numBlocks = 0U; +static BlockEventData blocks[MAX_BLOCKS]; +static uint32_t success = 1U; + +/* The per-thread loop function for each block */ +static void* eventThreadFunc(void* arg) +{ + /* Simple loop, waiting for events on the block until the block is done */ + BlockEventData* entry = (BlockEventData*)arg; + while (1) { + int32_t rv = entry->func(entry->data, 1); + if (rv < 0) { + success = 0U; + break; + } else if (rv == 2) { + break; + } + } + return NULL; +} + +/* Initialize per-thread event handling */ +static int32_t eventThreadInit(void) +{ + /* No special initialization required for this method */ + return 1; +} + +/* Register a new block with the event management */ +static int32_t eventThreadRegister( + NvSciStreamBlock blockHandle, + void* blockData, + BlockFunc blockFunc) +{ + /* Sanity check to make sure we left room for enough blocks */ + if (numBlocks >= MAX_BLOCKS) { + printf("Exceeded maximum number of blocks\n"); + return 0; + } + + /* Grab the next entry in the list for the new block and fill it in */ + BlockEventData* entry = &blocks[numBlocks++]; + entry->handle = blockHandle; + entry->data = blockData; + entry->func = blockFunc; + + /* Spawn a thread */ + int32_t rv = pthread_create(&entry->thread, + NULL, + eventThreadFunc, + (void*)entry); + if (rv != 0) { + printf("Failed to spawn thread to monitor block\n"); + return 0; + } + + return 1; +} + +/* Main per-thread event loop */ +static int32_t eventThreadLoop(void) +{ + /* + * Each block has its own thread loop. This main function just needs + * to wait for all of them to exit, and then return any error. This + * waiting can be done in any order. 
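+     * A minimal sketch of that wait, assuming each entry keeps the
+     * pthread_t recorded at registration:
+     *     for (uint32_t i = 0; i < numBlocks; ++i)
+     *         (void)pthread_join(blocks[i].thread, NULL);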
+ */ + for (uint32_t i=0; i +#include +#if (QNX == 1) +#include +#endif +#include "nvscisync.h" +#include "nvscibuf.h" +#include "nvsciipc.h" +#include "nvscistream.h" +#include "event_loop.h" +#include "block_info.h" +#include +#include + +/* Base name for all IPC channels */ +static const char ipcBaseName[] = "nvscistream_"; + +/* Event handling function table */ +EventFuncs const* eventFuncs = NULL; + +/* Top level use-case setup function pointers */ +int32_t (*createProducer)( + NvSciStreamBlock* producer, + NvSciStreamBlock pool, + uint32_t numFrames) = createProducer_Usecase1; +int32_t (*createConsumer)( + NvSciStreamBlock* consumer, + NvSciStreamBlock queue, + uint32_t index, + uint32_t frames) = createConsumer_Usecase1; + +int32_t(*createPool)( + NvSciStreamBlock* pool, + uint32_t numPacket, + bool isC2cPool) = createPool_Common; + + +/* NvSci modules */ +NvSciSyncModule sciSyncModule; +NvSciBufModule sciBufModule; + +/* Flag used to terminate the thread that +* was spawned to handle the late/re-attached +* consumer connections upon stream disconnect +*/ +atomic_int streamDone; + +/* Holds the multicast block handle for +* late/re-attach usecase +*/ +static NvSciStreamBlock multicastBlock = 0U; + +/* Dispatch thread for handling late/re-attach +* consumer connections +*/ +pthread_t dispatchThread; + +/* Endpoint status structure*/ +Endpoint ipcEP[MAX_CONSUMERS]; + +/* pthread variables for thread synchronization */ +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; + +#if (QNX == 1) +/* Data needed for QNX channel connection */ +int8_t DELTA = 1; +int8_t ipcCode = (_PULSE_CODE_MINAVAIL + 1); +#endif +/* Common options for all blocks */ +CommonOptions opts; + +/* Options for producer */ +typedef struct { + uint32_t resident; + uint32_t numConsumer; + uint32_t numPacket; + uint32_t numFrames; + uint32_t usecase; +} ProducerOptions; + +/* Options for each consumer */ +typedef struct { + uint32_t resident; + uint32_t useMailbox; + uint32_t useLimiter; + uint32_t c2cMode; + uint32_t c2cSrcUseMailbox; + uint32_t c2cDstNumPacket; + char srcChannel[32]; + char dstChannel[32]; + char ipcChannelForHandshake[32]; + uint32_t numFrames; +} ConsumerOptions; + +ConsumerOptions consOpts[MAX_CONSUMERS]; + +/* Print command line options */ +static void print_usage(const char *str) +{ + printf("%s [options]\n", str); + printf(" For single- or inter-process/chip operation:\n"); + printf(" -m [default 1, max %d]\n", MAX_CONSUMERS); + printf(" number of multicast consumers\n"); + printf(" (ignored if process doesn't own producer\n"); + printf(" -f [default 3]\n"); + printf(" number of packets in main pool\n"); + printf(" (ignored if process doesn't own producer\n"); + printf(" -l [default - not used]\n"); + printf(" use limiter block for indexed consumer\n"); + printf(" (ignored if process doesn't own producer\n"); + printf(" -q {f|m} [default f]\n"); + printf(" use fifo (f) or maibox (m) for indexed consumer\n"); + printf(" (ignored if process doesn't own the indexed consumer\n"); + printf(" -e {s|t} [default s]\n"); + printf(" s : events are handled through a single service\n"); + printf(" t : events are handled with separate per-block threads\n"); + printf(" -E\n"); + printf(" Use the user-provided event service to handle internal I/O " + "messages on ipc blocks.\n"); + printf(" Only Supported with event service\n"); +#if (NV_SUPPORT_NVMEDIA == 1) + printf(" -s {y|r} [default r]\n"); + printf(" y : NvSciColor_Y8U8Y8V8 Image Color Format in use case 
2\n"); + printf(" r : NvSciColor_A8R8G8B8 Image Color Format in use case 2\n"); +#endif + printf(" -u [default 1]\n"); + printf(" use case (must be same for all processes)\n"); + printf(" 1 : CUDA (rt) producer to CUDA (rt) consumer\n"); +#if (NV_SUPPORT_NVMEDIA == 1) + printf(" 2 : NvMedia producer to CUDA (rt) consumer\n"); +#endif +#if (NV_SUPPORT_ASILD == 1) + printf(" 3 : CUDA (rt) producer to CUDA (rt) consumer " + "in ASIL-D process, not supported in C2C.\n"); +#endif + printf(" -i [default - not used]\n"); + printf(" set endpoint info and query info from other endpoints\n"); + printf(" For inter-process operation:\n"); + printf(" -p\n"); + printf(" producer resides in this process\n"); + printf(" -c \n"); + printf(" indexed consumer resides in this process\n"); + printf(" For inter-chip (C2C) operation:\n"); + printf(" -P \n"); + printf(" producer resides in this process\n"); + printf(" Ipc endpoint used by the producer to communicate with the " + "indexed chip-to-chip (C2C) consumer\n"); + printf(" User must provide all the C2C endpoints required to " + "communicate with the total number of consumers for C2C " + "usecase when late-/reattach is chosen.\n"); + printf(" -C \n"); + printf(" indexed consumer resides in this process\n"); + printf(" Ipc endpoint used by this chip-to-chip (C2C) consumer\n"); + printf(" -C and -c can't be used simultaneously.\n"); + printf(" (ignored if process owns producer)\n"); + printf(" -F [default 3]\n"); + printf(" number of packets in pool attached to the IpcDst block " + "of the indexed C2C consumer\n"); + printf(" set along with the indexed C2C consumer.\n"); + printf(" (ignored if process doesn't own indexed C2C consumer)\n"); + printf(" -Q {f|m} [default f]\n"); + printf(" use fifo (f) or maibox (m) for C2C IpcSrc of indexed " + "consumer.\n"); + printf(" Can't specify same index as -c)\n"); + printf(" set in the producer process.\n"); + printf(" (ignored if process doesn't own producer)\n"); + printf(" -r [default 0]\n"); + printf(" Number of late-attach consumers\n"); + printf(" set in the producer process and currently supported " + "for usecase1.\n"); + printf(" -L\n"); + printf(" set in the consumer process to indicate the consumer " + "connection is late/re-attach.\n"); + printf(" -k [default 0]\n"); + printf(" Number of frames expected to be received by the indexed consumer\n"); + printf(" -n [default 32]\n"); + printf(" Number of frames expected to be produced by the producer\n"); + printf(" With -r option is specified, the default value is set to 100000\n"); + printf(" With -r option is NOT specified, the default value is set to 32\n"); +} + +/* Deletes the block that are created for handling late-/re-attach +* connection when late/re-attach consumer connection fails. +*/ +static void deleteBlock(NvSciStreamBlock block) +{ + for (int32_t i=0; i< numBlocks; i++) { + BlockEventData* entry = &blocks[i]; + if (entry->handle == block) { + deleteCommon(entry->data); + } + } +} + +/* Function to handle the opening of IPC endpoint +* for handshaking. 
+*/ +static bool openIpcEndpoint(uint32_t index, bool isConsumer) +{ + NvSciError err; + if (!isConsumer) { + err = NvSciIpcOpenEndpoint(ipcEP[index].ipcChannelForHandshake, + &ipcEP[index].ipcEndpoint); + if (NvSciError_Success != err) { + printf("Failed (%x) to open channel (%s) for IpcSrc\n", + err, ipcEP[index].ipcChannelForHandshake); + return false; + } + } else { + err = NvSciIpcOpenEndpoint(consOpts[index].ipcChannelForHandshake, + &ipcEP[index].ipcEndpoint); + if (NvSciError_Success != err) { + printf("Failed (%x) to open channel (%s) for IpcDst\n", + err, consOpts[index].ipcChannelForHandshake); + return false; + } + } + +#if (QNX == 1) + ipcEP[index].chid = ChannelCreate_r(_NTO_CHF_UNBLOCK | _NTO_CHF_PRIVATE); + if (ipcEP[index].chid < 0) { + printf("ChannelCreate_r: fail for connection index=%d \n", index); + return false; + } + + ipcEP[index].coid = ConnectAttach_r(0, 0, ipcEP[index].chid, + _NTO_SIDE_CHANNEL, _NTO_COF_CLOEXEC); + if (ipcEP[index].coid < 0) { + printf("ConnectAttach_r: fail for connection index=%d\n", index); + return false; + } + err = NvSciIpcSetQnxPulseParamSafe(ipcEP[index].ipcEndpoint, ipcEP[index].coid, + SIGEV_PULSE_PRIO_INHERIT, + ipcCode); + if (err != NvSciError_Success) { + printf("NvSciIpcSetQnxPulseParamSafe(%x) failed for connection index=%d\n", + err, index); + return false; + } +#endif + + err = NvSciIpcResetEndpointSafe(ipcEP[index].ipcEndpoint); + if (err != NvSciError_Success) { + printf("NvSciIpcResetEndpointSafe(%x) failed for connection index=%d\n", + err, index); + return false; + } + + return true; +} + +/* Function to handle the IPC connection for handshaking */ +static bool waitForIpcConnection(uint32_t index, bool isConsumer) +{ + NvSciError err; + bool retry = true; + + while(retry) { + uint32_t receivedEvents = 0U; + err = NvSciIpcGetEventSafe(ipcEP[index].ipcEndpoint, &receivedEvents); + if (NvSciError_Success != err) { + atomic_store(&streamDone, 1); + printf("Failed (%x) to retrieve IPC events for connection index=%d\n", + err, index); + return false; + } + /* No need to retry if it is a producer */ + if(!isConsumer) { + retry = false; + } + + if (receivedEvents & (NV_SCI_IPC_EVENT_CONN_EST_ALL)) { +#if (QNX == 1) + if (ipcEP[index].coid != 0) { + (void)ConnectDetach_r(ipcEP[index].coid); + ipcEP[index].coid = 0; + } + if (ipcEP[index].chid != 0) { + (void)ChannelDestroy_r(ipcEP[index].chid); + ipcEP[index].chid = 0; + } +#endif + err = NvSciIpcCloseEndpointSafe(ipcEP[index].ipcEndpoint, false); + if (NvSciError_Success != err) { + atomic_store(&streamDone, 1); + printf("Failed (%x) to close IPC endpoint for connection index=%d\n", + err, index); + return false; + } + ipcEP[index].ipcEndpoint = 0U; + + /* We need to open the endpoint again if it is not a consumer */ + if (!isConsumer) { + err = NvSciIpcOpenEndpoint(ipcEP[index].ipcChannel, + &ipcEP[index].ipcEndpoint); + if (NvSciError_Success != err) { + atomic_store(&streamDone, 1); + printf("Failed (%x) to open channel (%s) for IpcSrc \ + for connection index=%d\n", + err, ipcEP[index].ipcChannel, index); + return false; + } + err = NvSciIpcResetEndpointSafe(ipcEP[index].ipcEndpoint); + if (NvSciError_Success != err) { + atomic_store(&streamDone, 1); + printf("Failed (%x) to reset IPC endpoint for connection \ + index = %d\n", err, index); + return false; + } + ipcEP[index].ipcConnected = true; + } + return true; + } + } + return false; +} + +/* Dispatch thread function to handle late/re-attach +* consumer connections +*/ +void* handleLateConsumerThreadFunc(void *args) +{ + 
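+    /* Poll the handshake channels until the stream completes, servicing
+     * any late/re-attaching consumer that establishes a connection. */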
NvSciError err; + bool lateConsumerConnectionFound = false; + bool retry = false; + + while(!atomic_load(&streamDone) || retry) { + /* Poll for the status of IPC channels */ + pthread_mutex_lock(&mutex); + for (uint32_t i=0; inumPacket, false)) { + return 0; + } + + /* Create producer */ + NvSciStreamBlock producerBlock; + if (!createProducer(&producerBlock, poolBlock, prodOpts->numFrames)) { + return 0; + } + + /* If multicast required, add the block. */ + if (prodOpts->numConsumer > 1) { + + /* Create multicast block */ + if (!createMulticast(&multicastBlock, prodOpts->numConsumer)) { + return 0; + } + + /* Connect to producer */ + if (NvSciError_Success != + NvSciStreamBlockConnect(producerBlock, multicastBlock)) { + printf("Failed to connect multicast to producer\n"); + return 0; + } + + /* Multicast block is end of chain */ + *producerLink = multicastBlock; + + } else { + + /* Producer block is end of chain */ + *producerLink = producerBlock; + + } + + return 1; +} + +/* Set up chain of consumer side blocks */ +static int32_t setupConsumerChain( + NvSciStreamBlock* consumerLink, + ConsumerOptions* consOpts, + uint32_t index) +{ + /* + * Note: Currently the consumer "chain" just consists of the consumer + * itself and its associated queue. We follow this paradigm to + * allow easy addition of new optional blocks in the future. + */ + + /* Create queue */ + NvSciStreamBlock queueBlock; + if (!createQueue(&queueBlock, consOpts->useMailbox)) { + return 0; + } + + /* Create consumer */ + NvSciStreamBlock consumerBlock; + if (!createConsumer(&consumerBlock, queueBlock, index, consOpts->numFrames)) { + return 0; + } + + /* Consumer block is start of chain */ + *consumerLink = consumerBlock; + + return 1; +} + +/* Add additional branch options */ +static int32_t setupBranchOptions( + NvSciStreamBlock* consumerLink, + ConsumerOptions* consOpts) +{ + /* If limiter requested, add it */ + if (consOpts->useLimiter) { + + /* If a consumer may generate unreliable fences, a ReturnSync block can + * be added as the downstream of the Limiter block for that consumer, + * to isolate any packets with bad fences. 
+ */ + NvSciStreamBlock returnSyncBlock; + if (!createReturnSync(&returnSyncBlock)) { + return 0; + } + + /* Connect to incoming consumer chain */ + if (NvSciError_Success != + NvSciStreamBlockConnect(returnSyncBlock, *consumerLink)) { + printf("Failed to connect returnSyncBlock to consumer chain\n"); + return 0; + } + + + /* ReturnSync is new end of chain */ + *consumerLink = returnSyncBlock; + + /* Create limiter */ + NvSciStreamBlock limiterBlock; + if (!createLimiter(&limiterBlock, consOpts->useLimiter)) { + return 0; + } + + /* Connect to incoming consumer chain */ + if (NvSciError_Success != + NvSciStreamBlockConnect(limiterBlock, *consumerLink)) { + printf("Failed to connect limiter to consumer chain\n"); + return 0; + } + + /* Limiter is new end of chain */ + *consumerLink = limiterBlock; + } + + return 1; +} + +/* Set up IPC from producer to consumer */ +static int32_t setupProdToConsIPC( + NvSciStreamBlock* consumerLink, + ConsumerOptions* consOpts, + bool useExtEventService) +{ + if (!consOpts->c2cMode) { + /* Create IPC block */ + if (!createIpcSrc(consumerLink, + consOpts->srcChannel, + useExtEventService)) { + return 0; + } + } else { + /* Create a queue for C2C src block */ + NvSciStreamBlock queueBlock; + if (!createQueue(&queueBlock, consOpts->c2cSrcUseMailbox)) { + return 0; + } + + /* Create C2C block */ + if (!createC2cSrc(consumerLink, consOpts->srcChannel, queueBlock)) { + return 0; + } + + /* If mailbox is used with C2CSrc, then create presentSync block */ + if (1U == consOpts->c2cSrcUseMailbox) { + NvSciStreamBlock presentSyncBlock; + if (!createPresentSync(&presentSyncBlock)) { + return 0; + } + + if (NvSciError_Success != + NvSciStreamBlockConnect(presentSyncBlock, *consumerLink)) { + printf("Failed to connect PresentSync to consumer chain\n"); + return 0; + } + + /* PresentSync is new end of chain */ + *consumerLink = presentSyncBlock; + } + } + return 1; +} + +/* Set up IPC from consumer to producer */ +static int32_t setupConsToProdIPC( + NvSciStreamBlock* producerLink, + ConsumerOptions* consOpts, + bool useExtEventService) +{ + if (!consOpts->c2cMode) { + /* Create IPC block */ + if (!createIpcDst(producerLink, + consOpts->dstChannel, + useExtEventService)) { + return 0; + } + } else { + /* Create a pool for C2C dst block */ + NvSciStreamBlock poolBlock; + if (!createPool(&poolBlock, consOpts->c2cDstNumPacket, true)) { + return 0; + } + /* Create C2C block */ + if (!createC2cDst(producerLink, consOpts->dstChannel, poolBlock)) { + return 0; + } + } + return 1; +} + +/* + * Main application function. + * As per standards, return of 0 indicates success and anything + * else is failure. 
+ */
+int main(int argc, char *argv[])
+{
+    uint32_t i;
+    int ret = 0;
+
+    /* Initialize parameters */
+    uint32_t badParam = 0U;
+    uint32_t multiProcess = 0U;
+    uint32_t multiSOC = 0U;
+    uint32_t eventOption = 0U;
+
+    ProducerOptions prodOpts = {.resident=0U, .numConsumer=1U,
+                                .numPacket=3U, .numFrames=32, .usecase=1};
+    memset(consOpts, 0, sizeof(consOpts));
+    memset(&opts, 0, sizeof(CommonOptions));
+    memset(ipcEP, 0, sizeof(ipcEP));
+
+    /* Parse command line */
+    int32_t opt;
+    while ((opt = getopt(argc, argv, "m:n:r:f:l:q:k:e:ELs:u:ipc:P:C:F:Q:")) != EOF) {
+        switch (opt) {
+        case 'm': /* set number of consumers */
+            prodOpts.numConsumer = atoi(optarg);
+            opts.numConsumer = prodOpts.numConsumer;
+            if ((prodOpts.numConsumer < 1U) ||
+                (prodOpts.numConsumer > MAX_CONSUMERS)) {
+                badParam = 1U;
+            }
+            for (i = 0; i < MAX_CONSUMERS; i++) {
+                sprintf(ipcEP[i].ipcChannel, "%s%d", ipcBaseName, 2*i+0);
+                sprintf(ipcEP[i].ipcChannelForHandshake,
+                        "%s%d", ipcBaseName, 2*i+8);
+            }
+            break;
+        case 'r': /* set number of late consumers */
+            opts.numLateConsumer = atoi(optarg);
+            opts.lateAttach = true;
+            if (opts.numLateConsumer > MAX_CONSUMERS) {
+                badParam = 1U;
+            }
+            /* there must be at least one early consumer */
+            if ((prodOpts.numConsumer - opts.numLateConsumer) < 1U) {
+                badParam = 1U;
+            }
+            prodOpts.numFrames = 100000;
+            break;
+        case 'f': /* set number of packets */
+            prodOpts.numPacket = atoi(optarg);
+            if ((prodOpts.numPacket < 1U) ||
+                (prodOpts.numPacket > MAX_PACKETS)) {
+                badParam = 1U;
+            }
+            break;
+
+        case 'k': /* use specified number of frames for indexed consumer */
+            i = atoi(optarg);
+            if (i >= MAX_CONSUMERS) {
+                badParam = 1U;
+            } else {
+                consOpts[i].numFrames = atoi(argv[optind++]);
+            }
+            break;
+
+        case 'n': /* use specified number of frames for producer */
+            prodOpts.numFrames = atoi(optarg);
+            break;
+
+        case 'l': /* use limiter block for indexed consumer */
+            i = atoi(optarg);
+            if (i >= MAX_CONSUMERS) {
+                badParam = 1U;
+            } else {
+                consOpts[i].useLimiter = atoi(argv[optind++]);
+            }
+            break;
+        case 'q': /* use specified queue for indexed consumer */
+            i = atoi(optarg);
+            if (i >= MAX_CONSUMERS) {
+                badParam = 1U;
+            } else {
+                char t = argv[optind++][0];
+                if (t == 'm') {
+                    consOpts[i].useMailbox = 1U;
+                } else if (t == 'f') {
+                    consOpts[i].useMailbox = 0U;
+                } else {
+                    badParam = 1U;
+                }
+            }
+            break;
+        case 'e': /* set event handling mechanism */
+            if (optarg[0] == 's') {
+                eventOption = 0U;
+            } else if (optarg[0] == 't') {
+                eventOption = 1U;
+            } else {
+                badParam = 1U;
+            }
+            break;
+        case 'E': /* set user-provided event service to handle IPC events */
+            opts.useExtEventService = true;
+            break;
+
+        case 'L': /* indicate late/re-attach of a consumer connection */
+            opts.lateAttach = true;
+            break;
+
+        case 's': /* set Image Color Format type */
+            if (optarg[0] == 'r') {
+                opts.yuv = false;
+            } else if (optarg[0] == 'y') {
+                opts.yuv = true;
+            } else {
+                badParam = 1U;
+            }
+            break;
+        case 'u': /* set use case */
+            i = atoi(optarg);
+            prodOpts.usecase = i;
+            if (i == 1) {
+                createProducer = createProducer_Usecase1;
+                createConsumer = createConsumer_Usecase1;
+                createPool = createPool_Common;
+            }
+#if (NV_SUPPORT_NVMEDIA == 1)
+            else if (i == 2) {
+                createProducer = createProducer_Usecase2;
+                createConsumer = createConsumer_Usecase2;
+                createPool = createPool_Common;
+            }
+#endif
+#if (NV_SUPPORT_ASILD == 1)
+            else if (i == 3) {
+                createProducer = createProducer_Usecase3;
+                createConsumer = createConsumer_Usecase3;
+                createPool = createPool_Usecase3;
+            }
+#endif
+            else {
+ badParam = 1U; + } + break; + case 'i': + opts.endInfo = true; + break; + + /* For inter - process operation */ + + case 'p': /* set producer resident */ + prodOpts.resident = 1U; + multiProcess = 1U; + break; + case 'c': /* set consumer resident */ + i = atoi(optarg); + if (i >= MAX_CONSUMERS) { + badParam = 1U; + } else { + consOpts[i].resident = 1U; + multiProcess = 1U; + } + break; + + /* For inter - chip (C2C) operation */ + + case 'P': /* set ipc endpoint for C2C */ + i = atoi(optarg); + if (i >= MAX_CONSUMERS) { + badParam = 1U; + } else { + /* Ipc channel used to communicate with this C2C consumer */ + strcpy(consOpts[i].srcChannel, argv[optind++]); + strcpy(ipcEP[i].c2cChannel, consOpts[i].srcChannel); + consOpts[i].c2cMode = 1U; + prodOpts.resident = 1U; + multiProcess = 1U; + multiSOC = 1U; + opts.c2cMode = true; + } + break; + case 'C': /* set C2C mode */ + i = atoi(optarg); + if (i >= MAX_CONSUMERS) { + badParam = 1U; + } else { + /* Ipc channel name used by this C2C consumer */ + strcpy(consOpts[i].dstChannel, argv[optind++]); + consOpts[i].c2cMode = 1U; + multiProcess = 1U; + multiSOC = 1U; + if (consOpts[i].c2cDstNumPacket == 0U) { + /* default packet size 3 if not set already */ + consOpts[i].c2cDstNumPacket = 3U; + } + } + break; + case 'F': /* set number of packets for C2C Dst of indexed consumer */ + i = atoi(optarg); + if (i >= MAX_CONSUMERS) { + badParam = 1U; + } else { + consOpts[i].c2cDstNumPacket = atoi(argv[optind++]); + if ((consOpts[i].c2cDstNumPacket < 1U) || + (consOpts[i].c2cDstNumPacket > MAX_PACKETS)) { + badParam = 1U; + } + } + break; + case 'Q': /* use specified queue for C2C Src of indexed consumer */ + i = atoi(optarg); + if (i >= MAX_CONSUMERS) { + badParam = 1U; + } else { + char t = argv[optind++][0]; + if (t == 'm') { + consOpts[i].c2cSrcUseMailbox = 1U; + } else if (t == 'f') { + consOpts[i].c2cSrcUseMailbox = 0U; + } else { + badParam = 1U; + } + } + break; + default: + badParam = 1U; + break; + } + } + + /* Handle parsing failure */ + if (badParam) { + print_usage(argv[0]); + return 1; + } + + /* Initialize streamDone */ + atomic_init(&streamDone, 0); + + if (opts.useExtEventService && (opts.numLateConsumer > 0U)) { + /* Using external event service for late/re-attach usecase + * is not supported for now. + */ + return 1; + } + + if (opts.useExtEventService && (eventOption == 1U)) { + /* Using external event service for internal ipc I/O messages + * not supported with threading model in this sample app */ + return 1; + } + + + if ((prodOpts.usecase > 1U) && (opts.numLateConsumer > 0U)) { + /* late/re-attach usecase is not supported except for usecase1 + in this sample app. */ + return 1; + } + + /* Check validity of the combination C2C & non-C2C consumers */ + for (i=0U; iinit() != 0) { + return 1; + } + + /* + * Initialize NvSci libraries + */ + if (NvSciError_Success != NvSciSyncModuleOpen(&sciSyncModule)) { + printf("Unable to open NvSciSync module\n"); + } + if (NvSciError_Success != NvSciBufModuleOpen(&sciBufModule)) { + printf("Unable to open NvSciBuf module\n"); + } + if (NvSciError_Success != NvSciIpcInit()) { + printf("Unable to initialize NvSciIpc\n"); + } + + /* + * If producer is resident, create producer block chain and attach + * all consumers. 
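+     * (Late/re-attached consumers are connected afterwards by the
+     * dispatch thread once the initial stream is running.)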
+ */ + if (prodOpts.resident) { + + /* Set up producer chain (up through any multicast block) */ + NvSciStreamBlock producerLink; + if (!setupProducerChain(&producerLink, &prodOpts)) { + return 1; + } + + /* + * For each consumer, either set up the consumer chain or create + * the IPC block to communicate with it, depending on whether the + * consumer is resident. + */ + for (i=0U; i<(prodOpts.numConsumer - opts.numLateConsumer); ++i) { + + /* Create consumer or IPC to consumer */ + NvSciStreamBlock consumerLink; + if (consOpts[i].resident) { + if (!setupConsumerChain(&consumerLink, &consOpts[i], i)) { + return 1; + } + } else { + if (!setupProdToConsIPC(&consumerLink, + &consOpts[i], + opts.useExtEventService)) { + return 1; + } + } + /* When Late-/reattach usecase is selected, a returnSync block + * is connected to the consumer chain to ensure proper fence waits + * during consumer disconnect and reconnect. + */ + if (opts.numLateConsumer > 0U) { + NvSciStreamBlock returnSyncBlock; + if (!createReturnSync(&returnSyncBlock)) { + return 1; + } + + /* Connect to incoming consumer chain */ + if (NvSciError_Success != + NvSciStreamBlockConnect(returnSyncBlock, consumerLink)) { + printf("Failed to connect returnSyncBlock to consumer chain\n"); + return 1; + } + consumerLink = returnSyncBlock; + } + + /* Add any other options (e.g. limiter) for this branch */ + if (!setupBranchOptions(&consumerLink, &consOpts[i])) { + return 1; + } + + /* Attach to producer chain */ + if (NvSciError_Success != + NvSciStreamBlockConnect(producerLink, consumerLink)) { + printf("Failed to connect consumer %d to producer\n", i); + return 1; + } + + } + if (opts.numLateConsumer > 0U) { + NvSciError err; + err = NvSciStreamBlockSetupStatusSet(multicastBlock, + NvSciStreamSetup_Connect, + true); + if (err != NvSciError_Success) { + printf("Attaching a late consumer connection failed=%x\n", err); + return 1; + } + } + } + + /* + * Otherwise, create any consumer chains resident in this process, + * and connect with IPC back to the producer process. 
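+     * (Each resident consumer chain is built with setupConsumerChain() and
+     * linked back to the producer process through setupConsToProdIPC().)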
+     */
+    else {
+
+        for (i = 0U; i < MAX_CONSUMERS; ++i) {
+            if (consOpts[i].resident) {
+
+                /* Set up consumer chain */
+                NvSciStreamBlock consumerLink;
+                if (!setupConsumerChain(&consumerLink, &consOpts[i], i)) {
+                    return 1;
+                }
+
+                /* Set up IPC back to the producer process */
+                NvSciStreamBlock producerLink;
+                if (!setupConsToProdIPC(&producerLink,
+                                        &consOpts[i],
+                                        opts.useExtEventService)) {
+                    return 1;
+                }
+
+                /* Connect the consumer chain to the producer process */
+                if (NvSciError_Success !=
+                    NvSciStreamBlockConnect(producerLink, consumerLink)) {
+                    printf("Failed to connect consumer %d to producer\n", i);
+                    return 1;
+                }
+            }
+        }
+    }
+
+    /* Spawn the dispatch thread to handle late/re-attach consumers */
+    if (opts.numLateConsumer > 0U) {
+        if (0 != pthread_create(&dispatchThread, NULL,
+                                handleLateConsumerThreadFunc, NULL)) {
+            printf("Failed to spawn dispatch thread\n");
+            return 1;
+        }
+    }
+
+    /* Run the event loop */
+    if (!eventFuncs->loop()) {
+        ret = 1;
+    }
+
+    /* Wake up the dispatch thread to terminate upon
+     * stream disconnect
+     */
+    atomic_store(&streamDone, 1);
+    pthread_cond_signal(&cond);
+
+    /* Wait for dispatch thread to terminate */
+    if (opts.numLateConsumer > 0U) {
+        (void)pthread_join(dispatchThread, NULL);
+    }
+
+    if (sciBufModule != NULL) {
+        NvSciBufModuleClose(sciBufModule);
+        sciBufModule = NULL;
+    }
+
+    if (sciSyncModule != NULL) {
+        NvSciSyncModuleClose(sciSyncModule);
+        sciSyncModule = NULL;
+    }
+
+    /* Close the NvSciIpc endpoints */
+    for (uint32_t i = 0U; i < MAX_CONSUMERS; i++) {
+#if (QNX == 1)
+        if (ipcEP[i].coid != 0) {
+            (void)ConnectDetach_r(ipcEP[i].coid);
+            ipcEP[i].coid = 0;
+        }
+        if (ipcEP[i].chid != 0) {
+            (void)ChannelDestroy_r(ipcEP[i].chid);
+            ipcEP[i].chid = 0;
+        }
+#endif
+        if (ipcEP[i].ipcEndpoint) {
+            if (NvSciError_Success !=
+                NvSciIpcCloseEndpointSafe(ipcEP[i].ipcEndpoint, false)) {
+                printf("Failed to close ipc endpoint\n");
+            }
+            ipcEP[i].ipcEndpoint = 0U;
+        }
+        if (ipcEP[i].c2cEndpoint) {
+            if (NvSciError_Success !=
+                NvSciIpcCloseEndpointSafe(ipcEP[i].c2cEndpoint, false)) {
+                printf("Failed to close c2c endpoint\n");
+            }
+            ipcEP[i].c2cEndpoint = 0U;
+        }
+    }
+
+    /* Free the thread synchronization resources */
+    pthread_mutex_destroy(&mutex);
+    pthread_cond_destroy(&cond);
+
+    NvSciIpcDeinit();
+
+    return ret;
+}
diff --git a/event_sample_app/usecase1.h b/event_sample_app/usecase1.h
new file mode 100644
index 0000000..41a1be0
--- /dev/null
+++ b/event_sample_app/usecase1.h
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+/*
+ * NvSciStream Event Loop Driven Sample App - usecase #1
+ *
+ * This use case consists of CUDA producer and CUDA consumer(s).
+ * It makes use of the CUDA runtime toolkit.
+ *
+ * There are two packet elements, a large data buffer and a small
+ * buffer containing a CRC checksum value.
+ *
+ * The producer operation is very simple. It takes a local source buffer
+ * filled with simple data, and issues a CUDA command to asynchronously
+ * copy it to the packet's data buffer. It generates a checksum from the
+ * source buffer and puts that in the packet's CRC buffer.
+ *
+ * The consumer(s) similarly issues a CUDA command to copy the packet's
+ * data buffer to a local buffer. When finished, it generates a checksum
+ * from the local copy and compares it to the value in the packet's
+ * CRC buffer.
+ *
+ * The data buffer is processed through the CUDA engine, with commands issued
+ * asynchronously. Sync objects must be used to coordinate when it is safe
+ * to write and read the buffer. The CRC buffer is written and read directly
+ * through the CPU. It uses immediate mode and it is not necessary to wait
+ * for the sync objects before accessing it.
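+ *
+ * (For example, the producer runs generateCRC() over its source buffer
+ * before issuing the copy, and the consumer recomputes the checksum from
+ * its local copy after the CUDA copy completes and compares the two.)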
+ * + * In addition to the normal case where producers signal sync objects that + * consumers wait for, and vice versa, this use case also needs the + * producer to be able to wait for the fences it generates, in order + * to protect its local buffer from modification while still in use. + * So this use case also provides an example of CPU waiting for fences. + */ + +#ifndef _USECASE1_H +#define _USECASE1_H 1 + +/* Names for the packet elements */ +#define ELEMENT_NAME_DATA 0xdada +#define ELEMENT_NAME_CRC 0xcc + +/* Names for the endpoint info */ +#define ENDINFO_NAME_PROC 0xabcd + +#endif // _USECASE1_H diff --git a/event_sample_app/util.c b/event_sample_app/util.c new file mode 100644 index 0000000..39cc268 --- /dev/null +++ b/event_sample_app/util.c @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +/* + * NvSciStream Event Loop Driven Sample App - utility functions + */ + +#include "util.h" + +#define CRC32_POLYNOMIAL 0xEDB88320L + +static uint32_t calculateBufferCRC( + uint32_t count, + uint32_t crc, + uint8_t* buffer) +{ + static uint32_t crcTable[256]; + static int initialized = 0; + uint32_t i, j, tmp; + + if (!initialized) { + for (i = 0; i <= 255; i++) { + tmp = i; + for (j = 8; j > 0; j--) { + if (tmp & 1) { + tmp = (tmp >> 1) ^ CRC32_POLYNOMIAL; + } else { + tmp >>= 1; + } + } + crcTable[i] = tmp; + } + initialized = 1; + } + + while (count-- != 0) { + tmp = (crc >> 8) & 0x00FFFFFFL; + crc = tmp ^ crcTable[((uint32_t) crc ^ *buffer++) & 0xFF]; + } + + return crc; +} + +uint32_t generateCRC( + uint8_t *data_ptr, + uint32_t height, + uint32_t width, + uint32_t pitch) +{ + uint32_t y = 0U; + uint32_t crc = 0U; + for (y = 0U; y < height; y++) { + crc = calculateBufferCRC(width, crc, data_ptr); + data_ptr += pitch; + } + return crc; +} diff --git a/event_sample_app/util.h b/event_sample_app/util.h new file mode 100644 index 0000000..11243a8 --- /dev/null +++ b/event_sample_app/util.h @@ -0,0 +1,33 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */
+
+/*
+ * NvSciStream Event Loop Driven Sample App - utilities
+ */
+
+#ifndef _UTIL_H
+#define _UTIL_H 1
+
+#include <stdint.h>
+
+#if defined(__x86_64__)
+#define cuDeviceGetUuid cuDeviceGetUuid_v2
+#endif
+
+/* CRC checksum generator */
+extern uint32_t generateCRC(
+    uint8_t *data_ptr,
+    uint32_t height,
+    uint32_t width,
+    uint32_t pitch);
+
+#endif // _UTIL_H
diff --git a/push_info.txt b/push_info.txt
new file mode 100644
index 0000000..dbe8a31
--- /dev/null
+++ b/push_info.txt
@@ -0,0 +1 @@
+jetson_38.2.1
diff --git a/rawstream/Makefile b/rawstream/Makefile
new file mode 100644
index 0000000..050bed0
--- /dev/null
+++ b/rawstream/Makefile
@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Location of common libraries
+LIB_DIR = /usr/lib/aarch64-linux-gnu
+# NOTE: This directory PATH will be moved from "tegra" to "nvidia".
+TEGRA_LIB_DIR ?= /usr/lib/aarch64-linux-gnu/tegra
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+# Location of NVSCI headers
+NVSCI_HEADER_DIR ?= /usr/include/nvsci_headers
+NVSCI_LIB_DIR = $(TEGRA_LIB_DIR)
+
+GCC ?= g++
+NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(GCC)
+
+# Internal flags
+NVCCFLAGS :=
+CCFLAGS :=
+LDFLAGS :=
+
+# Extra user flags
+EXTRA_NVCCFLAGS ?=
+EXTRA_LDFLAGS ?=
+EXTRA_CCFLAGS ?=
+
+override abi := aarch64
+LDFLAGS += --dynamic-linker=/lib/ld-linux-aarch64.so.1
+
+# Debug build flags
+dbg = 0
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      TARGET := debug
+else
+      TARGET := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+################################################################################
+
+# Common includes and paths
+INCLUDES := -I./
+LIBRARIES := -L$(LIB_DIR)
+
+CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
+CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+
+CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+ifeq ("$(CUDALIB)","")
+  $(error ERROR - libcuda.so not found, CUDA Driver is not installed or CUDA_PATH is not correctly set.)
+else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda -lrt +endif + +# Includes and paths for NVSCI libraries +NVSCIBUFHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvscibuf.h -print 2>/dev/null) +NVSCISYNCHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvscisync.h -print 2>/dev/null) +NVSCIIPCHEADER := $(shell find -L $(NVSCI_HEADER_DIR) -name nvsciipc.h -print 2>/dev/null) + +ifeq ("$(NVSCIBUFHEADER)","") + $(error ERROR - nvscibuf.h not found in $(NVSCI_HEADER_DIR)) +endif +ifeq ("$(NVSCISYNCHEADER)","") + $(error ERROR - nvscisync.h not found in $(NVSCI_HEADER_DIR)) +endif +ifeq ("$(NVSCIIPCHEADER)","") + $(error ERROR - nvsciipc.h not found in $(NVSCI_HEADER_DIR)) +endif + +INCLUDES += -I$(NVSCI_HEADER_DIR) +LIBRARIES += -L$(NVSCI_LIB_DIR) -lnvscibuf -lnvscisync -lnvscievent -lnvsciipc -lnvscistream + +ALL_CCFLAGS += --std=c++11 --threads 0 + +################################################################################ +# Target rules +OUTPUT := rawstream + +all: build + +build: $(OUTPUT) + +OBJ := rawstream_consumer.o +OBJ += rawstream_cuda.o +OBJ += rawstream_ipc_linux.o +OBJ += rawstream_main.o +OBJ += rawstream_producer.o + +%.o: %.c + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) -o $@ -c $< + +$(OUTPUT): $(OBJ) + $(NVCC) $(ALL_LDFLAGS) -o $@ $+ $(LIBRARIES) + +run: build + $(OUTPUT) + +testrun: build + +clean: + rm -f $(OBJ) $(OUTPUT) + +clobber: clean diff --git a/rawstream/README.txt b/rawstream/README.txt new file mode 100644 index 0000000..b0b31b9 --- /dev/null +++ b/rawstream/README.txt @@ -0,0 +1,41 @@ +Rawstream Sample App - README + +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + +NVIDIA Corporation and its licensors retain all intellectual property and +proprietary rights in and to this software, related documentation and any +modifications thereto. Any use, reproduction, disclosure or distribution +of this software and related documentation without an express license +agreement from NVIDIA Corporation is strictly prohibited. + +--- +# rawstream - NvStreams Rawstream Sample App + +## Description + +This directory contains a raw stream sample application using NvSciBuf, +NvSciSync and NvSciIpc. + + +## Build the application + +The rawstream sample includes source code and a Makefile. +1. On the host system, navigate to the sample application directory: + + $ cd /samples/nvsci/rawstream/ + +2. Build the sample application: + + $ make clean + $ make + + +## Examples of how to run the sample application: + + $ sudo ./rawstream -p & + $ sudo ./rawstream -c + +## Examples of how to run the sample application for late attach: + + $ sudo ./rawstream -p -l & + $ sudo ./rawstream -c -l diff --git a/rawstream/rawstream.h b/rawstream/rawstream.h new file mode 100644 index 0000000..e55554a --- /dev/null +++ b/rawstream/rawstream.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020-2025 NVIDIA Corporation. All Rights Reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. Any + * use, reproduction, disclosure or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation + * is strictly prohibited. 
+ */
+
+#ifndef _rawstream_h
+#define _rawstream_h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <assert.h>
+#include <sys/select.h>
+#include <nvscibuf.h>
+#include <nvscisync.h>
+#include <nvsciipc.h>
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+
+// Constants controlling configuration
+#define totalFrames  32U
+#define totalBuffers  4U
+
+#if defined(__x86_64__)
+#define cuDeviceGetUuid cuDeviceGetUuid_v2
+#endif
+
+// Sync/Buf modules
+extern NvSciSyncModule syncModule;
+extern NvSciBufModule  bufModule;
+
+// Exchanged sync info
+extern NvSciSyncAttrList producerSignalAttrs;
+extern NvSciSyncAttrList consumerSignalAttrs;
+extern NvSciSyncAttrList producerWaitAttrs;
+extern NvSciSyncAttrList consumerWaitAttrs;
+extern NvSciSyncAttrList prodToConsAttrs;
+extern NvSciSyncAttrList consToProdAttrs;
+extern NvSciSyncObj consumerSignalObj;
+extern NvSciSyncObj producerSignalObj;
+extern NvSciSyncObj consumerWaitObj;
+extern NvSciSyncObj producerWaitObj;
+
+// Exchanged buf info
+extern NvSciBufAttrList producerWriteAttrs;
+extern NvSciBufAttrList consumerReadAttrs;
+extern NvSciBufAttrList combinedBufAttrs;
+
+// CUDA info common to producer and consumer
+typedef struct {
+    int deviceId;
+    CUuuid uuid;
+    cudaStream_t stream;
+    cudaExternalSemaphore_t signalerSem;
+    cudaExternalSemaphore_t waiterSem;
+    NvSciBufType bufType;
+    uint64_t bufSize;
+    uint8_t* bufCopy;
+} CudaClientInfo;
+
+// List of buffers with status
+typedef struct {
+    // Buffer handle
+    NvSciBufObj obj;
+    // CUDA external memory object
+    cudaExternalMemory_t extMem;
+    // Mapping into virtual memory
+    uint8_t* ptr;
+    // Current owner (0 = producer, 1 = consumer)
+    uint32_t owner;
+    // Fence to wait for
+    NvSciSyncFence fence;
+    // Checksum for error checking
+    uint32_t crc;
+} Buffer;
+extern Buffer buffers[totalBuffers];
+
+// Packet data
+// Note: The checksum is not, in general, needed in a real streaming
+//       application. All that is required is something to identify
+//       the buffer and provide the fences. See comments in the producer
+//       and consumer for the reason for the checksum.
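+// (bufferId identifies which Buffer entry changed hands; fenceDesc is the
+// IPC-exportable form of the NvSciSync fence guarding that buffer.)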
+typedef struct { + // buffer identifier + uint32_t bufferId; + // buffer checksum + uint32_t crc; + // Fence to wait for + NvSciSyncFenceIpcExportDescriptor fenceDesc; +} Packet; + +// IPC related info +typedef struct { + // NvSciIpc handle + NvSciIpcEndpoint endpoint; + // IPC channel info + struct NvSciIpcEndpointInfo info; + + // QNX: Channel id to get event + int32_t chId; + // QNX: Connection id to send event in library + int32_t connId; + // Linux: IPC event fd + int32_t ipcEventFd; +} IpcWrapper; +extern IpcWrapper ipcWrapper; + +// CUDA data types +typedef struct cudaExternalSemaphoreHandleDesc cudaExternalSemaphoreHandleDesc; +typedef struct cudaExternalMemoryHandleDesc cudaExternalMemoryHandleDesc; +typedef struct cudaExternalMemoryBufferDesc cudaExternalMemoryBufferDesc; +typedef struct cudaExternalSemaphoreWaitParams cudaExternalSemaphoreWaitParams; +typedef struct cudaExternalSemaphoreSignalParams cudaExternalSemaphoreSignalParams; + +// Utility functions +extern uint32_t GenerateCRC(uint8_t* data, uint32_t width, uint32_t height, uint32_t pitch); + +// Thread functions +extern void* producerFunc(void*); +extern void* consumerFunc(void*); + +// IPC functions +extern NvSciError ipcInit(const char* endpointName, IpcWrapper* ipcWrapper); +extern NvSciError ipcSend(IpcWrapper* ipcWrapper, const void* buf, const size_t size); +extern NvSciError ipcRecvFill(IpcWrapper* ipcWrapper, void* buf, const size_t size); +extern void ipcDeinit(IpcWrapper* ipcWrapper); + +// CUDA-specific operations +extern bool setupCuda(CudaClientInfo* info); +extern bool setupCudaSync(CudaClientInfo* info, + NvSciSyncObj sciSignalObj, + NvSciSyncObj sciWaitObj); +extern bool setupCudaBufAttr(CudaClientInfo* info, + NvSciBufAttrList attrs); +extern bool setupCudaBuffer(CudaClientInfo* info, + Buffer* buf); +extern bool waitCudaFence(CudaClientInfo* info, + Buffer* buf); +extern bool signalCudaFence(CudaClientInfo* info, + Buffer* buf); +extern void deinitCuda(CudaClientInfo* info); +extern void deinitCudaBuffer(Buffer* buf, int num); + +#endif // _rawstream_h diff --git a/rawstream/rawstream_consumer.c b/rawstream/rawstream_consumer.c new file mode 100644 index 0000000..e77a40d --- /dev/null +++ b/rawstream/rawstream_consumer.c @@ -0,0 +1,670 @@ +/* + * Copyright (c) 2020-2024 NVIDIA Corporation. All Rights Reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. Any + * use, reproduction, disclosure or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation + * is strictly prohibited. + */ + +#include "rawstream.h" + +void* consumerFunc(void* arg) +{ + CudaClientInfo cudaInfo; + NvSciError sciErr; + int cudaErr; + void* recvWaitListDesc = NULL; + void* recvObjAndListDesc = NULL; + void* recvBufListDesc = NULL; + + *(int*)arg = 1; + fprintf(stderr, "Consumer starting\n"); + + // Do common cuda initialization + if (!setupCuda(&cudaInfo)) { + goto done; + } + + // Create an empty sync attribute list for signaling permissions. 
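+    // (This starts the sync attribute exchange: each side lets CUDA fill
+    // its signal/wait attribute lists, swaps unreconciled wait attrs over
+    // IPC, reconciles, then allocates and exports its signaling object.)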
+ sciErr = NvSciSyncAttrListCreate(syncModule, &consumerSignalAttrs); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to create consumer signal attrs (%x)\n", + sciErr); + goto done; + } + + // Query CUDA for attributes needed to signal syncs + cudaErr = cudaDeviceGetNvSciSyncAttributes(consumerSignalAttrs, + cudaInfo.deviceId, + cudaNvSciSyncAttrSignal); + if (cudaSuccess != cudaErr) { + fprintf(stderr, + "Could not query signal attributes from CUDA (%d)\n", + cudaErr); + goto done; + } + + fprintf(stderr, "Consumer signal attributes established\n"); + + // Create an empty sync attribute list for waiting permissions. + sciErr = NvSciSyncAttrListCreate(syncModule, &consumerWaitAttrs); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to create consumer wait attrs (%x)\n", + sciErr); + goto done; + } + + // Query CUDA for attributes needed to wait for syncs + cudaErr = cudaDeviceGetNvSciSyncAttributes(consumerWaitAttrs, + cudaInfo.deviceId, + cudaNvSciSyncAttrWait); + if (cudaSuccess != cudaErr) { + fprintf(stderr, + "Could not query wait attributes from CUDA (%d)\n", + cudaErr); + goto done; + } + + fprintf(stderr, "Consumer wait attributes established\n"); + + // Export consumer's wait attributes to a form suitable for IPC + size_t sendWaitAttrListSize = 0U; + void* sendWaitListDesc = NULL; + sciErr = NvSciSyncAttrListIpcExportUnreconciled(&consumerWaitAttrs, + 1, + ipcWrapper.endpoint, + &sendWaitListDesc, + &sendWaitAttrListSize); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to export consumer wait attrs (%x)\n", + sciErr); + goto done; + } + + // Send the size of the consumer's wait attributes to the producer, + // so it knows how much data to expect + sciErr = ipcSend(&ipcWrapper, + &sendWaitAttrListSize, + sizeof(sendWaitAttrListSize)); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to send consumer wait attrs size (%x)\n", + sciErr); + goto done; + } + + // Send the exported form of the consumer's wait attributes + sciErr = ipcSend(&ipcWrapper, + sendWaitListDesc, + sendWaitAttrListSize); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Unable to send consumer wait attrs (%x)\n", sciErr); + goto done; + } + + // Wait to receive the size of the producer's wait attributes + size_t recvWaitAttrListSize = 0U; + sciErr = ipcRecvFill(&ipcWrapper, + &recvWaitAttrListSize, + sizeof(recvWaitAttrListSize)); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to recv producer wait attr size (%x)\n", + sciErr); + goto done; + } + + // Allocate a buffer big enough to receive the producer's wait attributes + recvWaitListDesc = malloc(recvWaitAttrListSize); + if (recvWaitListDesc == NULL) { + sciErr = NvSciError_InsufficientMemory; + fprintf(stderr, + "Sync attr allocation failed (%x)\n", + sciErr); + goto done; + } + + // Wait to receive producer's wait attributes + sciErr = ipcRecvFill(&ipcWrapper, + recvWaitListDesc, + recvWaitAttrListSize); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to recv producer wait attrs (%x)\n", + sciErr); + goto done; + } + + // Convert the received producer wait attributes to an attribute list + sciErr = NvSciSyncAttrListIpcImportUnreconciled(syncModule, + ipcWrapper.endpoint, + recvWaitListDesc, + recvWaitAttrListSize, + &producerWaitAttrs); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to import producer wait attrs (%x)\n", + sciErr); + goto done; + } + + // Get combined attributes for consumer to producer signaling + 
NvSciSyncAttrList syncAllAttrs[2], syncConflictAttrs;
+    syncAllAttrs[0] = consumerSignalAttrs;
+    syncAllAttrs[1] = producerWaitAttrs;
+    sciErr = NvSciSyncAttrListReconcile(syncAllAttrs, 2,
+                                        &consToProdAttrs, &syncConflictAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't merge consumer->producer attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate consumer to producer sync object
+    sciErr = NvSciSyncObjAlloc(consToProdAttrs, &consumerSignalObj);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't allocate consumer->producer sync (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Export sync attributes and object to a form suitable for IPC
+    void* sendObjAndListDesc = NULL;
+    size_t sendObjAndListSize = 0U;
+    sciErr = NvSciSyncIpcExportAttrListAndObj(consumerSignalObj,
+                                              NvSciSyncAccessPerm_WaitOnly,
+                                              ipcWrapper.endpoint,
+                                              &sendObjAndListDesc,
+                                              &sendObjAndListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't export consumer->producer sync description (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the size of the sync description to the producer,
+    // so it knows how much data to expect
+    sciErr = ipcSend(&ipcWrapper, &sendObjAndListSize, sizeof(size_t));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't send consumer->producer sync description size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the sync description to the producer
+    sciErr = ipcSend(&ipcWrapper, sendObjAndListDesc, sendObjAndListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't send consumer->producer sync description (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Wait to receive the size of the producer->consumer sync description
+    size_t recvObjAndListSize = 0U;
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         &recvObjAndListSize,
+                         sizeof(size_t));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't recv producer->consumer sync description size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate a buffer big enough to receive the description
+    recvObjAndListDesc = malloc(recvObjAndListSize);
+    if (NULL == recvObjAndListDesc) {
+        sciErr = NvSciError_InsufficientMemory;
+        fprintf(stderr, "Sync description allocation failed (%x)\n", sciErr);
+        goto done;
+    }
+
+    // Wait to receive producer->consumer sync description
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         recvObjAndListDesc,
+                         recvObjAndListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't receive producer->consumer sync description (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Convert the received producer->consumer sync description to a
+    // sync attribute list and object
+    sciErr = NvSciSyncIpcImportAttrListAndObj(syncModule,
+                                              ipcWrapper.endpoint,
+                                              recvObjAndListDesc,
+                                              recvObjAndListSize,
+                                              &consumerWaitAttrs,
+                                              1,
+                                              NvSciSyncAccessPerm_WaitOnly,
+                                              ipcWrapper.endpoint,
+                                              &consumerWaitObj);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't import producer->consumer sync (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Validate imported reconciled attribute list and object
+    {
+        NvSciSyncAttrList consumerWaitList;
+
+        sciErr = NvSciSyncAttrListValidateReconciledAgainstAttrs(
+                     consToProdAttrs,
+                     NULL,
+                     0,
+                     NvSciSyncAccessPerm_SignalOnly);
+        if (NvSciError_Success != sciErr) {
+            fprintf(
+                stderr,
+                "Validation of consToProd list failed: %x\n", sciErr);
+            goto done;
+        }
+
+        sciErr = NvSciSyncObjGetAttrList(consumerWaitObj,
+                                         &consumerWaitList);
+        if (NvSciError_Success != sciErr) {
+            fprintf(stderr,
+                    "Can't get the reconciled list from consumer wait object (%x)\n",
+                    sciErr);
+            goto done;
+        }
+
+        sciErr = NvSciSyncAttrListValidateReconciledAgainstAttrs(
+                     consumerWaitList,
+                     NULL,
+                     0,
+                     NvSciSyncAccessPerm_WaitOnly);
+        if (NvSciError_Success != sciErr) {
+            fprintf(
+                stderr,
+                "Validation of imported reconciled consumer wait list failed: %x\n",
+                sciErr);
+            goto done;
+        }
+
+        sciErr = NvSciSyncObjValidate(consumerWaitObj);
+        if (NvSciError_Success != sciErr) {
+            fprintf(
+                stderr,
+                "Validation of imported consumer wait object failed: %x\n",
+                sciErr);
+            goto done;
+        }
+    }
+
+    // Set up CUDA sync objects, importing NvSciSync objects
+    if (!setupCudaSync(&cudaInfo, consumerSignalObj, consumerWaitObj)) {
+        goto done;
+    }
+
+    fprintf(stderr, "Consumer exchanged sync objects with producer\n");
+
+    // Create an empty buffer attribute list for consumer buffers
+    sciErr = NvSciBufAttrListCreate(bufModule, &consumerReadAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to create consumer buffer attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Fill consumer buffer attribute list with values
+    NvSciBufAttrKeyValuePair bufKeyValue[4];
+    NvSciRmGpuId gpuId;
+    memcpy(&gpuId.bytes, &cudaInfo.uuid.bytes, sizeof(cudaInfo.uuid.bytes));
+    bufKeyValue[0].key = NvSciBufGeneralAttrKey_GpuId;
+    bufKeyValue[0].value = &gpuId;
+    bufKeyValue[0].len = sizeof(gpuId);
+    NvSciBufType bufType = NvSciBufType_RawBuffer;
+    bufKeyValue[1].key = NvSciBufGeneralAttrKey_Types;
+    bufKeyValue[1].value = &bufType;
+    bufKeyValue[1].len = sizeof(bufType);
+    NvSciBufAttrValAccessPerm bufPerm = NvSciBufAccessPerm_Readonly;
+    bufKeyValue[2].key = NvSciBufGeneralAttrKey_RequiredPerm;
+    bufKeyValue[2].value = &bufPerm;
+    bufKeyValue[2].len = sizeof(bufPerm);
+    bool bufAccessFlag = true;
+    bufKeyValue[3].key = NvSciBufGeneralAttrKey_NeedCpuAccess;
+    bufKeyValue[3].value = &bufAccessFlag;
+    bufKeyValue[3].len = sizeof(bufAccessFlag);
+
+    sciErr = NvSciBufAttrListSetAttrs(consumerReadAttrs, bufKeyValue, 4);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Unable to fill consumer buffer attrs (%x)\n", sciErr);
+        goto done;
+    }
+
+    fprintf(stderr, "Consumer buffer attributes established\n");
+
+    // Export consumer buffer attributes in a form suitable for IPC
+    size_t consumerReadAttrsSize = 0U;
+    void* consumerReadAttrsDesc = NULL;
+    sciErr = NvSciBufAttrListIpcExportUnreconciled(&consumerReadAttrs,
+                                                   1,
+                                                   ipcWrapper.endpoint,
+                                                   &consumerReadAttrsDesc,
+                                                   &consumerReadAttrsSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to export consumer buffer attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send size of consumer buffer attributes
+    sciErr = ipcSend(&ipcWrapper,
+                     &consumerReadAttrsSize,
+                     sizeof(consumerReadAttrsSize));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to send consumer buffer attrs size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send consumer buffer attributes
+    sciErr = ipcSend(&ipcWrapper,
+                     consumerReadAttrsDesc,
+                     consumerReadAttrsSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Unable to send consumer buffer attrs (%x)\n", sciErr);
+        goto done;
+    }
+
+    // Wait to receive the size of the combined buffer attributes
+    size_t recvBufListSize = 0U;
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         &recvBufListSize,
+                         sizeof(recvBufListSize));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to receive combined buffer attr size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate a buffer big enough to receive the combined buffer attributes
+    recvBufListDesc = malloc(recvBufListSize);
+    if (NULL == recvBufListDesc) {
+        sciErr = NvSciError_InsufficientMemory;
+        fprintf(stderr, "Buffer attr allocation failed(%x)\n", sciErr);
+        goto done;
+    }
+
+    // Receive the combined buffer attributes
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         recvBufListDesc,
+                         recvBufListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to recv combined buffer attr desc (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Convert the combined buffer attributes to an attribute list
+    sciErr = NvSciBufAttrListIpcImportReconciled(bufModule,
+                                                 ipcWrapper.endpoint,
+                                                 recvBufListDesc,
+                                                 recvBufListSize,
+                                                 NULL,
+                                                 0,
+                                                 &combinedBufAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to import combined buffer attr (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Extract attributes needed by CUDA
+    if (!setupCudaBufAttr(&cudaInfo, combinedBufAttrs)) {
+        goto done;
+    }
+
+    // Receive all buffers
+    for (uint32_t i = 0U; i < totalBuffers; i++) {
+        Buffer* buf = &buffers[i];
+
+        // Receive the buffer's export descriptor and import the buffer
+        NvSciBufObjIpcExportDescriptor objDesc;
+        sciErr = ipcRecvFill(&ipcWrapper, &objDesc, sizeof(objDesc));
+        if (NvSciError_Success != sciErr) {
+            fprintf(stderr,
+                    "Unable to recv buffer %d from producer (%x)\n",
+                    i, sciErr);
+            goto done;
+        }
+        sciErr = NvSciBufObjIpcImport(ipcWrapper.endpoint,
+                                      &objDesc,
+                                      combinedBufAttrs,
+                                      NvSciBufAccessPerm_Readonly,
+                                      10000,
+                                      &buf->obj);
+        if (NvSciError_Success != sciErr) {
+            fprintf(stderr,
+                    "Unable to import buffer %d from producer (%x)\n",
+                    i, sciErr);
+            goto done;
+        }
+
+        // Import the buffer into CUDA
+        if (!setupCudaBuffer(&cudaInfo, buf)) {
+            goto done;
+        }
+
+        // Validate handles before entering runtime phase
+        {
+            NvSciBufAttrList reconciledList;
+
+            sciErr = NvSciBufObjGetAttrList(buf->obj, &reconciledList);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr,
+                        "Can't get the reconciled list from NvSciBufObj %d (%x)\n",
+                        i, sciErr);
+                goto done;
+            }
+
+            sciErr = NvSciBufAttrListValidateReconciledAgainstAttrs(reconciledList, bufKeyValue, 4);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr,
+                        "Validation of combinedBufAttrs list failed: %x\n", sciErr);
+                goto done;
+            }
+
+            sciErr = NvSciBufObjValidate(buf->obj);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr,
+                        "Validation of imported buffer %d failed (%x)\n",
+                        i, sciErr);
+                goto done;
+            }
+        }
+    }
+
+    fprintf(stderr, "Consumer buffers received and established\n");
+
+    // Receive all frames
+    uint32_t currFrame = 0;
+    uint32_t currBuffer = 0;
+    Packet packet;
+    while (currFrame < totalFrames) {
+        fprintf(stderr, "Consumer starting frame %d in buffer %d\n",
+                currFrame, currBuffer);
+        Buffer* buf = &buffers[currBuffer];
+
+        // Wait for buffer to be available
+        while (buf->owner != 1U) {
+
+            // Wait for next presented buffer
+            sciErr = ipcRecvFill(&ipcWrapper, &packet, sizeof(packet));
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr,
+                        "Failure to recv buffer from producer (%x)\n",
+                        sciErr);
+                goto done;
+            }
+
+            // Import transmitted fence description to a fence
+            sciErr = NvSciSyncIpcImportFence(consumerWaitObj,
+                                             &packet.fenceDesc,
+                                             &buffers[packet.bufferId].fence);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr,
+                        "Failure to import fence from producer (%x)\n",
+                        sciErr);
+                goto done;
+            }
+
+            // Copy CRC data from packet
+            buffers[packet.bufferId].crc = packet.crc;
+
+            // Mark consumer as owner of this buffer
+            buffers[packet.bufferId].owner = 1U;
+        }
+
+        // Wait for fence generated by producer before reading
+        if (!waitCudaFence(&cudaInfo, buf)) {
+            goto done;
+        }
+
+        // Read the buffer to the local copy
+        cudaErr = cudaMemcpy2DAsync(cudaInfo.bufCopy,
+                                    cudaInfo.bufSize,
+                                    buf->ptr,
+                                    cudaInfo.bufSize,
+                                    cudaInfo.bufSize,
+                                    1,
+                                    cudaMemcpyDeviceToHost,
+                                    cudaInfo.stream);
+        if (cudaSuccess != cudaErr) {
+            fprintf(stderr, "Unable to initiate CUDA copy (%d)\n", cudaErr);
+            goto done;
+        }
+
+        // Wait for operation to finish, then compute and compare checksum
+        // IMPORTANT NOTE:
+        // A
normal stream application would not perform these steps. + // A checksum is not required for streaming, and waiting for + // operations to finish (which we only need because the + // checksum is calculated by the CPU) introduces bubbles + // in the hardware pipeline. A real application can rely on + // the generated NvSciSync fences for synchronization. + // These steps are only taken in this sample application + // because the consumer has no output visible to the user, + // so the checksum allows us to verify that the application + // is behaving properly. + cudaDeviceSynchronize(); + uint32_t crc = GenerateCRC(cudaInfo.bufCopy, + 1, + cudaInfo.bufSize, + cudaInfo.bufSize); + if (buf->crc != crc) { + fprintf(stderr, "Checksums don't match (%x vs %x)\n", + crc, buf->crc); + goto done; + } + + fprintf(stderr, "Consumer read frame %d in buffer %d\n", + currFrame, currBuffer); + + // Generate new fence indicating when reading has finished + if (!signalCudaFence(&cudaInfo, buf)) { + goto done; + } + + // Mark buffer as owned by producer now + buf->owner = 0U; + + // Export buffer index and fence for transmission over IPC + // There is no checksum for the return trip. + packet.bufferId = currBuffer; + packet.crc = 0U; + sciErr = NvSciSyncIpcExportFence(&buf->fence, + ipcWrapper.endpoint, + &packet.fenceDesc); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Unable to export consumer fence (%x)\n", sciErr); + goto done; + } + + // Send buffer index and fence to producer + sciErr = ipcSend(&ipcWrapper, &packet, sizeof(packet)); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Failure to send buffer to producer (%x)\n", + sciErr); + goto done; + } + + fprintf(stderr, "Consumer finished frame %d in buffer %d\n", + currFrame, currBuffer); + + // Advance buffer and frame + currBuffer = (currBuffer + 1U) % totalBuffers; + currFrame++; + } + + // Success + *(int*)arg = 0; +done: + // Free CUDA resources + deinitCudaBuffer(buffers, totalBuffers); + deinitCuda(&cudaInfo); + + // Free NvSci objects + if (NULL != consumerSignalAttrs) + NvSciSyncAttrListFree(consumerSignalAttrs); + if (NULL != consumerWaitAttrs) + NvSciSyncAttrListFree(consumerWaitAttrs); + if (NULL != sendWaitListDesc) + NvSciSyncAttrListFreeDesc(sendWaitListDesc); + if (NULL != producerWaitAttrs) + NvSciSyncAttrListFree(producerWaitAttrs); + if (NULL != consToProdAttrs) + NvSciSyncAttrListFree(consToProdAttrs); + if (NULL != syncConflictAttrs) + NvSciSyncAttrListFree(syncConflictAttrs); + if (NULL != consumerSignalObj) + NvSciSyncObjFree(consumerSignalObj); + if (NULL != sendObjAndListDesc) + NvSciSyncAttrListAndObjFreeDesc(sendObjAndListDesc); + if (NULL != consumerWaitObj) + NvSciSyncObjFree(consumerWaitObj); + if (NULL != consumerReadAttrs) + NvSciBufAttrListFree(consumerReadAttrs); + if (NULL != consumerReadAttrsDesc) + NvSciBufAttrListFreeDesc(consumerReadAttrsDesc); + if (NULL != combinedBufAttrs) + NvSciBufAttrListFree(combinedBufAttrs); + + // Free malloc'd resources + if (NULL != recvWaitListDesc) + free(recvWaitListDesc); + if (NULL != recvObjAndListDesc) + free(recvObjAndListDesc); + if (NULL != recvBufListDesc) + free(recvBufListDesc); + + fprintf(stderr, "Consumer exiting\n"); + return NULL; +} diff --git a/rawstream/rawstream_cuda.c b/rawstream/rawstream_cuda.c new file mode 100644 index 0000000..b0d6e42 --- /dev/null +++ b/rawstream/rawstream_cuda.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2020-2021 NVIDIA Corporation. All Rights Reserved. 
+ * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. Any + * use, reproduction, disclosure or distribution of this software and related + * documentation without an express license agreement from NVIDIA Corporation + * is strictly prohibited. + */ + +#include "rawstream.h" + +// Initialize CUDA info +bool setupCuda(CudaClientInfo* info) +{ + int cudaErr; + + info->deviceId = 0; + info->stream = NULL; + info->signalerSem = NULL; + info->waiterSem = NULL; + info->bufCopy = NULL; + + int numOfGPUs = 0; + cudaErr = cudaGetDeviceCount(&numOfGPUs); + if (cudaSuccess != cudaErr) { + fprintf(stderr, "Failed to get compute-capable devices (%d)\n", cudaErr); + return false; + } + + cudaErr = cudaSetDevice(info->deviceId); + if (cudaSuccess != cudaErr) { + fprintf(stderr, "Failed to set CUDA device (%d)\n", cudaErr); + return false; + } + + cudaErr = cuDeviceGetUuid(&info->uuid, info->deviceId); + if (CUDA_SUCCESS != cudaErr) { + fprintf(stderr, "Failed to query CUDA UUID (%d)\n", cudaErr); + return false; + } + + return true; +} + +// Create CUDA sync objects and map to imported NvSciSync +bool setupCudaSync(CudaClientInfo* info, + NvSciSyncObj sciSignalObj, + NvSciSyncObj sciWaitObj) +{ + cudaExternalSemaphoreHandleDesc extSemDesc; + int cudaErr; + + // Create CUDA stream for signaling and waiting + cudaErr = cudaStreamCreateWithFlags(&info->stream, + cudaStreamNonBlocking); + if (cudaSuccess != cudaErr) { + fprintf(stderr, + "Unable to create CUDA stream (%d)\n", + cudaErr); + return false; + } + + // Import signaler sync object to CUDA semaphore + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = sciSignalObj; + cudaErr = cudaImportExternalSemaphore(&info->signalerSem, &extSemDesc); + if (cudaSuccess != cudaErr) { + fprintf(stderr, + "Unable to import signal sync object to CUDA (%d)\n", + cudaErr); + return false; + } + + // Import waiter sync object to CUDA semaphore + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = sciWaitObj; + cudaErr = cudaImportExternalSemaphore(&info->waiterSem, &extSemDesc); + if (cudaSuccess != cudaErr) { + fprintf(stderr, + "Unable to import wait sync object to CUDA (%d)\n", + cudaErr); + return false; + } + + return true; +} + +// Extract info from buffer attributes needed by CUDA +bool setupCudaBufAttr(CudaClientInfo* info, + NvSciBufAttrList attrs) +{ + NvSciBufAttrKeyValuePair queryKeyValue[] = { + { NvSciBufGeneralAttrKey_Types, NULL, 0 }, + { NvSciBufRawBufferAttrKey_Size, NULL, 0 }, + }; + + NvSciError sciErr = NvSciBufAttrListGetAttrs(attrs, queryKeyValue, 2); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Unable to query buffer type/size (%x)\n", sciErr); + return false; + } + + // TODO: Original sample queries BufType but doesn't seem to do anything + // with it. Might not be needed. 
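+    // (The value pointers returned by NvSciBufAttrListGetAttrs reference
+    // memory owned by the attribute list, so they are only read here and
+    // never freed.)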
+ info->bufType = *((NvSciBufType*)(queryKeyValue[0].value)); + info->bufSize = *((uint64_t*)(queryKeyValue[1].value)); + + // Allocate storage for a copy of the buffer contents + info->bufCopy = (uint8_t*)malloc(info->bufSize); + if (NULL == info->bufCopy) { + fprintf(stderr, "Unable to allocate buffer copy\n"); + return false; + } + (void)memset(info->bufCopy, 0, info->bufSize); + + return true; +} + +// Import NvSciBuf into CUDA +bool setupCudaBuffer(CudaClientInfo* info, + Buffer* buf) +{ + int cudaErr; + + // Import buffer to cuda as external memory + cudaExternalMemoryHandleDesc memHandleDesc; + memset(&memHandleDesc, 0, sizeof(memHandleDesc)); + memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; + memHandleDesc.handle.nvSciBufObject = buf->obj; + memHandleDesc.size = info->bufSize; + + cudaErr = cudaImportExternalMemory(&buf->extMem, &memHandleDesc); + if (cudaSuccess != cudaErr) { + fprintf(stderr, + "Unable to import buffer to CUDA (%d)\n", + cudaErr); + return false; + } + + // Map to cuda memory buffer + cudaExternalMemoryBufferDesc bufferDesc; + memset(&bufferDesc, 0, sizeof(bufferDesc)); + bufferDesc.size = info->bufSize; + bufferDesc.offset = 0; + cudaErr = cudaExternalMemoryGetMappedBuffer((void *)&buf->ptr, + buf->extMem, + &bufferDesc); + if (cudaSuccess != cudaErr) { + fprintf(stderr, "Unable to map CUDA buffer (%d)\n", cudaErr); + return false; + } + + return true; +} + +// Tell CUDA to wait for the fence associated with a buffer +bool waitCudaFence(CudaClientInfo* info, + Buffer* buf) +{ + cudaExternalSemaphoreWaitParams waitParams; + memset(&waitParams, 0, sizeof(waitParams)); + waitParams.params.nvSciSync.fence = &buf->fence; + waitParams.flags = 0; + int cudaErr = cudaWaitExternalSemaphoresAsync(&info->waiterSem, + &waitParams, + 1, + info->stream); + if (cudaSuccess != cudaErr) { + fprintf(stderr, "Unable to wait for fence (%d)\n", cudaErr); + return false; + } + + NvSciSyncFenceClear(&buf->fence); + + return true; +} + +// Tell CUDA to generate a fence for a buffer +bool signalCudaFence(CudaClientInfo* info, + Buffer* buf) +{ + cudaExternalSemaphoreSignalParams signalParams; + memset(&signalParams, 0, sizeof(signalParams)); + signalParams.params.nvSciSync.fence = &buf->fence; + signalParams.flags = 0; + + int cudaErr = cudaSignalExternalSemaphoresAsync(&info->signalerSem, + &signalParams, + 1, + info->stream); + if (cudaSuccess != cudaErr) { + fprintf(stderr, "Unable to signal fence (%d)\n", cudaErr); + return false; + } + + return true; +} + +void deinitCuda(CudaClientInfo* info) +{ + if (NULL != info->bufCopy) { + free(info->bufCopy); + info->bufCopy = NULL; + } + if (NULL != info->signalerSem) { + (void)cudaDestroyExternalSemaphore(info->signalerSem); + info->signalerSem = NULL; + } + if (NULL != info->waiterSem) { + (void)cudaDestroyExternalSemaphore(info->waiterSem); + info->waiterSem = NULL; + } + if (NULL != info->stream) { + (void)cudaStreamDestroy(info->stream); + info->stream = NULL; + } +} + +void deinitCudaBuffer(Buffer* buf, int num) +{ + int i; + for (i = 0; i < num; ++i) { + if (NULL != buf[i].ptr) + cudaFree(buf[i].ptr); + if (NULL != buf[i].extMem) + (void)cudaDestroyExternalMemory(buf[i].extMem); + if (NULL != buf[i].obj) + NvSciBufObjFree(buf[i].obj); + } +} + diff --git a/rawstream/rawstream_ipc_linux.c b/rawstream/rawstream_ipc_linux.c new file mode 100644 index 0000000..25dc1e7 --- /dev/null +++ b/rawstream/rawstream_ipc_linux.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020-2022 NVIDIA Corporation. All Rights Reserved. 
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation. Any
+ * use, reproduction, disclosure or distribution of this software and related
+ * documentation without an express license agreement from NVIDIA Corporation
+ * is strictly prohibited.
+ */
+
+#include "rawstream.h"
+
+// Initialize one end of named communication channel
+NvSciError ipcInit(const char* endpointName, IpcWrapper* ipcWrapper)
+{
+    NvSciError err = NvSciError_Success;
+
+    // Open named endpoint
+    err = NvSciIpcOpenEndpoint(endpointName, &ipcWrapper->endpoint);
+    if (err != NvSciError_Success) {
+        fprintf(stderr, "Unable to open endpoint %s (%x)\n",
+                endpointName, err);
+        goto fail;
+    }
+
+    // Initialize IPC event notifier
+    err = NvSciIpcGetLinuxEventFd(ipcWrapper->endpoint, &ipcWrapper->ipcEventFd);
+    if (err != NvSciError_Success) {
+        fprintf(stderr, "Unable to get Linux event fd (%x)\n", err);
+        goto fail;
+    }
+
+    // Retrieve endpoint info
+    err = NvSciIpcGetEndpointInfo(ipcWrapper->endpoint, &ipcWrapper->info);
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "Unable to retrieve IPC endpoint info (%x)\n", err);
+        goto fail;
+    }
+
+    err = NvSciIpcResetEndpointSafe(ipcWrapper->endpoint);
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "Unable to reset IPC endpoint (%x)\n", err);
+    }
+
+fail:
+    return err;
+}
+
+// Clean up IPC when done
+void ipcDeinit(IpcWrapper* ipcWrapper)
+{
+    NvSciError err = NvSciIpcCloseEndpointSafe(ipcWrapper->endpoint, false);
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "NvSciIpcCloseEndpointSafe failed (%x)\n", err);
+    }
+}
+
+// Wait for an event on IPC channel
+static NvSciError waitEvent(IpcWrapper* ipcWrapper, uint32_t value)
+{
+    fd_set rfds;
+    uint32_t event = 0;
+    NvSciError err;
+
+    while (true) {
+        // Get pending IPC events
+        err = NvSciIpcGetEventSafe(ipcWrapper->endpoint, &event);
+        if (NvSciError_Success != err) {
+            fprintf(stderr, "NvSciIpcGetEventSafe failed (%x)\n", err);
+            return err;
+        }
+        // Return if event is the kind we're looking for
+        if (0U != (event & value)) {
+            break;
+        }
+
+        FD_ZERO(&rfds);
+        FD_SET(ipcWrapper->ipcEventFd, &rfds);
+
+        // Wait for a signal indicating a new event
+        if (select(ipcWrapper->ipcEventFd + 1, &rfds, NULL, NULL, NULL) < 0) {
+            // select failed
+            return NvSciError_ResourceError;
+        }
+        if (!FD_ISSET(ipcWrapper->ipcEventFd, &rfds)) {
+            return NvSciError_NvSciIpcUnknown;
+        }
+    }
+    return NvSciError_Success;
+}
+
+// Send a message over IPC
+NvSciError ipcSend(IpcWrapper* ipcWrapper, const void* buf, const size_t size)
+{
+    NvSciError err = NvSciError_Success;
+    bool done = false;
+    uint32_t bytes;
+
+    // Loop until entire message sent
+    while (done == false) {
+
+        // Wait for room in channel to send a message
+        err = waitEvent(ipcWrapper, NV_SCI_IPC_EVENT_WRITE);
+        if (NvSciError_Success != err) {
+            goto fail;
+        }
+
+        assert(size <= UINT32_MAX);
+
+        // Send as much of the message as we can
+        err = NvSciIpcWriteSafe(ipcWrapper->endpoint, buf, (uint32_t)size,
+                                &bytes);
+        if (NvSciError_Success != err) {
+            fprintf(stderr, "IPC write failed (%x)\n", err);
+            goto fail;
+        }
+
+        // For this simple sample, we just fail if the entire message wasn't
+        // sent. Could instead retry to send the rest.
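+        // (Such a retry would advance the data pointer by "bytes" and
+        // reduce the remaining size by "bytes" before looping again.)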
+ if (size != (size_t)bytes) { + fprintf(stderr, "Failed to send entire message (%u < %zu)\n", + bytes, size); + err = NvSciError_NvSciIpcUnknown; + goto fail; + } + done = true; + } + +fail: + return err; +} + +// Receive a message over IPC +NvSciError ipcRecvFill(IpcWrapper* ipcWrapper, void* buf, const size_t size) +{ + NvSciError err = NvSciError_Success; + bool done = false; + uint32_t bytes; + + // Loop until entire message received + while (done == false) { + + // Wait for incoming data + err = waitEvent(ipcWrapper, NV_SCI_IPC_EVENT_READ); + if (NvSciError_Success != err) { + goto fail; + } + + assert(size <= UINT32_MAX); + + // Read as much of the message as we can + err = NvSciIpcReadSafe(ipcWrapper->endpoint, buf, (uint32_t)size, + &bytes); + if (NvSciError_Success != err) { + fprintf(stderr, "IPC read failed (%x)\n", err); + goto fail; + } + + // For this simple sample, we just fail if the entire message wasn't + // read. Could instead retry to receive the rest. + if (size != (size_t)bytes) { + fprintf(stderr, "Failed to read entire message (%u < %zu)\n", + bytes, size); + err = NvSciError_NvSciIpcUnknown; + goto fail; + } + done = true; + } + +fail: + return err; +} diff --git a/rawstream/rawstream_main.c b/rawstream/rawstream_main.c new file mode 100644 index 0000000..bb324a4 --- /dev/null +++ b/rawstream/rawstream_main.c @@ -0,0 +1,230 @@ +//! \file +//! \brief NvStreams rawstream main file. +//! +//! \copyright +//! SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +//! SPDX-License-Identifier: LicenseRef-NvidiaProprietary +//! +//! NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +//! property and proprietary rights in and to this material, related +//! documentation and any modifications thereto. Any use, reproduction, +//! disclosure or distribution of this material and related documentation +//! without an express license agreement from NVIDIA CORPORATION or +//! its affiliates is strictly prohibited. 
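+//!
+//! This file parses the command-line options, opens the NvSciSync and
+//! NvSciBuf modules, establishes the NvSciIpc channel, and exchanges a
+//! simple handshake before the producer or consumer work begins.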
+
+#include "rawstream.h"
+#include <unistd.h>    /* getopt(), optind */
+
+
+NvSciSyncModule syncModule = NULL;
+NvSciBufModule bufModule = NULL;
+
+NvSciSyncAttrList producerSignalAttrs = NULL;
+NvSciSyncAttrList consumerSignalAttrs = NULL;
+NvSciSyncAttrList producerWaitAttrs = NULL;
+NvSciSyncAttrList consumerWaitAttrs = NULL;
+NvSciSyncAttrList prodToConsAttrs = NULL;
+NvSciSyncAttrList consToProdAttrs = NULL;
+NvSciSyncObj consumerSignalObj = NULL;
+NvSciSyncObj producerSignalObj = NULL;
+NvSciSyncObj consumerWaitObj = NULL;
+NvSciSyncObj producerWaitObj = NULL;
+
+NvSciBufAttrList producerWriteAttrs = NULL;
+NvSciBufAttrList consumerReadAttrs = NULL;
+NvSciBufAttrList combinedBufAttrs = NULL;
+Buffer buffers[totalBuffers];
+IpcWrapper ipcWrapper;
+
+int late_attach = 0;
+int main(int argc, char *argv[])
+{
+    NvSciError err;
+    int producer;
+    const char* endpoint = "Unknown";
+    int ret = 0;
+
+    int opt;
+    producer = -1;
+    while ((opt = getopt(argc, argv, "pcl")) != -1) {
+        switch (opt) {
+        case 'p':
+            producer = 1;
+            endpoint = "Producer";
+            break;
+        case 'c':
+            producer = 0;
+            endpoint = "Consumer";
+            break;
+        case 'l':
+            late_attach = 1;
+            break;
+        default:
+            fprintf(stderr, "Unknown option: '%c'\n", opt);
+        }
+    }
+
+    if (producer == -1 || optind < argc) {
+        fprintf(stderr, "Usage: ./rawstream [-l] {-p or -c}\n");
+        fprintf(stderr,
+                "-p denotes producer, -c denotes consumer, -l denotes late-attach\n");
+        fprintf(stderr,
+                "either -p or -c must be provided; -l is optional\n");
+        return 1;
+    }
+
+    fprintf(stderr, "%s application starting\n", endpoint);
+
+    // Open sync module (shared by all threads)
+    err = NvSciSyncModuleOpen(&syncModule);
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "%s unable to open sync module (%x)\n",
+                endpoint, err);
+        return 1;
+    }
+
+    // Open buf module (shared by all threads)
+    err = NvSciBufModuleOpen(&bufModule);
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "%s unable to open buf module (%x)\n",
+                endpoint, err);
+        ret = 1;
+        goto close_sync_module;
+    }
+
+    // Initialize IPC library
+    err = NvSciIpcInit();
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "%s unable to init ipc library (%x)\n",
+                endpoint, err);
+        ret = 1;
+        goto close_buf_module;
+    }
+
+    // Establish IPC communications based on endpoint
+    // TODO: Settle on final IPC channel names
+    if (producer == 1) {
+        err = ipcInit("nvscisync_a_0", &ipcWrapper);
+    } else {
+        err = ipcInit("nvscisync_a_1", &ipcWrapper);
+    }
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "%s unable to initialize communication (%x)\n",
+                endpoint, err);
+        ret = 1;
+        goto deinit_IPC;
+    }
+
+    // Test communication by exchanging a simple handshake message
+    const int send_handshake = 12345;
+    err = ipcSend(&ipcWrapper, &send_handshake, sizeof(send_handshake));
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "%s failed to send handshake (%x)\n",
+                endpoint, err);
+        ret = 1;
+        goto deinit_IPC;
+    }
+
+    int recv_handshake = 0;
+    err = ipcRecvFill(&ipcWrapper, &recv_handshake, sizeof(recv_handshake));
+    if (NvSciError_Success != err) {
+        fprintf(stderr, "%s failed to receive handshake (%x)\n",
+                endpoint, err);
+        ret = 1;
+        goto deinit_IPC;
+    }
+
+    if (send_handshake != recv_handshake) {
+        fprintf(stderr, "%s handshake did not match (%x)\n",
+                endpoint, err);
+        ret = 1;
+        goto deinit_IPC;
+    }
+
+    // Initialize buffer list
+    for (uint32_t i=0; i 0; j--) {
+                if (tmp & 1) {
+                    tmp = (tmp >> 1) ^ CRC32_POLYNOMIAL;
+                } else {
+                    tmp >>= 1;
+                }
+            }
+            crcTable[i] = tmp;
+        }
+        initialized = 1;
+    }
+
+    //Calculate CRC for the data
+    for (y = 0U; y < height; y++) {
+        for (x = 0U; x < width; x++) {
+            tmp = (crc >> 8) & 0x00FFFFFFL;
+            crc = tmp ^ crcTable[((uint32_t) crc ^ *(data_ptr + x)) & 0xFF];
+        }
+        data_ptr += pitch;
+    }
+
+    return crc;
+}
diff --git a/rawstream/rawstream_producer.c b/rawstream/rawstream_producer.c
new file mode 100644
index 0000000..b251286
--- /dev/null
+++ b/rawstream/rawstream_producer.c
@@ -0,0 +1,752 @@
+//! \file
+//! \brief NvStreams rawstream producer file.
+//!
+//! \copyright
+//! SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//! SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+//!
+//! NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+//! property and proprietary rights in and to this material, related
+//! documentation and any modifications thereto. Any use, reproduction,
+//! disclosure or distribution of this material and related documentation
+//! without an express license agreement from NVIDIA CORPORATION or
+//! its affiliates is strictly prohibited.
+
+#include "rawstream.h"
+
+extern int late_attach;
+
+void* producerFunc(void* arg)
+{
+    CudaClientInfo cudaInfo;
+    NvSciError sciErr;
+    int cudaErr;
+    void* recvWaitListDesc = NULL;
+    void* recvObjAndListDesc = NULL;
+    void* consumerReadAttrsDesc = NULL;
+    // The following are freed in the cleanup code at "done", so they must
+    // be initialized here, before the first "goto done", rather than at
+    // the point of first use.
+    void* sendWaitListDesc = NULL;
+    void* sendObjAndListDesc = NULL;
+    void* sendBufListDesc = NULL;
+    NvSciSyncAttrList syncConflictAttrs = NULL;
+
+    *(int*)arg = 1;
+    fprintf(stderr, "Producer starting\n");
+
+    // Do common cuda initialization
+    if (!setupCuda(&cudaInfo)) {
+        goto done;
+    }
+
+    // Create an empty sync attribute list for signaling permissions.
+    sciErr = NvSciSyncAttrListCreate(syncModule, &producerSignalAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to create producer signal attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Query CUDA for attributes needed to signal syncs
+    cudaErr = cudaDeviceGetNvSciSyncAttributes(producerSignalAttrs,
+                                               cudaInfo.deviceId,
+                                               cudaNvSciSyncAttrSignal);
+    if (cudaSuccess != cudaErr) {
+        fprintf(stderr,
+                "Could not query signal attributes from CUDA (%d)\n",
+                cudaErr);
+        goto done;
+    }
+
+    fprintf(stderr, "Producer signal attributes established\n");
+
+    // Create an empty sync attribute list for waiting permissions.
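+    // The wait list is exported to the consumer, which reconciles it with
+    // its own signal attributes to create the consumer->producer sync
+    // object; the signal list above is reconciled locally with the
+    // consumer's wait attributes for the producer->consumer direction.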
+    sciErr = NvSciSyncAttrListCreate(syncModule, &producerWaitAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to create producer wait attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Query CUDA for attributes needed to wait for syncs
+    cudaErr = cudaDeviceGetNvSciSyncAttributes(producerWaitAttrs,
+                                               cudaInfo.deviceId,
+                                               cudaNvSciSyncAttrWait);
+    if (cudaSuccess != cudaErr) {
+        fprintf(stderr,
+                "Could not query wait attributes from CUDA (%d)\n",
+                cudaErr);
+        goto done;
+    }
+
+    fprintf(stderr, "Producer wait attributes established\n");
+
+    // Export producer's wait attributes to a form suitable for IPC
+    size_t sendWaitAttrListSize = 0U;
+    sciErr = NvSciSyncAttrListIpcExportUnreconciled(&producerWaitAttrs,
+                                                    1,
+                                                    ipcWrapper.endpoint,
+                                                    &sendWaitListDesc,
+                                                    &sendWaitAttrListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to export producer wait attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the size of the producer's wait attributes to the consumer,
+    // so it knows how much data to expect
+    sciErr = ipcSend(&ipcWrapper,
+                     &sendWaitAttrListSize,
+                     sizeof(sendWaitAttrListSize));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to send producer wait attrs size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the exported form of the producer's wait attributes
+    sciErr = ipcSend(&ipcWrapper,
+                     sendWaitListDesc,
+                     sendWaitAttrListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to send producer wait attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Wait to receive the size of the consumer's wait attributes
+    size_t recvWaitAttrListSize = 0U;
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         &recvWaitAttrListSize,
+                         sizeof(recvWaitAttrListSize));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to recv consumer wait attr size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate a buffer big enough to receive the consumer's wait attributes
+    recvWaitListDesc = malloc(recvWaitAttrListSize);
+    if (NULL == recvWaitListDesc) {
+        sciErr = NvSciError_InsufficientMemory;
+        fprintf(stderr,
+                "Sync attr allocation failed (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Wait to receive consumer's wait attributes
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         recvWaitListDesc,
+                         recvWaitAttrListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to recv consumer wait attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Convert the received consumer wait attributes to an attribute list
+    sciErr = NvSciSyncAttrListIpcImportUnreconciled(syncModule,
+                                                    ipcWrapper.endpoint,
+                                                    recvWaitListDesc,
+                                                    recvWaitAttrListSize,
+                                                    &consumerWaitAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to import consumer wait attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Get combined attributes for producer to consumer signaling
+    NvSciSyncAttrList syncAllAttrs[2];
+    syncAllAttrs[0] = producerSignalAttrs;
+    syncAllAttrs[1] = consumerWaitAttrs;
+    sciErr = NvSciSyncAttrListReconcile(syncAllAttrs,
+                                        2,
+                                        &prodToConsAttrs,
+                                        &syncConflictAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't merge producer->consumer attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate producer to consumer sync object
+    sciErr = NvSciSyncObjAlloc(prodToConsAttrs, &producerSignalObj);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't allocate producer->consumer sync (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Export sync attributes and object to a form suitable for IPC
+    size_t sendObjAndListSize = 0U;
+    sciErr = NvSciSyncIpcExportAttrListAndObj(producerSignalObj,
+                                              NvSciSyncAccessPerm_WaitOnly,
+                                              ipcWrapper.endpoint,
+                                              &sendObjAndListDesc,
+                                              &sendObjAndListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't export producer->consumer sync description (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the size of the sync description to the consumer,
+    // so it knows how much data to expect
+    sciErr = ipcSend(&ipcWrapper, &sendObjAndListSize, sizeof(size_t));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't send producer->consumer sync description size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the sync description to the consumer
+    sciErr = ipcSend(&ipcWrapper, sendObjAndListDesc, sendObjAndListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't send producer->consumer sync description (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Wait to receive the size of the consumer->producer sync description
+    size_t recvObjAndListSize = 0U;
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         &recvObjAndListSize,
+                         sizeof(size_t));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't recv consumer->producer sync description size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate a buffer big enough to receive the description
+    recvObjAndListDesc = malloc(recvObjAndListSize);
+    if (NULL == recvObjAndListDesc) {
+        sciErr = NvSciError_InsufficientMemory;
+        fprintf(stderr,
+                "Sync description allocation failed (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Wait to receive consumer->producer sync description
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         recvObjAndListDesc,
+                         recvObjAndListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't receive consumer->producer sync description (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Convert the received consumer->producer sync description to a
+    // sync attribute list and object
+    sciErr = NvSciSyncIpcImportAttrListAndObj(syncModule,
+                                              ipcWrapper.endpoint,
+                                              recvObjAndListDesc,
+                                              recvObjAndListSize,
+                                              &producerWaitAttrs,
+                                              1,
+                                              NvSciSyncAccessPerm_WaitOnly,
+                                              ipcWrapper.endpoint,
+                                              &producerWaitObj);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Can't import consumer->producer sync (%x)\n", sciErr);
+        goto done;
+    }
+
+    // Set up CUDA sync objects, importing NvSciSync objects
+    if (!setupCudaSync(&cudaInfo, producerSignalObj, producerWaitObj)) {
+        goto done;
+    }
+
+    fprintf(stderr, "Producer exchanged sync objects with consumer\n");
+
+    // Create an empty buffer attribute list for producer buffers
+    sciErr = NvSciBufAttrListCreate(bufModule, &producerWriteAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to create producer buffer attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Fill producer buffer attribute list with values
+    NvSciBufAttrKeyValuePair bufKeyValue[7];
+    NvSciRmGpuId gpuId;
+    memcpy(&gpuId.bytes, &cudaInfo.uuid.bytes, sizeof(cudaInfo.uuid.bytes));
+    bufKeyValue[0].key = NvSciBufGeneralAttrKey_GpuId;
+    bufKeyValue[0].value = &gpuId;
+    bufKeyValue[0].len = sizeof(gpuId);
+    NvSciBufType bufType = NvSciBufType_RawBuffer;
+    bufKeyValue[1].key = NvSciBufGeneralAttrKey_Types;
+    bufKeyValue[1].value = &bufType;
+    bufKeyValue[1].len = sizeof(bufType);
+    NvSciBufAttrValAccessPerm bufPerm = NvSciBufAccessPerm_ReadWrite;
+    bufKeyValue[2].key = NvSciBufGeneralAttrKey_RequiredPerm;
+    bufKeyValue[2].value = &bufPerm;
+    bufKeyValue[2].len = sizeof(bufPerm);
+    bool bufAccessFlag = true;
+    bufKeyValue[3].key = NvSciBufGeneralAttrKey_NeedCpuAccess;
+    bufKeyValue[3].value = &bufAccessFlag;
+    bufKeyValue[3].len = sizeof(bufAccessFlag);
+    uint64_t rawsize = (128 * 1024);
+    bufKeyValue[4].key = NvSciBufRawBufferAttrKey_Size;
+    bufKeyValue[4].value = &rawsize;
+    bufKeyValue[4].len = sizeof(rawsize);
+    uint64_t align = (4 * 1024);
+    bufKeyValue[5].key = NvSciBufRawBufferAttrKey_Align;
+    bufKeyValue[5].value = &align;
+    bufKeyValue[5].len = sizeof(align);
+    if (late_attach) {
+        // Add late peer location attribute
+        NvSciBufPeerLocationInfo peerLocation;
+        peerLocation.socID = NV_SCI_BUF_PEER_INFO_SELF_SOCID;
+        peerLocation.vmID = NV_SCI_BUF_PEER_INFO_SELF_VMID;
+        peerLocation.reserved = 0;
+        bufKeyValue[6].key = NvSciBufGeneralAttrKey_PeerLocationInfo;
+        bufKeyValue[6].value = &peerLocation;
+        bufKeyValue[6].len = sizeof(peerLocation);
+        sciErr = NvSciBufAttrListSetAttrs(producerWriteAttrs, bufKeyValue, 7);
+    } else {
+        sciErr = NvSciBufAttrListSetAttrs(producerWriteAttrs, bufKeyValue, 6);
+    }
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Unable to fill producer buffer attrs (%x)\n", sciErr);
+        goto done;
+    }
+
+    fprintf(stderr, "Producer buffer attributes established\n");
+
+
+    if (late_attach) {
+        // We don't have the consumer's attributes yet.
+        // Just reconcile our own.
+        NvSciBufAttrList bufAllAttrs[2], bufConflictAttrs;
+        bufAllAttrs[0] = producerWriteAttrs;
+        // bufAllAttrs[1] = consumerReadAttrs;
+        sciErr = NvSciBufAttrListReconcile(bufAllAttrs, 1,
+                                           &combinedBufAttrs, &bufConflictAttrs);
+        if (NvSciError_Success != sciErr) {
+            fprintf(stderr, "Can't merge buffer attrs (%x)\n", sciErr);
+            goto done;
+        }
+
+        // Allocate all buffers
+        for (uint32_t i=0U; i<totalBuffers; i++) {
+            Buffer* buf = &buffers[i];
+            sciErr = NvSciBufObjAlloc(combinedBufAttrs, &buf->obj);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr, "Can't allocate buffer %d (%x)\n", i, sciErr);
+                goto done;
+            }
+        }
+    }
+
+    // Wait to receive the size of the consumer's buffer attributes
+    size_t consumerReadAttrsSize = 0U;
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         &consumerReadAttrsSize,
+                         sizeof(consumerReadAttrsSize));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to recv consumer buffer attr size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Allocate a buffer big enough to receive the consumer's buffer attributes
+    consumerReadAttrsDesc = malloc(consumerReadAttrsSize);
+    if (NULL == consumerReadAttrsDesc) {
+        sciErr = NvSciError_InsufficientMemory;
+        fprintf(stderr, "Buffer attr allocation failed (%x)\n", sciErr);
+        goto done;
+    }
+
+    // Wait to receive the consumer's buffer attributes
+    sciErr = ipcRecvFill(&ipcWrapper,
+                         consumerReadAttrsDesc,
+                         consumerReadAttrsSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Unable to recv consumer buffer attrs (%x)\n", sciErr);
+        goto done;
+    }
+
+    // Convert the received consumer buffer attributes to an attribute list
+    sciErr = NvSciBufAttrListIpcImportUnreconciled(bufModule,
+                                                   ipcWrapper.endpoint,
+                                                   consumerReadAttrsDesc,
+                                                   consumerReadAttrsSize,
+                                                   &consumerReadAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to import consumer buffer attrs (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Get combined attributes for buffers
+    NvSciBufAttrList bufAllAttrs[2], bufConflictAttrs;
+    bufAllAttrs[0] = producerWriteAttrs;
+    bufAllAttrs[1] = consumerReadAttrs;
+    sciErr = NvSciBufAttrListReconcile(bufAllAttrs, 2,
+                                       &combinedBufAttrs, &bufConflictAttrs);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Can't merge buffer attrs (%x)\n", sciErr);
+        goto done;
+    }
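+
+    // Reconciliation produces a single attribute list that satisfies both
+    // endpoints' constraints (size, alignment, permissions, CPU access);
+    // the buffers are allocated and exported with this combined list.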
+
+    // Export combined buffer attributes to a form suitable for IPC
+    sendBufListDesc = NULL;
+    size_t sendBufListSize = 0U;
+    sciErr = NvSciBufAttrListIpcExportReconciled(combinedBufAttrs,
+                                                 ipcWrapper.endpoint,
+                                                 &sendBufListDesc,
+                                                 &sendBufListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Can't export reconciled buffer attrs to consumer (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the size of the combined buffer attributes to the consumer,
+    // so it knows how much data to expect
+    sciErr = ipcSend(&ipcWrapper,
+                     &sendBufListSize,
+                     sizeof(sendBufListSize));
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr,
+                "Unable to send combined buffer attrs size (%x)\n",
+                sciErr);
+        goto done;
+    }
+
+    // Send the exported form of the combined buffer attributes
+    sciErr = ipcSend(&ipcWrapper,
+                     sendBufListDesc,
+                     sendBufListSize);
+    if (NvSciError_Success != sciErr) {
+        fprintf(stderr, "Unable to send combined buffer attrs (%x)\n", sciErr);
+        goto done;
+    }
+
+
+    // Extract attributes needed by CUDA
+    if (!setupCudaBufAttr(&cudaInfo, combinedBufAttrs)) {
+        goto done;
+    }
+
+    // Export all buffers
+    for (uint32_t i=0U; i<totalBuffers; i++) {
+        Buffer* buf = &buffers[i];
+
+        if (!late_attach) {
+            sciErr = NvSciBufObjAlloc(combinedBufAttrs, &buf->obj);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr, "Can't allocate buffer %d (%x)\n", i, sciErr);
+                goto done;
+            }
+        } else {
+            NvSciBufAttrList bufAllAttrs[1];
+            // Use the imported consumer attribute list.
+            bufAllAttrs[0] = consumerReadAttrs;
+            // Invoke NvSciBufObjAttachPeer() before exporting the NvSciBufObj
+            sciErr = NvSciBufObjAttachPeer(buf->obj, bufAllAttrs, 1);
+            if (NvSciError_Success != sciErr) {
+                fprintf(stderr, "NvSciBufObjAttachPeer call failed error: %x\n", sciErr);
+                goto done;
+            } else {
+                fprintf(stderr, "NvSciBufObjAttachPeer call succeeded\n");
+            }
+        }
+
+        // Export buffer object to a form suitable for IPC
+        // Note: Unlike attribute lists, the exported form of objects has
+        // a fixed size.
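+        // Because the descriptor is fixed-size, it can be sent directly,
+        // without the size message that preceded each attribute-list
+        // exchange above.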
+ NvSciBufObjIpcExportDescriptor objDesc; + sciErr = NvSciBufObjIpcExport(buf->obj, + NvSciBufAccessPerm_ReadWrite, + ipcWrapper.endpoint, + &objDesc); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Unable to export buffer %d object to consumer (%x)\n", + i, sciErr); + goto done; + } + + // Send the buffer description to the consumer + sciErr = ipcSend(&ipcWrapper, + &objDesc, + sizeof(NvSciBufObjIpcExportDescriptor)); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Unable to send buffer %d (%x)\n", i, sciErr); + goto done; + } + + // Import the buffer into CUDA + if (!setupCudaBuffer(&cudaInfo, buf)) { + goto done; + } + + { + NvSciBufAttrList reconciledList; + sciErr = NvSciBufObjGetAttrList(buf->obj, &reconciledList); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Can't get the reconciled list from NvSciBufObj %d (%x)\n", i, sciErr); + goto done; + } + + sciErr = NvSciBufAttrListValidateReconciledAgainstAttrs( + reconciledList, bufKeyValue, 6); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Validation of combinedBufAttrs failed (%x)\n", sciErr); + goto done; + } + + sciErr = NvSciBufObjValidate(buf->obj); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Validation of buffer %d failed (%x)\n", i, sciErr); + goto done; + } + } + } + + fprintf(stderr, "Producer buffers established and transmitted\n"); + + // Validate handles before starting the runtime phase + { + NvSciSyncAttrList producerWaitList; + + sciErr = NvSciSyncAttrListValidateReconciledAgainstAttrs( + prodToConsAttrs, + NULL, + 0, + NvSciSyncAccessPerm_SignalOnly); + if (NvSciError_Success != sciErr) { + fprintf( + stderr, + "Validation of prodToCons list failed: %x\n", sciErr); + goto done; + } + + sciErr = NvSciSyncObjGetAttrList(producerWaitObj, + &producerWaitList); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Can't get the reconciled list from producer wait object (%x)\n", + sciErr); + goto done; + } + + sciErr = NvSciSyncAttrListValidateReconciledAgainstAttrs( + producerWaitList, + NULL, + 0, + NvSciSyncAccessPerm_WaitOnly); + if (NvSciError_Success != sciErr) { + fprintf( + stderr, + "Validation of imported reconciled producer wait list failed: %x\n", + sciErr); + goto done; + } + + sciErr = NvSciSyncObjValidate(producerWaitObj); + if (NvSciError_Success != sciErr) { + fprintf( + stderr, + "Validation of imported producer wait object failed: %x\n", + sciErr); + goto done; + } + } + + // Send all frames + uint32_t currFrame = 0; + uint32_t currBuffer = 0; + Packet packet; + while (currFrame < totalFrames) { + fprintf(stderr, "Producer starting frame %d in buffer %d\n", + currFrame, currBuffer); + Buffer* buf = &buffers[currBuffer]; + + // Wait for buffer to be available + // Note: On first frame for each buffer, the producer already owns + // it, so this is skipped. On subsequent frames it must wait + // for the buffer's return. 
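+        // owner == 0 means the producer holds the buffer; owner == 1 means
+        // the consumer does (see the owner assignments below).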
+ while (buf->owner != 0U) { + + // Wait for next returned buffer + sciErr = ipcRecvFill(&ipcWrapper, &packet, sizeof(packet)); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Failure to recv buffer from consumer (%x)\n", + sciErr); + goto done; + } + + // Import transmitted fence description to a fence + sciErr = NvSciSyncIpcImportFence(producerWaitObj, + &packet.fenceDesc, + &buffers[packet.bufferId].fence); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Failure to import fence from consumer (%x)\n", + sciErr); + goto done; + } + + // Extract checksum from packet + buffers[packet.bufferId].crc = packet.crc; + + // Mark producer as owner of this buffer + buffers[packet.bufferId].owner = 0U; + } + + // Wait for fence returned by consumer before rendering + if (!waitCudaFence(&cudaInfo, buf)) { + goto done; + } + + // CUDA rendering to buffer + (void)memset(cudaInfo.bufCopy, (currFrame & 0xFF), cudaInfo.bufSize); + + cudaErr = cudaMemcpy2DAsync(buf->ptr, + cudaInfo.bufSize, + cudaInfo.bufCopy, + cudaInfo.bufSize, + cudaInfo.bufSize, + 1, + cudaMemcpyHostToDevice, + cudaInfo.stream); + if (cudaSuccess != cudaErr) { + fprintf(stderr, "Unable to initiate CUDA copy (%d)\n", cudaErr); + goto done; + } + + // Generate new fence for the sync object + if (!signalCudaFence(&cudaInfo, buf)) { + goto done; + } + + // Wait for operation to finish and compute checksum + // IMPORTANT NOTE: + // A normal stream application would not perform these steps. + // A checksum is not required for streaming, and waiting for + // operations to finish (which we only need because the + // checksum is calculated by the CPU) introduces bubbles + // in the hardware pipeline. A real application can rely on + // the generated NvSciSync fences for synchronization. + // These steps are only taken in this sample application + // because the consumer has no output visible to the user, + // so the checksum allows us to verify that the application + // is behaving properly. 
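+        // (A cudaStreamSynchronize(cudaInfo.stream) on the single stream
+        // used here would also be sufficient.)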
+ cudaDeviceSynchronize(); + buf->crc = GenerateCRC(cudaInfo.bufCopy, + 1, + cudaInfo.bufSize, + cudaInfo.bufSize); + + fprintf(stderr, "Producer wrote frame %d in buffer %d\n", + currFrame, currBuffer); + + // Mark buffer as owned by consumer now + buf->owner = 1U; + + // Export buffer index, checksum, and fence for transmission over IPC + packet.bufferId = currBuffer; + packet.crc = buf->crc; + sciErr = NvSciSyncIpcExportFence(&buf->fence, + ipcWrapper.endpoint, + &packet.fenceDesc); + if (NvSciError_Success != sciErr) { + fprintf(stderr, "Unable to export producer fence (%x)\n", sciErr); + goto done; + } + + // Send buffer index and fence to consumer + sciErr = ipcSend(&ipcWrapper, &packet, sizeof(packet)); + if (NvSciError_Success != sciErr) { + fprintf(stderr, + "Failure to send buffer to consumer (%x)\n", + sciErr); + goto done; + } + + fprintf(stderr, "Producer finished frame %d in buffer %d\n", + currFrame, currBuffer); + + // Advance buffer and frame + currBuffer = (currBuffer + 1U) % totalBuffers; + currFrame++; + } + + + // Success + *(int*)arg = 0; +done: + // Free CUDA resources + deinitCudaBuffer(buffers, totalBuffers); + deinitCuda(&cudaInfo); + + // Free NvSci objects + if (NULL != producerSignalAttrs) + NvSciSyncAttrListFree(producerSignalAttrs); + if (NULL != consumerWaitAttrs) + NvSciSyncAttrListFree(consumerWaitAttrs); + if (NULL != sendWaitListDesc) + NvSciSyncAttrListFreeDesc(sendWaitListDesc); + if (NULL != producerWaitAttrs) + NvSciSyncAttrListFree(producerWaitAttrs); + if (NULL != prodToConsAttrs) + NvSciSyncAttrListFree(prodToConsAttrs); + if (NULL != syncConflictAttrs) + NvSciSyncAttrListFree(syncConflictAttrs); + if (NULL != producerSignalObj) + NvSciSyncObjFree(producerSignalObj); + if (NULL != sendObjAndListDesc) + NvSciSyncAttrListAndObjFreeDesc(sendObjAndListDesc); + if (NULL != producerWaitObj) + NvSciSyncObjFree(producerWaitObj); + if (NULL != producerWriteAttrs) + NvSciBufAttrListFree(producerWriteAttrs); + if (NULL != consumerReadAttrs) + NvSciBufAttrListFree(consumerReadAttrs); + if (NULL != combinedBufAttrs) + NvSciBufAttrListFree(combinedBufAttrs); + if (NULL != sendBufListDesc) + NvSciBufAttrListFreeDesc(sendBufListDesc); + + // Free malloc'd resources + if (NULL != recvWaitListDesc) + free(recvWaitListDesc); + if (NULL != recvObjAndListDesc) + free(recvObjAndListDesc); + if (NULL != consumerReadAttrsDesc) + free(consumerReadAttrsDesc); + + fprintf(stderr, "Producer exiting\n"); + return NULL; +}