gpu: nvgpu: Add support for FECS ctxsw tracing

bug 1648908 This commit adds support for FECS ctxsw tracing. Code is compiled conditionally under CONFIG_GK20_CTXSW_TRACE. This feature requires an updated FECS ucode that writes one record to a ring buffer on each context switch. On the RM/Kernel side, the GPU driver reads records from the master ring buffer and generates trace entries into a user-facing VM ring buffer. For each record in the master ring buffer, RM/Kernel has to retrieve the vmid+pid of the user process that submitted the related work. Features currently implemented: - master ring buffer allocation - debugfs to dump master ring buffer - FECS record per context switch (with both current and new contexts) - dedicated device for ctxsw tracing (access to VM ring buffer) - SOF generation (and access to PTIMER) - VM ring buffer allocation, and reconfiguration - enable/disable tracing at user level - event-based trace filtering - context_ptr to vmid+pid mapping - read system call for ctxsw dev - mmap system call for ctxsw dev (direct access to VM ring buffer) - poll system call for ctxsw dev - save/restore register on ELPG/CG6 - separate user ring from FECS ring handling Features requiring ucode changes: - enable/disable tracing at FECS level - actual busy time on engine (bug 1642354) - master ring buffer threshold interrupt (P1) - API for GPU to CPU timestamp conversion (P1) - vmid/pid/uid based filtering (P1) Change-Id: I8e39c648221ee0fa09d5df8524b03dca83fe24f3 Signed-off-by: Thomas Fleury <tfleury@nvidia.com> Reviewed-on: http://git-master/r/1022737 GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
2025-12-23 01:50:07 +03:00 · 2015-08-19 14:27:51 -07:00
parent 82da6ed595
commit 1c40d09c4c
19 changed files with 1849 additions and 14 deletions
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -1215,4 +1215,94 @@ struct nvgpu_as_map_buffer_batch_args {
 #define NVGPU_AS_IOCTL_MAX_ARG_SIZE	\
 	sizeof(struct nvgpu_as_map_buffer_ex_args)

+
+/*
+ * /dev/nvhost-ctxsw-gpu device
+ *
+ * Opening a '/dev/nvhost-ctxsw-gpu' device node creates a way to trace
+ * context switches on GR engine
+ */
+
+#define NVGPU_CTXSW_IOCTL_MAGIC 'C'	/* ioctl type byte shared by all NVGPU_CTXSW_IOCTL_* commands */
+/* Record tags; presumably the values carried in nvgpu_ctxsw_trace_entry.tag -- TODO confirm. */
+#define NVGPU_CTXSW_TAG_SOF			0x00
+#define NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST	0x01
+#define NVGPU_CTXSW_TAG_FE_ACK			0x02
+#define NVGPU_CTXSW_TAG_FE_ACK_WFI		0x0a	/* 0x0a-0x0d: FE_ACK variants, listed here out of numeric order */
+#define NVGPU_CTXSW_TAG_FE_ACK_GFXP		0x0b
+#define NVGPU_CTXSW_TAG_FE_ACK_CTAP		0x0c
+#define NVGPU_CTXSW_TAG_FE_ACK_CILP		0x0d
+#define NVGPU_CTXSW_TAG_SAVE_END		0x03
+#define NVGPU_CTXSW_TAG_RESTORE_START		0x04
+#define NVGPU_CTXSW_TAG_CONTEXT_START		0x05
+#define NVGPU_CTXSW_TAG_INVALID_TIMESTAMP	0xff
+#define NVGPU_CTXSW_TAG_LAST			\
+	NVGPU_CTXSW_TAG_INVALID_TIMESTAMP	/* highest tag value (0xff); NVGPU_CTXSW_FILTER_SIZE is derived from it */
+
+struct nvgpu_ctxsw_trace_entry {	/* one trace record: 1+1+2+4+8+8 = 24 bytes, every member naturally aligned */
+	__u8 tag;		/* presumably one of NVGPU_CTXSW_TAG_* -- TODO confirm */
+	__u8 vmid;		/* VM id; presumably identifies the submitter together with pid -- TODO confirm */
+	__u16 seqno;		/* sequence number to detect drops */
+	__u32 context_id;	/* context_id as allocated by FECS */
+	__u64 pid;		/* 64-bit is max bits of different OS pid */
+	__u64 timestamp;	/* 64-bit time; NOTE(review): unit and clock source are not stated here */
+};
+
+#define NVGPU_CTXSW_RING_HEADER_MAGIC 0x7000fade	/* presumably the value expected in nvgpu_ctxsw_ring_header.magic */
+#define NVGPU_CTXSW_RING_HEADER_VERSION 0	/* presumably the value expected in nvgpu_ctxsw_ring_header.version */
+/* Ring buffer header: eight 32-bit words (32 bytes). NOTE(review): presumably placed at the start of the ring -- confirm. */
+struct nvgpu_ctxsw_ring_header {
+	__u32 magic;
+	__u32 version;
+	__u32 num_ents;		/* presumably the number of entries the ring holds -- TODO confirm */
+	__u32 ent_size;		/* presumably the size of one entry in bytes -- TODO confirm */
+	volatile __u32 drop_count;	/* excluding filtered out events */
+	volatile __u32 write_seqno;	/* NOTE(review): likely pairs with nvgpu_ctxsw_trace_entry.seqno -- confirm */
+	volatile __u32 write_idx;	/* NOTE(review): presumably the producer position, in entries -- confirm */
+	volatile __u32 read_idx;	/* NOTE(review): presumably the consumer position, in entries -- confirm */
+};
+
+struct nvgpu_ctxsw_ring_setup_args {
+	__u32 size;	/* [in/out] size of ring buffer in bytes (including
+			   header). will be rounded to page size. this
+			   parameter is updated with actual allocated size. */
+};
+
+#define NVGPU_CTXSW_FILTER_SIZE	(NVGPU_CTXSW_TAG_LAST + 1)	/* number of filter bits: one per possible tag value */
+#define NVGPU_CTXSW_FILTER_SET(n, p) \
+	((p)->tag_bits[(n) / 64] |=  (1ULL << ((n) & 63)))
+#define NVGPU_CTXSW_FILTER_CLR(n, p) \
+	((p)->tag_bits[(n) / 64] &= ~(1ULL << ((n) & 63)))
+#define NVGPU_CTXSW_FILTER_ISSET(n, p) \
+	((p)->tag_bits[(n) / 64] &   (1ULL << ((n) & 63)))
+#define NVGPU_CTXSW_FILTER_CLR_ALL(p)    memset((void *)(p), 0, sizeof(*(p)))
+#define NVGPU_CTXSW_FILTER_SET_ALL(p)    memset((void *)(p), ~0, sizeof(*(p)))
+/* Tag n is bit (n & 63) of tag_bits[n / 64]. Masks use 1ULL: an int "1 << k" is undefined for k >= 31 (tag 0xff is bit 63). */
+struct nvgpu_ctxsw_trace_filter {
+	__u64 tag_bits[(NVGPU_CTXSW_FILTER_SIZE + 63) / 64];	/* FILTER_SIZE bits rounded up to whole 64-bit words */
+};
+
+struct nvgpu_ctxsw_trace_filter_args {	/* argument type of NVGPU_CTXSW_IOCTL_SET_FILTER and NVGPU_CTXSW_IOCTL_GET_FILTER */
+	struct nvgpu_ctxsw_trace_filter filter;	/* tag bitmap; manipulate with the NVGPU_CTXSW_FILTER_* macros */
+};
+
+#define NVGPU_CTXSW_IOCTL_TRACE_ENABLE \
+	_IO(NVGPU_CTXSW_IOCTL_MAGIC, 1)	/* command 1, no argument struct */
+#define NVGPU_CTXSW_IOCTL_TRACE_DISABLE \
+	_IO(NVGPU_CTXSW_IOCTL_MAGIC, 2)	/* command 2, no argument struct */
+#define NVGPU_CTXSW_IOCTL_RING_SETUP \
+	_IOWR(NVGPU_CTXSW_IOCTL_MAGIC, 3, struct nvgpu_ctxsw_ring_setup_args)	/* command 3, argument is read and written back (size is in/out) */
+#define NVGPU_CTXSW_IOCTL_SET_FILTER \
+	_IOW(NVGPU_CTXSW_IOCTL_MAGIC, 4, struct nvgpu_ctxsw_trace_filter_args)	/* command 4, argument passed from user to kernel */
+#define NVGPU_CTXSW_IOCTL_GET_FILTER \
+	_IOR(NVGPU_CTXSW_IOCTL_MAGIC, 5, struct nvgpu_ctxsw_trace_filter_args)	/* command 5, argument returned from kernel to user */
+#define NVGPU_CTXSW_IOCTL_POLL \
+	_IO(NVGPU_CTXSW_IOCTL_MAGIC, 6)	/* command 6, no argument struct */
+/* Command number of the last ioctl defined above (6); must follow any newly added command. */
+#define NVGPU_CTXSW_IOCTL_LAST            \
+	_IOC_NR(NVGPU_CTXSW_IOCTL_POLL)
+/* Size of the filter args struct, the larger of the two argument structs used by the commands above. */
+#define NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE	\
+	sizeof(struct nvgpu_ctxsw_trace_filter_args)
+
 #endif