gpu: nvgpu: Add support for FECS ctxsw tracing

bug 1648908

This commit adds support for FECS ctxsw tracing. The code is compiled
conditionally under CONFIG_GK20_CTXSW_TRACE.
This feature requires an updated FECS ucode that writes one record to a ring
buffer on each context switch. On RM/Kernel side, the GPU driver reads records
from the master ring buffer and generates trace entries into a user-facing
VM ring buffer. For each record in the master ring buffer, RM/Kernel has
to retrieve the vmid+pid of the user process that submitted related work.

Features currently implemented:
- master ring buffer allocation
- debugfs to dump master ring buffer
- FECS record per context switch (with both current and new contexts)
- dedicated device for ctxsw tracing (access to VM ring buffer)
- SOF generation (and access to PTIMER)
- VM ring buffer allocation, and reconfiguration
- enable/disable tracing at user level
- event-based trace filtering
- context_ptr to vmid+pid mapping
- read system call for ctxsw dev
- mmap system call for ctxsw dev (direct access to VM ring buffer)
- poll system call for ctxsw dev
- save/restore register on ELPG/CG6
- separate user ring from FECS ring handling

Features requiring ucode changes:
- enable/disable tracing at FECS level
- actual busy time on engine (bug 1642354)
- master ring buffer threshold interrupt (P1)
- API for GPU to CPU timestamp conversion (P1)
- vmid/pid/uid based filtering (P1)

Change-Id: I8e39c648221ee0fa09d5df8524b03dca83fe24f3
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1022737
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
Anton Vorontsov
2015-08-19 14:27:51 -07:00
committed by Terje Bergstrom
parent 82da6ed595
commit 1c40d09c4c
19 changed files with 1849 additions and 14 deletions

View File

@@ -1215,4 +1215,94 @@ struct nvgpu_as_map_buffer_batch_args {
/*
 * Size in bytes of struct nvgpu_as_map_buffer_ex_args.
 * NOTE(review): presumably the largest argument of the AS ioctls, as the
 * name suggests; the struct is declared outside this hunk - confirm.
 */
#define NVGPU_AS_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_as_map_buffer_ex_args)
/*
* /dev/nvhost-ctxsw-gpu device
*
* Opening a '/dev/nvhost-ctxsw-gpu' device node creates a way to trace
* context switches on GR engine
*/
/* ioctl type character used by every NVGPU_CTXSW_IOCTL_* request. */
#define NVGPU_CTXSW_IOCTL_MAGIC			'C'

/*
 * Trace tag values, listed in numeric order.
 * NOTE(review): the meaning of each tag is not spelled out in this header;
 * confirm against the FECS ucode documentation before relying on it.
 */
#define NVGPU_CTXSW_TAG_SOF			0x00
#define NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST	0x01
#define NVGPU_CTXSW_TAG_FE_ACK			0x02
#define NVGPU_CTXSW_TAG_SAVE_END		0x03
#define NVGPU_CTXSW_TAG_RESTORE_START		0x04
#define NVGPU_CTXSW_TAG_CONTEXT_START		0x05

/* FE_ACK variants. */
#define NVGPU_CTXSW_TAG_FE_ACK_WFI		0x0a
#define NVGPU_CTXSW_TAG_FE_ACK_GFXP		0x0b
#define NVGPU_CTXSW_TAG_FE_ACK_CTAP		0x0c
#define NVGPU_CTXSW_TAG_FE_ACK_CILP		0x0d

#define NVGPU_CTXSW_TAG_INVALID_TIMESTAMP	0xff

/* Highest tag value; the filter bitmap is sized from it. */
#define NVGPU_CTXSW_TAG_LAST	NVGPU_CTXSW_TAG_INVALID_TIMESTAMP
/*
 * One context-switch trace record.
 *
 * Only fixed-width types are used; with natural alignment the fields pack
 * into 24 bytes without padding.
 */
struct nvgpu_ctxsw_trace_entry {
__u8 tag; /* presumably one of NVGPU_CTXSW_TAG_* - confirm */
__u8 vmid; /* NOTE(review): presumably the VM id of the submitter */
__u16 seqno; /* sequence number to detect drops */
__u32 context_id; /* context_id as allocated by FECS */
__u64 pid; /* 64-bit is max bits of different OS pid */
__u64 timestamp; /* 64-bit time */
};
/* Expected values of the magic and version fields of the ring header. */
#define NVGPU_CTXSW_RING_HEADER_MAGIC 0x7000fade
#define NVGPU_CTXSW_RING_HEADER_VERSION 0
/*
 * Header describing a ring of trace entries.
 *
 * NOTE(review): the volatile fields are presumably updated concurrently by
 * the kernel (producer) and by user space (consumer) through the mmap'ed
 * ring - confirm against the driver. volatile alone provides no ordering
 * or atomicity guarantees.
 */
struct nvgpu_ctxsw_ring_header {
__u32 magic; /* presumably NVGPU_CTXSW_RING_HEADER_MAGIC - confirm */
__u32 version; /* presumably NVGPU_CTXSW_RING_HEADER_VERSION - confirm */
__u32 num_ents; /* NOTE(review): presumably number of entries in the ring */
__u32 ent_size; /* NOTE(review): presumably size of one entry in bytes */
volatile __u32 drop_count; /* excluding filtered out events */
volatile __u32 write_seqno;
volatile __u32 write_idx;
volatile __u32 read_idx;
};
/* Argument of NVGPU_CTXSW_IOCTL_RING_SETUP. */
struct nvgpu_ctxsw_ring_setup_args {
__u32 size; /* [in/out] size of ring buffer in bytes (including
header). It is rounded to the page size, and this
parameter is updated with the actual allocated size. */
};
/*
 * Tag filter helpers.
 *
 * NVGPU_CTXSW_FILTER_SIZE is the number of filter bits: one per tag value
 * from 0 to NVGPU_CTXSW_TAG_LAST inclusive.
 *
 * In SET/CLR/ISSET, n is a tag value and p points to an object with a
 * tag_bits array of 64-bit words (struct nvgpu_ctxsw_trace_filter).
 * n is expanded twice, so it must not have side effects.
 *
 * The mask is built from 1ULL so that the shift is done on a 64-bit
 * unsigned value. A plain int 1 is undefined for bit positions 32..63 and
 * sign-extends for bit 31, which corrupts the upper half of the word.
 *
 * ISSET yields the masked 64-bit word (non-zero when the bit is set); test
 * it for truth rather than storing it in a narrower integer.
 *
 * CLR_ALL/SET_ALL clear or set every bit of *p.
 */
#define NVGPU_CTXSW_FILTER_SIZE	(NVGPU_CTXSW_TAG_LAST + 1)
#define NVGPU_CTXSW_FILTER_SET(n, p) \
	((p)->tag_bits[(n) / 64] |= (1ULL << ((n) & 63)))
#define NVGPU_CTXSW_FILTER_CLR(n, p) \
	((p)->tag_bits[(n) / 64] &= ~(1ULL << ((n) & 63)))
#define NVGPU_CTXSW_FILTER_ISSET(n, p) \
	((p)->tag_bits[(n) / 64] & (1ULL << ((n) & 63)))
#define NVGPU_CTXSW_FILTER_CLR_ALL(p)	memset((void *)(p), 0, sizeof(*(p)))
#define NVGPU_CTXSW_FILTER_SET_ALL(p)	memset((void *)(p), ~0, sizeof(*(p)))
/*
 * Tag filter bitmap: NVGPU_CTXSW_FILTER_SIZE bits, rounded up to a whole
 * number of 64-bit words.
 */
struct nvgpu_ctxsw_trace_filter {
__u64 tag_bits[(NVGPU_CTXSW_FILTER_SIZE + 63) / 64];
};
/*
 * Argument of NVGPU_CTXSW_IOCTL_SET_FILTER and
 * NVGPU_CTXSW_IOCTL_GET_FILTER.
 */
struct nvgpu_ctxsw_trace_filter_args {
struct nvgpu_ctxsw_trace_filter filter;
};
/*
 * ioctl requests, all of type NVGPU_CTXSW_IOCTL_MAGIC.
 *
 * TRACE_ENABLE, TRACE_DISABLE and POLL carry no argument (_IO).
 * RING_SETUP passes struct nvgpu_ctxsw_ring_setup_args in both
 * directions (_IOWR).
 * SET_FILTER passes struct nvgpu_ctxsw_trace_filter_args to the kernel
 * (_IOW); GET_FILTER returns it to the caller (_IOR).
 */
#define NVGPU_CTXSW_IOCTL_TRACE_ENABLE \
_IO(NVGPU_CTXSW_IOCTL_MAGIC, 1)
#define NVGPU_CTXSW_IOCTL_TRACE_DISABLE \
_IO(NVGPU_CTXSW_IOCTL_MAGIC, 2)
#define NVGPU_CTXSW_IOCTL_RING_SETUP \
_IOWR(NVGPU_CTXSW_IOCTL_MAGIC, 3, struct nvgpu_ctxsw_ring_setup_args)
#define NVGPU_CTXSW_IOCTL_SET_FILTER \
_IOW(NVGPU_CTXSW_IOCTL_MAGIC, 4, struct nvgpu_ctxsw_trace_filter_args)
#define NVGPU_CTXSW_IOCTL_GET_FILTER \
_IOR(NVGPU_CTXSW_IOCTL_MAGIC, 5, struct nvgpu_ctxsw_trace_filter_args)
#define NVGPU_CTXSW_IOCTL_POLL \
_IO(NVGPU_CTXSW_IOCTL_MAGIC, 6)
/* Highest request number defined above (that of POLL). */
#define NVGPU_CTXSW_IOCTL_LAST \
_IOC_NR(NVGPU_CTXSW_IOCTL_POLL)
/*
 * Size of the larger of the two argument structs used above
 * (the filter args; the ring setup args hold a single __u32).
 */
#define NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_ctxsw_trace_filter_args)
#endif