gpu: nvgpu: Add support for FECS ctxsw tracing

bug 1648908

This commit adds support for FECS ctxsw tracing. The code is compiled
conditionally under CONFIG_GK20A_CTXSW_TRACE.
This feature requires an updated FECS ucode that writes one record to a ring
buffer on each context switch. On the RM/Kernel side, the GPU driver reads
records from the master ring buffer and generates trace entries into a
user-facing VM ring buffer. For each record in the master ring buffer, the
RM/Kernel has to retrieve the vmid+pid of the user process that submitted the
related work.
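
For illustration, a minimal, hypothetical user-space consumer of the new ctxsw
device could look like the sketch below (not part of this change). It assumes
the uapi additions below are installed as <linux/nvgpu.h>, omits all error
handling, and picks an arbitrary 64 KiB ring size:

/*
 * Hypothetical consumer sketch: open the ctxsw device, set up and mmap the
 * VM ring buffer, then drain trace entries as they arrive.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/nvgpu.h>

int main(void)
{
	struct nvgpu_ctxsw_ring_setup_args setup = { .size = 64 * 1024 };
	struct nvgpu_ctxsw_ring_header *hdr;
	struct nvgpu_ctxsw_trace_entry *ents;
	struct pollfd pfd;
	int fd;

	fd = open("/dev/nvhost-ctxsw-gpu", O_RDWR);	/* needs CAP_SYS_ADMIN */
	ioctl(fd, NVGPU_CTXSW_IOCTL_RING_SETUP, &setup);
	ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_ENABLE);

	/* VM ring buffer layout: header immediately followed by entries */
	hdr = mmap(NULL, setup.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	ents = (struct nvgpu_ctxsw_trace_entry *)(hdr + 1);

	pfd.fd = fd;
	pfd.events = POLLIN;
	while (poll(&pfd, 1, -1) > 0) {
		ioctl(fd, NVGPU_CTXSW_IOCTL_POLL);	/* optional: force a FECS ring flush */
		while (hdr->read_idx != hdr->write_idx) {
			struct nvgpu_ctxsw_trace_entry *e = &ents[hdr->read_idx];

			printf("tag=%02x pid=%llu ts=%016llx\n", e->tag,
			       (unsigned long long)e->pid,
			       (unsigned long long)e->timestamp);
			/* advancing read_idx frees space for the kernel writer */
			hdr->read_idx = (hdr->read_idx + 1) % hdr->num_ents;
		}
	}
	munmap(hdr, setup.size);
	close(fd);
	return 0;
}

The same entries can also be drained with plain read()/poll() on the device
instead of the mmap'ed ring.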

Features currently implemented:
- master ring buffer allocation
- debugfs to dump master ring buffer
- FECS record per context switch (with both current and new contexts)
- dedicated device for ctxsw tracing (access to VM ring buffer)
- SOF generation (and access to PTIMER)
- VM ring buffer allocation and reconfiguration
- enable/disable tracing at user level
- event-based trace filtering
- context_ptr to vmid+pid mapping
- read system call for ctxsw dev
- mmap system call for ctxsw dev (direct access to VM ring buffer)
- poll system call for ctxsw dev
- save/restore registers on ELPG/GC6
- separate user ring from FECS ring handling

Features requiring ucode changes:
- enable/disable tracing at FECS level
- actual busy time on engine (bug 1642354)
- master ring buffer threshold interrupt (P1)
- API for GPU to CPU timestamp conversion (P1)
- vmid/pid/uid based filtering (P1)

Change-Id: I8e39c648221ee0fa09d5df8524b03dca83fe24f3
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1022737
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
Anton Vorontsov
2015-08-19 14:27:51 -07:00
committed by Terje Bergstrom
parent 82da6ed595
commit 1c40d09c4c
19 changed files with 1849 additions and 14 deletions

View File

@@ -54,6 +54,16 @@ config GK20A_CYCLE_STATS
help
Say Y here to enable the cycle stats debugging features.
config GK20A_CTXSW_TRACE
bool "Support GK20A Context Switch tracing"
depends on GK20A
default n
help
Enable support for GK20A context switch tracing. In this mode,
FECS collects timestamps for contexts loaded on GR engine. This
allows tracking context switches on GR engine, as well as
identifying processes that submitted work.
config TEGRA_GK20A
bool "Enable the GK20A GPU on Tegra"
depends on TEGRA_GRHOST || TEGRA_HOST1X

View File

@@ -46,6 +46,8 @@ nvgpu-y := \
gk20a/cde_gk20a.o \
gk20a/platform_gk20a_generic.o \
gk20a/tsg_gk20a.o \
gk20a/ctxsw_trace_gk20a.o \
gk20a/fecs_trace_gk20a.o \
gk20a/mc_gk20a.o \
gm20b/hal_gm20b.o \
gm20b/ltc_gm20b.o \
@@ -64,7 +66,6 @@ nvgpu-y := \
gm20b/debug_gm20b.o \
gm20b/cde_gm20b.o \
gm20b/therm_gm20b.o
nvgpu-$(CONFIG_TEGRA_GK20A) += gk20a/platform_gk20a_tegra.o
nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o
@@ -78,6 +79,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
vgpu/debug_vgpu.o \
vgpu/vgpu.o \
vgpu/dbg_vgpu.o \
vgpu/fecs_trace_vgpu.o \
vgpu/gk20a/vgpu_hal_gk20a.o \
vgpu/gk20a/vgpu_gr_gk20a.o \
vgpu/gm20b/vgpu_hal_gm20b.o \

View File

@@ -28,6 +28,7 @@
#include <linux/vmalloc.h>
#include "debug_gk20a.h"
#include "ctxsw_trace_gk20a.h"
#include "gk20a.h"
#include "dbg_gpu_gk20a.h"
@@ -920,6 +921,9 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
gk20a_free_error_notifiers(ch);
if (g->ops.fecs_trace.unbind_channel)
g->ops.fecs_trace.unbind_channel(g, ch);
/* release channel ctx */
g->ops.gr.free_channel_ctx(ch);

View File

@@ -0,0 +1,586 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <asm/barrier.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/ktime.h>
#include <linux/nvgpu.h>
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <uapi/linux/nvgpu.h>
#include "ctxsw_trace_gk20a.h"
#include "gk20a.h"
#include "gr_gk20a.h"
#include "hw_ctxsw_prog_gk20a.h"
#include "hw_gr_gk20a.h"
#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE (128*PAGE_SIZE)
/* Userland-facing FIFO (one global + eventually one per VM) */
struct gk20a_ctxsw_dev {
struct gk20a *g;
struct nvgpu_ctxsw_ring_header *hdr;
struct nvgpu_ctxsw_trace_entry *ents;
struct nvgpu_ctxsw_trace_filter filter;
bool write_enabled;
wait_queue_head_t readout_wq;
size_t size;
atomic_t vma_ref;
struct mutex lock;
};
struct gk20a_ctxsw_trace {
struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS];
};
static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr)
{
return (hdr->write_idx == hdr->read_idx);
}
static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr)
{
return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx;
}
static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr)
{
return (hdr->write_idx - hdr->read_idx) % hdr->num_ents;
}
static inline int ring_space(struct nvgpu_ctxsw_ring_header *hdr)
{
return (hdr->read_idx - hdr->write_idx - 1) % hdr->num_ents;
}
ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size,
loff_t *off)
{
struct gk20a_ctxsw_dev *dev = filp->private_data;
struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
struct nvgpu_ctxsw_trace_entry __user *entry =
(struct nvgpu_ctxsw_trace_entry *) buf;
size_t copied = 0;
int err;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
"filp=%p buf=%p size=%zu", filp, buf, size);
mutex_lock(&dev->lock);
while (ring_is_empty(hdr)) {
mutex_unlock(&dev->lock);
if (filp->f_flags & O_NONBLOCK)
return -EAGAIN;
err = wait_event_interruptible(dev->readout_wq,
!ring_is_empty(hdr));
if (err)
return err;
mutex_lock(&dev->lock);
}
while (size >= sizeof(struct nvgpu_ctxsw_trace_entry)) {
if (ring_is_empty(hdr))
break;
if (copy_to_user(entry, &dev->ents[hdr->read_idx],
sizeof(*entry))) {
mutex_unlock(&dev->lock);
return -EFAULT;
}
hdr->read_idx++;
if (hdr->read_idx >= hdr->num_ents)
hdr->read_idx = 0;
entry++;
copied += sizeof(*entry);
size -= sizeof(*entry);
}
gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied,
hdr->read_idx);
*off = hdr->read_idx;
mutex_unlock(&dev->lock);
return copied;
}
static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev)
{
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled");
dev->write_enabled = true;
return 0;
}
static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev)
{
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled");
dev->write_enabled = false;
return 0;
}
static int gk20a_ctxsw_dev_ring_alloc(struct gk20a_ctxsw_dev *dev,
size_t size)
{
struct nvgpu_ctxsw_ring_header *hdr;
if (dev->write_enabled || atomic_read(&dev->vma_ref))
return -EBUSY;
size = roundup(size, PAGE_SIZE);
hdr = vmalloc_user(size);
if (!hdr)
return -ENOMEM;
if (dev->hdr)
vfree(dev->hdr);
dev->hdr = hdr;
dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1);
dev->size = size;
hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC;
hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION;
hdr->num_ents = (size - sizeof(struct nvgpu_ctxsw_ring_header))
/ sizeof(struct nvgpu_ctxsw_trace_entry);
hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry);
hdr->drop_count = 0;
hdr->read_idx = 0;
hdr->write_idx = 0;
hdr->write_seqno = 0;
gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d",
dev->size, dev->hdr, dev->ents, hdr->num_ents);
return 0;
}
static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev,
struct nvgpu_ctxsw_ring_setup_args *args)
{
size_t size = args->size;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size);
if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE)
return -EINVAL;
return gk20a_ctxsw_dev_ring_alloc(dev, size);
}
static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev,
struct nvgpu_ctxsw_trace_filter_args *args)
{
dev->filter = args->filter;
return 0;
}
static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev,
struct nvgpu_ctxsw_trace_filter_args *args)
{
args->filter = dev->filter;
return 0;
}
static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev)
{
struct gk20a *g = dev->g;
int err;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
err = gk20a_busy(g->dev);
if (err)
return err;
if (g->ops.fecs_trace.flush)
err = g->ops.fecs_trace.flush(g);
if (likely(!err))
err = g->ops.fecs_trace.poll(g);
gk20a_idle(g->dev);
return err;
}
int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp)
{
struct gk20a *g;
struct gk20a_ctxsw_trace *trace;
struct gk20a_ctxsw_dev *dev;
int err;
size_t size;
u32 n;
/* only one VM for now */
const int vmid = 0;
g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = gk20a_busy(g->dev);
if (err)
return err;
trace = g->ctxsw_trace;
if (!trace) {
err = -ENODEV;
goto idle;
}
/* Allow only one user for this device */
dev = &trace->devs[vmid];
mutex_lock(&dev->lock);
if (dev->hdr) {
err = -EBUSY;
goto done;
}
/* By default, allocate ring buffer big enough to accommodate
* FECS records with default event filter */
/* enable all traces by default */
NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter);
/* compute max number of entries generated with this filter */
n = g->ops.fecs_trace.max_entries(g, &dev->filter);
size = sizeof(struct nvgpu_ctxsw_ring_header) +
n * sizeof(struct nvgpu_ctxsw_trace_entry);
gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu",
size, n, sizeof(struct nvgpu_ctxsw_trace_entry));
err = gk20a_ctxsw_dev_ring_alloc(dev, size);
if (!err) {
filp->private_data = dev;
gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu",
filp, dev, size);
}
err = g->ops.fecs_trace.enable(g);
done:
mutex_unlock(&dev->lock);
idle:
gk20a_idle(g->dev);
return err;
}
int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp)
{
struct gk20a_ctxsw_dev *dev = filp->private_data;
struct gk20a *g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev);
mutex_lock(&dev->lock);
dev->write_enabled = false;
if (dev->hdr) {
vfree(dev->hdr);
dev->hdr = NULL;
}
g->ops.fecs_trace.disable(g);
mutex_unlock(&dev->lock);
return 0;
}
long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct gk20a_ctxsw_dev *dev = filp->private_data;
struct gk20a *g = dev->g;
u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE];
int err = 0;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd));
if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || (_IOC_NR(cmd) == 0)
|| (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST))
return -EINVAL;
BUG_ON(_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE);
memset(buf, 0, sizeof(buf));
if (_IOC_DIR(cmd) & _IOC_WRITE) {
if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd)))
return -EFAULT;
}
mutex_lock(&dev->lock);
switch (cmd) {
case NVGPU_CTXSW_IOCTL_TRACE_ENABLE:
err = gk20a_ctxsw_dev_ioctl_trace_enable(dev);
break;
case NVGPU_CTXSW_IOCTL_TRACE_DISABLE:
err = gk20a_ctxsw_dev_ioctl_trace_disable(dev);
break;
case NVGPU_CTXSW_IOCTL_RING_SETUP:
err = gk20a_ctxsw_dev_ioctl_ring_setup(dev,
(struct nvgpu_ctxsw_ring_setup_args *) buf);
break;
case NVGPU_CTXSW_IOCTL_SET_FILTER:
err = gk20a_ctxsw_dev_ioctl_set_filter(dev,
(struct nvgpu_ctxsw_trace_filter_args *) buf);
break;
case NVGPU_CTXSW_IOCTL_GET_FILTER:
err = gk20a_ctxsw_dev_ioctl_get_filter(dev,
(struct nvgpu_ctxsw_trace_filter_args *) buf);
break;
case NVGPU_CTXSW_IOCTL_POLL:
mutex_unlock(&dev->lock);
err = gk20a_ctxsw_dev_ioctl_poll(dev);
mutex_lock(&dev->lock);
break;
default:
dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x",
cmd);
err = -ENOTTY;
}
mutex_unlock(&dev->lock);
if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
err = copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd));
return err;
}
unsigned int gk20a_ctxsw_dev_poll(struct file *filp, poll_table *wait)
{
struct gk20a_ctxsw_dev *dev = filp->private_data;
struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
unsigned int mask = 0;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
mutex_lock(&dev->lock);
poll_wait(filp, &dev->readout_wq, wait);
if (!ring_is_empty(hdr))
mask |= POLLIN | POLLRDNORM;
mutex_unlock(&dev->lock);
return mask;
}
static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma)
{
struct gk20a_ctxsw_dev *dev = vma->vm_private_data;
atomic_inc(&dev->vma_ref);
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
atomic_read(&dev->vma_ref));
}
static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma)
{
struct gk20a_ctxsw_dev *dev = vma->vm_private_data;
atomic_dec(&dev->vma_ref);
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
atomic_read(&dev->vma_ref));
}
static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = {
.open = gk20a_ctxsw_dev_vma_open,
.close = gk20a_ctxsw_dev_vma_close,
};
int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct gk20a_ctxsw_dev *dev = filp->private_data;
int ret;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx",
vma->vm_start, vma->vm_end);
ret = remap_vmalloc_range(vma, dev->hdr, 0);
if (likely(!ret)) {
vma->vm_private_data = dev;
vma->vm_ops = &gk20a_ctxsw_dev_vma_ops;
vma->vm_ops->open(vma);
}
return ret;
}
#ifdef CONFIG_GK20A_CTXSW_TRACE
static int gk20a_ctxsw_init_devs(struct gk20a *g)
{
struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
struct gk20a_ctxsw_dev *dev = trace->devs;
int i;
for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) {
dev->g = g;
dev->hdr = NULL;
dev->write_enabled = false;
init_waitqueue_head(&dev->readout_wq);
mutex_init(&dev->lock);
atomic_set(&dev->vma_ref, 0);
dev++;
}
return 0;
}
#endif
int gk20a_ctxsw_trace_init(struct gk20a *g)
{
#ifdef CONFIG_GK20A_CTXSW_TRACE
struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
int err;
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace);
if (likely(trace))
return 0;
trace = kzalloc(sizeof(*trace), GFP_KERNEL);
if (unlikely(!trace))
return -ENOMEM;
g->ctxsw_trace = trace;
err = gk20a_ctxsw_init_devs(g);
if (err)
goto fail;
err = g->ops.fecs_trace.init(g);
if (unlikely(err))
goto fail;
return 0;
fail:
kfree(trace);
g->ctxsw_trace = NULL;
return err;
#else
return 0;
#endif
}
void gk20a_ctxsw_trace_cleanup(struct gk20a *g)
{
#ifdef CONFIG_GK20A_CTXSW_TRACE
kfree(g->ctxsw_trace);
g->ctxsw_trace = NULL;
g->ops.fecs_trace.deinit(g);
#endif
}
int gk20a_ctxsw_trace_write(struct gk20a *g,
struct nvgpu_ctxsw_trace_entry *entry)
{
struct nvgpu_ctxsw_ring_header *hdr;
struct gk20a_ctxsw_dev *dev;
int ret = 0;
const char *reason;
if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS))
return -ENODEV;
dev = &g->ctxsw_trace->devs[entry->vmid];
hdr = dev->hdr;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
"dev=%p hdr=%p", dev, hdr);
mutex_lock(&dev->lock);
if (unlikely(!hdr)) {
/* device has been released */
ret = -ENODEV;
goto done;
}
entry->seqno = hdr->write_seqno++;
if (!dev->write_enabled) {
ret = -EBUSY;
reason = "write disabled";
goto drop;
}
if (unlikely(ring_is_full(hdr))) {
ret = -ENOSPC;
reason = "user fifo full";
goto drop;
}
if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) {
reason = "filtered out";
goto filter;
}
gk20a_dbg(gpu_dbg_ctxsw,
"seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx",
entry->seqno, entry->context_id, entry->pid,
entry->tag, entry->timestamp);
dev->ents[hdr->write_idx] = *entry;
/* ensure record is written before updating write index */
smp_wmb();
hdr->write_idx++;
if (unlikely(hdr->write_idx >= hdr->num_ents))
hdr->write_idx = 0;
gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d",
hdr->read_idx, hdr->write_idx, ring_len(hdr));
mutex_unlock(&dev->lock);
return ret;
drop:
hdr->drop_count++;
filter:
gk20a_dbg(gpu_dbg_ctxsw,
"dropping seqno=%d context_id=%08x pid=%lld "
"tag=%x time=%llx (%s)",
entry->seqno, entry->context_id, entry->pid,
entry->tag, entry->timestamp, reason);
done:
mutex_unlock(&dev->lock);
return ret;
}
void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid)
{
struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[vmid];
wake_up_interruptible(&dev->readout_wq);
}

View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef __CTXSW_TRACE_GK20A_H
#define __CTXSW_TRACE_GK20A_H
#define GK20A_CTXSW_TRACE_NUM_DEVS 1
struct gk20a;
struct nvgpu_ctxsw_trace_entry;
struct channel_gk20a;
struct channel_ctx_gk20a;
struct gk20a_ctxsw_dev;
struct gk20a_fecs_trace;
int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp);
int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp);
long gk20a_ctxsw_dev_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg);
ssize_t gk20a_ctxsw_dev_read(struct file *, char __user *, size_t, loff_t *);
unsigned int gk20a_ctxsw_dev_poll(struct file *, struct poll_table_struct *);
int gk20a_ctxsw_dev_mmap(struct file *, struct vm_area_struct *);
int gk20a_ctxsw_trace_init(struct gk20a *);
int gk20a_ctxsw_trace_setup(struct gk20a *, void *ctx_ptr);
void gk20a_ctxsw_trace_cleanup(struct gk20a *);
int gk20a_ctxsw_trace_write(struct gk20a *, struct nvgpu_ctxsw_trace_entry *);
void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid);
#endif /* __CTXSW_TRACE_GK20A_H */

View File

@@ -0,0 +1,763 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <asm/barrier.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/ktime.h>
#include <linux/nvgpu.h>
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <uapi/linux/nvgpu.h>
#include "ctxsw_trace_gk20a.h"
#include "fecs_trace_gk20a.h"
#include "gk20a.h"
#include "gr_gk20a.h"
#include "hw_ctxsw_prog_gk20a.h"
#include "hw_gr_gk20a.h"
/*
* If the HW circular buffer is getting too many "buffer full" conditions,
* increasing this constant should help (it drives the size of the kernel's
* internal buffer).
*/
#define GK20A_FECS_TRACE_NUM_RECORDS (1 << 6)
#define GK20A_FECS_TRACE_HASH_BITS 8 /* 2^8 */
#define GK20A_FECS_TRACE_FRAME_PERIOD_NS (1000000000ULL/60ULL)
#define GK20A_FECS_TRACE_PTIMER_SHIFT 5
struct gk20a_fecs_trace_record {
u32 magic_lo;
u32 magic_hi;
u32 context_id;
u32 context_ptr;
u32 new_context_id;
u32 new_context_ptr;
u64 ts[];
};
struct gk20a_fecs_trace_hash_ent {
u32 context_ptr;
pid_t pid;
struct hlist_node node;
};
struct gk20a_fecs_trace {
struct mem_desc trace_buf;
DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
struct mutex hash_lock;
struct mutex poll_lock;
u64 sof;
u32 sof_mask; /* did we already send a SOF for this VM */
struct task_struct *poll_task;
};
#ifdef CONFIG_GK20A_CTXSW_TRACE
static inline u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
{
return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
}
static inline u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
{
return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
}
static u32 gk20a_fecs_trace_fecs_context_ptr(struct channel_gk20a *ch)
{
return (u32) (sg_phys(ch->inst_block.sgt->sgl) >> 12LL);
}
static inline int gk20a_fecs_trace_num_ts(void)
{
return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
- sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
}
struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
struct gk20a_fecs_trace *trace, int idx)
{
return (struct gk20a_fecs_trace_record *)
((u8 *) trace->trace_buf.cpu_va
+ (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
}
static bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
{
/*
* testing magic_hi should suffice. magic_lo is sometimes used
* as a sequence number in experimental ucode.
*/
return (r->magic_hi
== ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
}
static int gk20a_fecs_trace_get_read_index(struct gk20a *g)
{
return gr_gk20a_elpg_protected_call(g,
gk20a_readl(g, gr_fecs_mailbox1_r()));
}
static int gk20a_fecs_trace_get_write_index(struct gk20a *g)
{
return gr_gk20a_elpg_protected_call(g,
gk20a_readl(g, gr_fecs_mailbox0_r()));
}
static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
{
gk20a_dbg(gpu_dbg_ctxsw, "set read=%d", index);
return gr_gk20a_elpg_protected_call(g,
(gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
}
void gk20a_fecs_trace_hash_dump(struct gk20a *g)
{
u32 bkt;
struct gk20a_fecs_trace_hash_ent *ent;
struct gk20a_fecs_trace *trace = g->fecs_trace;
gk20a_dbg(gpu_dbg_ctxsw, "dumping hash table");
mutex_lock(&trace->hash_lock);
hash_for_each(trace->pid_hash_table, bkt, ent, node)
{
gk20a_dbg(gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d",
ent, bkt, ent->context_ptr, ent->pid);
}
mutex_unlock(&trace->hash_lock);
}
static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr, pid_t pid)
{
struct gk20a_fecs_trace_hash_ent *he;
struct gk20a_fecs_trace *trace = g->fecs_trace;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
"adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);
he = kzalloc(sizeof(*he), GFP_KERNEL);
if (unlikely(!he)) {
gk20a_warn(dev_from_gk20a(g),
"can't alloc new hash entry for context_ptr=%x pid=%d",
context_ptr, pid);
return -ENOMEM;
}
he->context_ptr = context_ptr;
he->pid = pid;
mutex_lock(&trace->hash_lock);
hash_add(trace->pid_hash_table, &he->node, context_ptr);
mutex_unlock(&trace->hash_lock);
return 0;
}
static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
{
struct hlist_node *tmp;
struct gk20a_fecs_trace_hash_ent *ent;
struct gk20a_fecs_trace *trace = g->fecs_trace;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
"freeing hash entry context_ptr=%x", context_ptr);
mutex_lock(&trace->hash_lock);
hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
context_ptr) {
if (ent->context_ptr == context_ptr) {
hash_del(&ent->node);
gk20a_dbg(gpu_dbg_ctxsw,
"freed hash entry=%p context_ptr=%x", ent,
ent->context_ptr);
kfree(ent);
break;
}
}
mutex_unlock(&trace->hash_lock);
}
static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
{
u32 bkt;
struct hlist_node *tmp;
struct gk20a_fecs_trace_hash_ent *ent;
struct gk20a_fecs_trace *trace = g->fecs_trace;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);
mutex_lock(&trace->hash_lock);
hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
hash_del(&ent->node);
kfree(ent);
}
mutex_unlock(&trace->hash_lock);
}
static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
{
struct gk20a_fecs_trace_hash_ent *ent;
struct gk20a_fecs_trace *trace = g->fecs_trace;
pid_t pid = 0;
mutex_lock(&trace->hash_lock);
hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
if (ent->context_ptr == context_ptr) {
gk20a_dbg(gpu_dbg_ctxsw,
"found context_ptr=%x -> pid=%d",
ent->context_ptr, ent->pid);
pid = ent->pid;
break;
}
}
mutex_unlock(&trace->hash_lock);
return pid;
}
/*
* Converts HW entry format to userspace-facing format and pushes it to the
* queue.
*/
static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
{
int i;
struct nvgpu_ctxsw_trace_entry entry = { };
struct gk20a_fecs_trace *trace = g->fecs_trace;
pid_t cur_pid;
pid_t new_pid;
/* for now, only one VM */
const int vmid = 0;
struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(
trace, index);
gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
"consuming record trace=%p read=%d record=%p", trace, index, r);
if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
gk20a_warn(dev_from_gk20a(g),
"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
trace, index, r, r->magic_lo, r->magic_hi);
return -EINVAL;
}
cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);
gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
"context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
r->context_ptr, cur_pid, r->new_context_ptr, new_pid);
entry.context_id = r->context_id;
entry.vmid = vmid;
/* insert SOF event if needed */
if (!(trace->sof_mask & BIT(vmid))) {
entry.tag = NVGPU_CTXSW_TAG_SOF;
entry.timestamp = trace->sof;
entry.context_id = 0;
entry.pid = 0;
gk20a_dbg(gpu_dbg_ctxsw, "SOF time=%llx", entry.timestamp);
gk20a_ctxsw_trace_write(g, &entry);
trace->sof_mask |= BIT(vmid);
}
/* break out FECS record into trace events */
for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
gk20a_dbg(gpu_dbg_ctxsw,
"tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
entry.tag, entry.timestamp, r->context_id,
r->new_context_id);
switch (entry.tag) {
case NVGPU_CTXSW_TAG_RESTORE_START:
case NVGPU_CTXSW_TAG_CONTEXT_START:
entry.context_id = r->new_context_id;
entry.pid = new_pid;
break;
case NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
case NVGPU_CTXSW_TAG_FE_ACK:
case NVGPU_CTXSW_TAG_FE_ACK_WFI:
case NVGPU_CTXSW_TAG_FE_ACK_GFXP:
case NVGPU_CTXSW_TAG_FE_ACK_CTAP:
case NVGPU_CTXSW_TAG_FE_ACK_CILP:
case NVGPU_CTXSW_TAG_SAVE_END:
entry.context_id = r->context_id;
entry.pid = cur_pid;
break;
default:
/* tags are not guaranteed to start at the beginning */
WARN_ON(entry.tag && (entry.tag != NVGPU_CTXSW_TAG_INVALID_TIMESTAMP));
continue;
}
gk20a_dbg(gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
entry.tag, entry.context_id, entry.pid);
if (!entry.context_id)
continue;
gk20a_ctxsw_trace_write(g, &entry);
}
gk20a_ctxsw_trace_wake_up(g, vmid);
return 0;
}
static int gk20a_fecs_trace_poll(struct gk20a *g)
{
struct gk20a_fecs_trace *trace = g->fecs_trace;
int read = 0;
int write = 0;
int cnt;
int err;
err = gk20a_busy(g->dev);
if (unlikely(err))
return err;
mutex_lock(&trace->poll_lock);
write = gk20a_fecs_trace_get_write_index(g);
if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
gk20a_err(dev_from_gk20a(g),
"failed to acquire write index, write=%d", write);
err = write;
goto done;
}
read = gk20a_fecs_trace_get_read_index(g);
cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
if (!cnt)
goto done;
gk20a_dbg(gpu_dbg_ctxsw,
"circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
read, gk20a_fecs_trace_get_read_index(g), write, cnt);
/* we did not send any SOF yet */
trace->sof_mask = 0;
/* consume all records */
while (read != write) {
gk20a_fecs_trace_ring_read(g, read);
/* Get to next record. */
read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
gk20a_fecs_trace_set_read_index(g, read);
}
done:
/*
* OK, we read out all the entries... a new "frame" starts here.
* We remember the Start Of Frame time and insert it on the next
* iteration.
*/
trace->sof = gk20a_read_ptimer(g);
mutex_unlock(&trace->poll_lock);
gk20a_idle(g->dev);
return err;
}
static int gk20a_fecs_trace_periodic_polling(void *arg)
{
struct gk20a *g = (struct gk20a *)arg;
struct timespec ts = ns_to_timespec(GK20A_FECS_TRACE_FRAME_PERIOD_NS);
pr_info("%s: running\n", __func__);
while (!kthread_should_stop()) {
hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
gk20a_fecs_trace_poll(g);
}
return 0;
}
static int gk20a_fecs_trace_alloc_ring(struct gk20a *g)
{
struct gk20a_fecs_trace *trace = g->fecs_trace;
return gk20a_gmmu_alloc(g, GK20A_FECS_TRACE_NUM_RECORDS
* ctxsw_prog_record_timestamp_record_size_in_bytes_v(),
&trace->trace_buf);
}
static void gk20a_fecs_trace_free_ring(struct gk20a *g)
{
struct gk20a_fecs_trace *trace = g->fecs_trace;
gk20a_gmmu_free(g, &trace->trace_buf);
}
#ifdef CONFIG_DEBUG_FS
/*
* The sequence iterator functions. We simply use the count of the
* next line as our internal position.
*/
static void *gk20a_fecs_trace_debugfs_ring_seq_start(
struct seq_file *s, loff_t *pos)
{
if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
return NULL;
return pos;
}
static void *gk20a_fecs_trace_debugfs_ring_seq_next(
struct seq_file *s, void *v, loff_t *pos)
{
++(*pos);
if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
return NULL;
return pos;
}
static void gk20a_fecs_trace_debugfs_ring_seq_stop(
struct seq_file *s, void *v)
{
}
static int gk20a_fecs_trace_debugfs_ring_seq_show(
struct seq_file *s, void *v)
{
loff_t *pos = (loff_t *) v;
struct gk20a *g = *(struct gk20a **)s->private;
struct gk20a_fecs_trace *trace = g->fecs_trace;
struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(trace, *pos);
int i;
const u32 invalid_tag =
ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
u32 tag;
u64 timestamp;
seq_printf(s, "record #%lld (%p)\n", *pos, r);
seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
if (gk20a_fecs_trace_is_valid_record(r)) {
seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
if (tag == invalid_tag)
continue;
timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag, timestamp);
}
}
return 0;
}
/*
* Tie them all together into a set of seq_operations.
*/
const struct seq_operations gk20a_fecs_trace_debugfs_ring_seq_ops = {
.start = gk20a_fecs_trace_debugfs_ring_seq_start,
.next = gk20a_fecs_trace_debugfs_ring_seq_next,
.stop = gk20a_fecs_trace_debugfs_ring_seq_stop,
.show = gk20a_fecs_trace_debugfs_ring_seq_show
};
/*
* Set up the file operations for our debugfs ring file. In this case,
* all we need is an open function which sets up the sequence ops.
*/
static int gk20a_ctxsw_debugfs_ring_open(struct inode *inode,
struct file *file)
{
struct gk20a **p;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
p = __seq_open_private(file, &gk20a_fecs_trace_debugfs_ring_seq_ops,
sizeof(struct gk20a *));
if (!p)
return -ENOMEM;
*p = (struct gk20a *)inode->i_private;
return 0;
};
/*
* The file operations structure contains our open function along with
* the set of canned seq_ ops.
*/
const struct file_operations gk20a_fecs_trace_debugfs_ring_fops = {
.owner = THIS_MODULE,
.open = gk20a_ctxsw_debugfs_ring_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private
};
static int gk20a_fecs_trace_debugfs_read(void *arg, u64 *val)
{
*val = gk20a_fecs_trace_get_read_index((struct gk20a *)arg);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_read_fops,
gk20a_fecs_trace_debugfs_read, NULL, "%llu\n");
static int gk20a_fecs_trace_debugfs_write(void *arg, u64 *val)
{
*val = gk20a_fecs_trace_get_write_index((struct gk20a *)arg);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_write_fops,
gk20a_fecs_trace_debugfs_write, NULL, "%llu\n");
static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
{
struct gk20a_platform *plat = platform_get_drvdata(g->dev);
debugfs_create_file("ctxsw_trace_read", 0600, plat->debugfs, g,
&gk20a_fecs_trace_debugfs_read_fops);
debugfs_create_file("ctxsw_trace_write", 0600, plat->debugfs, g,
&gk20a_fecs_trace_debugfs_write_fops);
debugfs_create_file("ctxsw_trace_ring", 0600, plat->debugfs, g,
&gk20a_fecs_trace_debugfs_ring_fops);
}
static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
{
struct gk20a_platform *plat = platform_get_drvdata(g->dev);
debugfs_remove_recursive(plat->debugfs);
}
#else
static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
{
}
static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
{
}
#endif /* CONFIG_DEBUG_FS */
static int gk20a_fecs_trace_init(struct gk20a *g)
{
struct gk20a_fecs_trace *trace;
int err;
trace = kzalloc(sizeof(struct gk20a_fecs_trace), GFP_KERNEL);
if (!trace) {
gk20a_warn(dev_from_gk20a(g), "failed to allocate fecs_trace");
return -ENOMEM;
}
g->fecs_trace = trace;
BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
err = gk20a_fecs_trace_alloc_ring(g);
if (err) {
gk20a_warn(dev_from_gk20a(g), "failed to allocate FECS ring");
goto clean;
}
mutex_init(&trace->poll_lock);
mutex_init(&trace->hash_lock);
hash_init(trace->pid_hash_table);
gk20a_fecs_trace_debugfs_init(g);
return 0;
clean:
kfree(trace);
g->fecs_trace = NULL;
return err;
}
static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
struct channel_gk20a *ch)
{
/*
* map our circ_buf to the context space and store the GPU VA
* in the context header.
*/
u32 lo;
u32 hi;
phys_addr_t pa;
struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
struct gk20a_fecs_trace *trace = g->fecs_trace;
void *ctx_ptr;
u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
"hw_chid=%d context_ptr=%x inst_block=%llx",
ch->hw_chid, context_ptr, gk20a_mem_phys(&ch->inst_block));
if (!trace)
return -ENOMEM;
pa = gk20a_mem_phys(&trace->trace_buf);
if (!pa)
return -ENOMEM;
ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
pgprot_writecombine(PAGE_KERNEL));
if (!ctx_ptr)
return -ENOMEM;
lo = u64_lo32(pa);
hi = u64_hi32(pa);
gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
lo, GK20A_FECS_TRACE_NUM_RECORDS);
gk20a_mem_wr32(ctx_ptr
+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
0, lo);
gk20a_mem_wr32(ctx_ptr
+ ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
gk20a_mem_wr32(ctx_ptr
+ ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
GK20A_FECS_TRACE_NUM_RECORDS));
vunmap(ctx_ptr);
gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);
return 0;
}
static int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
{
u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
"ch=%p context_ptr=%x", ch, context_ptr);
if (g->ops.fecs_trace.flush)
g->ops.fecs_trace.flush(g);
gk20a_fecs_trace_poll(g);
gk20a_fecs_trace_hash_del(g, context_ptr);
return 0;
}
static int gk20a_fecs_trace_reset(struct gk20a *g)
{
gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
if (g->ops.fecs_trace.flush)
g->ops.fecs_trace.flush(g);
gk20a_fecs_trace_poll(g);
return gk20a_fecs_trace_set_read_index(g, 0);
}
static int gk20a_fecs_trace_deinit(struct gk20a *g)
{
struct gk20a_fecs_trace *trace = g->fecs_trace;
gk20a_fecs_trace_debugfs_cleanup(g);
kthread_stop(trace->poll_task);
gk20a_fecs_trace_free_ring(g);
gk20a_fecs_trace_free_hash_table(g);
kfree(g->fecs_trace);
g->fecs_trace = NULL;
return 0;
}
static int gk20a_gr_max_entries(struct gk20a *g,
struct nvgpu_ctxsw_trace_filter *filter)
{
int n;
int tag;
/* Compute number of entries per record, with given filter */
for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
n += (NVGPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
/* Return max number of entries generated for the whole ring */
return n * GK20A_FECS_TRACE_NUM_RECORDS;
}
static int gk20a_fecs_trace_enable(struct gk20a *g)
{
struct gk20a_fecs_trace *trace = g->fecs_trace;
struct task_struct *task;
if (!trace->poll_task) {
task = kthread_run(gk20a_fecs_trace_periodic_polling, g, __func__);
if (unlikely(IS_ERR(task))) {
gk20a_warn(dev_from_gk20a(g), "failed to create FECS polling task");
return PTR_ERR(task);
}
trace->poll_task = task;
}
return 0;
}
static int gk20a_fecs_trace_disable(struct gk20a *g)
{
struct gk20a_fecs_trace *trace = g->fecs_trace;
if (trace->poll_task) {
kthread_stop(trace->poll_task);
trace->poll_task = NULL;
}
return -EPERM;
}
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
{
ops->fecs_trace.init = gk20a_fecs_trace_init;
ops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
ops->fecs_trace.enable = gk20a_fecs_trace_enable;
ops->fecs_trace.disable = gk20a_fecs_trace_disable;
ops->fecs_trace.reset = gk20a_fecs_trace_reset;
ops->fecs_trace.flush = NULL;
ops->fecs_trace.poll = gk20a_fecs_trace_poll;
ops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
ops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
ops->fecs_trace.max_entries = gk20a_gr_max_entries;
}
#else
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
{
}
#endif /* CONFIG_GK20A_CTXSW_TRACE */

View File

@@ -0,0 +1,20 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef __FECS_TRACE_GK20A_H
#define __FECS_TRACE_GK20A_H
struct gpu_ops;
void gk20a_init_fecs_trace_ops(struct gpu_ops *ops);
#endif /* __FECS_TRACE_GK20A_H */

View File

@@ -25,6 +25,7 @@
#include "gk20a.h"
#include "debug_gk20a.h"
#include "ctxsw_trace_gk20a.h"
#include "semaphore_gk20a.h"
#include "hw_fifo_gk20a.h"
#include "hw_pbdma_gk20a.h"
@@ -778,8 +779,12 @@ void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
gk20a_pmu_disable_elpg(g);
/*HALT_PIPELINE method, halt GR engine*/
if (gr_gk20a_halt_pipe(g))
gk20a_err(dev_from_gk20a(g),
"failed to HALT gr pipe");
gk20a_err(dev_from_gk20a(g), "failed to HALT gr pipe");
/* resetting engine will alter read/write index.
* need to flush circular buffer before re-enabling FECS.
*/
if (g->ops.fecs_trace.reset)
g->ops.fecs_trace.reset(g);
/* resetting engine using mc_enable_r() is not
enough, we do full init sequence */
gk20a_gr_reset(g);

View File

@@ -60,6 +60,7 @@
#include "hw_gr_gk20a.h"
#include "hw_fb_gk20a.h"
#include "gk20a_scale.h"
#include "ctxsw_trace_gk20a.h"
#include "dbg_gpu_gk20a.h"
#include "gk20a_allocator.h"
#include "hal.h"
@@ -80,7 +81,7 @@
/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */
#define INTERFACE_NAME "nvhost%s-gpu"
#define GK20A_NUM_CDEVS 6
#define GK20A_NUM_CDEVS 7
#define EMC3D_DEFAULT_RATIO 750
@@ -169,6 +170,19 @@ static const struct file_operations gk20a_tsg_ops = {
.unlocked_ioctl = gk20a_tsg_dev_ioctl,
};
static const struct file_operations gk20a_ctxsw_ops = {
.owner = THIS_MODULE,
.release = gk20a_ctxsw_dev_release,
.open = gk20a_ctxsw_dev_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = gk20a_ctxsw_dev_ioctl,
#endif
.unlocked_ioctl = gk20a_ctxsw_dev_ioctl,
.poll = gk20a_ctxsw_dev_poll,
.read = gk20a_ctxsw_dev_read,
.mmap = gk20a_ctxsw_dev_mmap,
};
static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
{
writel(v, g->sim.regs+r);
@@ -881,6 +895,10 @@ static int gk20a_pm_finalize_poweron(struct device *dev)
goto done;
}
err = gk20a_ctxsw_trace_init(g);
if (err)
gk20a_warn(dev, "could not initialize ctxsw tracing");
/* Restore the debug setting */
g->ops.mm.set_debug_mode(g, g->mmu_debug_ctrl);
@@ -1009,6 +1027,11 @@ void gk20a_user_deinit(struct platform_device *dev)
cdev_del(&g->tsg.cdev);
}
if (g->ctxsw.node) {
device_destroy(g->class, g->ctxsw.cdev.dev);
cdev_del(&g->ctxsw.cdev);
}
if (g->cdev_region)
unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS);
@@ -1074,6 +1097,15 @@ int gk20a_user_init(struct platform_device *dev)
if (err)
goto fail;
#ifdef CONFIG_GK20A_CTXSW_TRACE
err = gk20a_create_device(dev, devno++, "-ctxsw",
&g->ctxsw.cdev, &g->ctxsw.node,
&gk20a_ctxsw_ops);
if (err)
goto fail;
#endif
return 0;
fail:
gk20a_user_deinit(dev);
@@ -1554,6 +1586,8 @@ static int __exit gk20a_remove(struct platform_device *dev)
if (platform->has_cde)
gk20a_cde_destroy(g);
gk20a_ctxsw_trace_cleanup(g);
if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
gk20a_scale_exit(dev);
@@ -2091,6 +2125,19 @@ gk20a_request_firmware(struct gk20a *g, const char *fw_name)
return fw;
}
u64 gk20a_read_ptimer(struct gk20a *g)
{
u32 time_hi0 = gk20a_readl(g, timer_time_1_r());
u32 time_lo = gk20a_readl(g, timer_time_0_r());
u32 time_hi1 = gk20a_readl(g, timer_time_1_r());
u32 time_hi = (time_lo & (1L << 31)) ? time_hi0 : time_hi1;
u64 time = ((u64)time_hi << 32) | time_lo;
return time;
}
MODULE_LICENSE("GPL v2");
module_init(gk20a_init);
module_exit(gk20a_exit);

View File

@@ -25,6 +25,8 @@ struct channel_gk20a;
struct gr_gk20a;
struct sim_gk20a;
struct gk20a_ctxsw_ucode_segments;
struct gk20a_fecs_trace;
struct gk20a_ctxsw_trace;
struct acr_gm20b;
#include <linux/sched.h>
@@ -372,6 +374,19 @@ struct gpu_ops {
bool (*is_fw_defined)(void);
bool use_dma_for_fw_bootstrap;
} gr_ctx;
struct {
int (*init)(struct gk20a *g);
int (*max_entries)(struct gk20a *,
struct nvgpu_ctxsw_trace_filter *);
int (*flush)(struct gk20a *g);
int (*poll)(struct gk20a *g);
int (*enable)(struct gk20a *g);
int (*disable)(struct gk20a *g);
int (*reset)(struct gk20a *g);
int (*bind_channel)(struct gk20a *, struct channel_gk20a *);
int (*unbind_channel)(struct gk20a *, struct channel_gk20a *);
int (*deinit)(struct gk20a *g);
} fecs_trace;
struct {
bool (*support_sparse)(struct gk20a *g);
bool (*is_debug_mode_enabled)(struct gk20a *g);
@@ -613,6 +628,11 @@ struct gk20a {
struct device *node;
} tsg;
struct {
struct cdev cdev;
struct device *node;
} ctxsw;
struct mutex client_lock;
int client_refcount; /* open channels and ctrl nodes */
@@ -639,6 +659,9 @@ struct gk20a {
struct gk20a_scale_profile *scale_profile;
struct gk20a_ctxsw_trace *ctxsw_trace;
struct gk20a_fecs_trace *fecs_trace;
struct device_dma_parameters dma_parms;
struct gk20a_cde_app cde_app;
@@ -716,6 +739,7 @@ enum gk20a_dbg_categories {
gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
gpu_dbg_cde = BIT(10), /* cde info messages */
gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
gpu_dbg_ctxsw = BIT(12), /* ctxsw tracing */
gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
};
@@ -962,4 +986,6 @@ static inline u32 scale_ptimer(u32 timeout , u32 scale10x)
else
return (timeout * 10) / scale10x;
}
u64 gk20a_read_ptimer(struct gk20a *g);
#endif /* GK20A_H */

View File

@@ -56,6 +56,7 @@
#include "debug_gk20a.h"
#include "semaphore_gk20a.h"
#include "platform_gk20a.h"
#include "ctxsw_trace_gk20a.h"
#define BLK_SIZE (256)
@@ -2855,6 +2856,13 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
"fail to load golden ctx image");
goto out;
}
if (g->ops.fecs_trace.bind_channel) {
err = g->ops.fecs_trace.bind_channel(g, c);
if (err) {
gk20a_warn(dev_from_gk20a(g),
"fail to bind channel for ctxsw trace");
}
}
c->first_init = true;
}

View File

@@ -22,6 +22,7 @@
#include "gk20a_gating_reglist.h"
#include "channel_gk20a.h"
#include "gr_ctx_gk20a.h"
#include "fecs_trace_gk20a.h"
#include "mm_gk20a.h"
#include "mc_gk20a.h"
#include "pmu_gk20a.h"
@@ -57,6 +58,7 @@ int gk20a_init_hal(struct gk20a *g)
gk20a_init_mc(gops);
gk20a_init_ltc(gops);
gk20a_init_gr_ops(gops);
gk20a_init_fecs_trace_ops(gops);
gk20a_init_fb(gops);
gk20a_init_fifo(gops);
gk20a_init_ce2(gops);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2012-2015, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -246,4 +246,192 @@ static inline u32 ctxsw_prog_main_image_context_id_o(void)
{
return 0x000000f0;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_o(void)
{
return 0x000000ac;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(u32 v)
{
return (v & 0xffff) << 0;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(void)
{
return 0x000000b0;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_v_m(void)
{
return 0xfffffff << 0;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_m(void)
{
return 0x3 << 28;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f(void)
{
return 0x0;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(void)
{
return 0x20000000;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(void)
{
return 0x30000000;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(void)
{
return 0x000000b4;
}
static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(u32 v)
{
return (v & 0xffffffff) << 0;
}
static inline u32 ctxsw_prog_record_timestamp_record_size_in_bytes_v(void)
{
return 0x00000080;
}
static inline u32 ctxsw_prog_record_timestamp_record_size_in_words_v(void)
{
return 0x00000020;
}
static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_o(void)
{
return 0x00000000;
}
static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_v_value_v(void)
{
return 0x00000000;
}
static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_o(void)
{
return 0x00000004;
}
static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_v_value_v(void)
{
return 0x600dbeef;
}
static inline u32 ctxsw_prog_record_timestamp_context_id_o(void)
{
return 0x00000008;
}
static inline u32 ctxsw_prog_record_timestamp_context_ptr_o(void)
{
return 0x0000000c;
}
static inline u32 ctxsw_prog_record_timestamp_new_context_id_o(void)
{
return 0x00000010;
}
static inline u32 ctxsw_prog_record_timestamp_new_context_ptr_o(void)
{
return 0x00000014;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_lo_o(void)
{
return 0x00000018;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_o(void)
{
return 0x0000001c;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_f(u32 v)
{
return (v & 0xffffff) << 0;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_v(u32 r)
{
return (r >> 0) & 0xffffff;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_f(u32 v)
{
return (v & 0xff) << 24;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_m(void)
{
return 0xff << 24;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_v(u32 r)
{
return (r >> 24) & 0xff;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_v(void)
{
return 0x00000001;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_f(void)
{
return 0x1000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_v(void)
{
return 0x00000002;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_f(void)
{
return 0x2000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_v(void)
{
return 0x0000000a;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_f(void)
{
return 0xa000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_v(void)
{
return 0x0000000b;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_f(void)
{
return 0xb000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_v(void)
{
return 0x0000000c;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_f(void)
{
return 0xc000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_v(void)
{
return 0x0000000d;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_f(void)
{
return 0xd000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_v(void)
{
return 0x00000003;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_f(void)
{
return 0x3000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_v(void)
{
return 0x00000004;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_f(void)
{
return 0x4000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_v(void)
{
return 0x00000005;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_f(void)
{
return 0x5000000;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v(void)
{
return 0x000000ff;
}
static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_f(void)
{
return 0xff000000;
}
#endif

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2013-2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,

View File

@@ -0,0 +1,21 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/string.h>
#include "gk20a/gk20a.h"
#include "fecs_trace_vgpu.h"
void vgpu_init_fecs_trace_ops(struct gpu_ops *ops)
{
memset(&ops->fecs_trace, 0, sizeof(ops->fecs_trace));
}

View File

@@ -0,0 +1,20 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#ifndef __FECS_TRACE_VGPU_H
#define __FECS_TRACE_VGPU_H
struct gpu_ops;
void vgpu_init_fecs_trace_ops(struct gpu_ops *ops);
#endif /* __FECS_TRACE_VGPU_H */

View File

@@ -18,6 +18,7 @@
#include <linux/dma-mapping.h>
#include <linux/pm_runtime.h>
#include "vgpu/vgpu.h"
#include "vgpu/fecs_trace_vgpu.h"
#include "gk20a/debug_gk20a.h"
#include "gk20a/hal_gk20a.h"
#include "gk20a/hw_mc_gk20a.h"
@@ -259,6 +260,7 @@ void vgpu_init_hal_common(struct gk20a *g)
vgpu_init_ltc_ops(gops);
vgpu_init_mm_ops(gops);
vgpu_init_debug_ops(gops);
vgpu_init_fecs_trace_ops(gops);
}
static int vgpu_init_hal(struct gk20a *g)

View File

@@ -387,7 +387,7 @@ TRACE_EVENT(gk20a_as_ioctl_get_va_regions,
TRACE_EVENT(gk20a_mmu_fault,
TP_PROTO(u32 fault_hi, u32 fault_lo,
u32 fault_info,
u32 instance,
u64 instance,
u32 engine_id,
const char *engine,
const char *client,
@@ -398,7 +398,7 @@ TRACE_EVENT(gk20a_mmu_fault,
__field(u32, fault_hi)
__field(u32, fault_lo)
__field(u32, fault_info)
__field(u32, instance)
__field(u64, instance)
__field(u32, engine_id)
__field(const char *, engine)
__field(const char *, client)
@@ -414,7 +414,7 @@ TRACE_EVENT(gk20a_mmu_fault,
__entry->client = client;
__entry->fault_type = fault_type;
),
TP_printk("fault=0x%x,%08x info=0x%x instance=0x%x engine_id=%d engine=%s client=%s type=%s",
TP_printk("fault=0x%x,%08x info=0x%x instance=0x%llx engine_id=%d engine=%s client=%s type=%s",
__entry->fault_hi, __entry->fault_lo,
__entry->fault_info, __entry->instance, __entry->engine_id,
__entry->engine, __entry->client, __entry->fault_type)

View File

@@ -1215,4 +1215,94 @@ struct nvgpu_as_map_buffer_batch_args {
#define NVGPU_AS_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_as_map_buffer_ex_args)
/*
* /dev/nvhost-ctxsw-gpu device
*
* Opening a '/dev/nvhost-ctxsw-gpu' device node creates a way to trace
* context switches on GR engine
*/
#define NVGPU_CTXSW_IOCTL_MAGIC 'C'
#define NVGPU_CTXSW_TAG_SOF 0x00
#define NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST 0x01
#define NVGPU_CTXSW_TAG_FE_ACK 0x02
#define NVGPU_CTXSW_TAG_FE_ACK_WFI 0x0a
#define NVGPU_CTXSW_TAG_FE_ACK_GFXP 0x0b
#define NVGPU_CTXSW_TAG_FE_ACK_CTAP 0x0c
#define NVGPU_CTXSW_TAG_FE_ACK_CILP 0x0d
#define NVGPU_CTXSW_TAG_SAVE_END 0x03
#define NVGPU_CTXSW_TAG_RESTORE_START 0x04
#define NVGPU_CTXSW_TAG_CONTEXT_START 0x05
#define NVGPU_CTXSW_TAG_INVALID_TIMESTAMP 0xff
#define NVGPU_CTXSW_TAG_LAST \
NVGPU_CTXSW_TAG_INVALID_TIMESTAMP
struct nvgpu_ctxsw_trace_entry {
__u8 tag;
__u8 vmid;
__u16 seqno; /* sequence number to detect drops */
__u32 context_id; /* context_id as allocated by FECS */
__u64 pid; /* 64-bit, wide enough for any OS pid type */
__u64 timestamp; /* 64-bit time */
};
#define NVGPU_CTXSW_RING_HEADER_MAGIC 0x7000fade
#define NVGPU_CTXSW_RING_HEADER_VERSION 0
struct nvgpu_ctxsw_ring_header {
__u32 magic;
__u32 version;
__u32 num_ents;
__u32 ent_size;
volatile __u32 drop_count; /* excluding filtered out events */
volatile __u32 write_seqno;
volatile __u32 write_idx;
volatile __u32 read_idx;
};
struct nvgpu_ctxsw_ring_setup_args {
__u32 size; /* [in/out] size of ring buffer in bytes (including
header). Will be rounded up to the page size. This
parameter is updated with the actual allocated size. */
};
#define NVGPU_CTXSW_FILTER_SIZE (NVGPU_CTXSW_TAG_LAST + 1)
#define NVGPU_CTXSW_FILTER_SET(n, p) \
((p)->tag_bits[(n) / 64] |= (1ULL << ((n) & 63)))
#define NVGPU_CTXSW_FILTER_CLR(n, p) \
((p)->tag_bits[(n) / 64] &= ~(1ULL << ((n) & 63)))
#define NVGPU_CTXSW_FILTER_ISSET(n, p) \
((p)->tag_bits[(n) / 64] & (1ULL << ((n) & 63)))
#define NVGPU_CTXSW_FILTER_CLR_ALL(p) memset((void *)(p), 0, sizeof(*(p)))
#define NVGPU_CTXSW_FILTER_SET_ALL(p) memset((void *)(p), ~0, sizeof(*(p)))
struct nvgpu_ctxsw_trace_filter {
__u64 tag_bits[(NVGPU_CTXSW_FILTER_SIZE + 63) / 64];
};
struct nvgpu_ctxsw_trace_filter_args {
struct nvgpu_ctxsw_trace_filter filter;
};
#define NVGPU_CTXSW_IOCTL_TRACE_ENABLE \
_IO(NVGPU_CTXSW_IOCTL_MAGIC, 1)
#define NVGPU_CTXSW_IOCTL_TRACE_DISABLE \
_IO(NVGPU_CTXSW_IOCTL_MAGIC, 2)
#define NVGPU_CTXSW_IOCTL_RING_SETUP \
_IOWR(NVGPU_CTXSW_IOCTL_MAGIC, 3, struct nvgpu_ctxsw_ring_setup_args)
#define NVGPU_CTXSW_IOCTL_SET_FILTER \
_IOW(NVGPU_CTXSW_IOCTL_MAGIC, 4, struct nvgpu_ctxsw_trace_filter_args)
#define NVGPU_CTXSW_IOCTL_GET_FILTER \
_IOR(NVGPU_CTXSW_IOCTL_MAGIC, 5, struct nvgpu_ctxsw_trace_filter_args)
#define NVGPU_CTXSW_IOCTL_POLL \
_IO(NVGPU_CTXSW_IOCTL_MAGIC, 6)
#define NVGPU_CTXSW_IOCTL_LAST \
_IOC_NR(NVGPU_CTXSW_IOCTL_POLL)
#define NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_ctxsw_trace_filter_args)
#endif