Files
linux-nvgpu/drivers/gpu/nvgpu/os/linux/debug.c
Alex Waterman 6b1302f23c gpu: nvgpu: Reduce linux debug log spew
Currently when nvgpu prints debug information for something like an MMU
fault, the result includes a lot of useless boilerplate logging spew. In
some cases this can be helpful in identifying where the log message came
from in the nvgpu code base. However, for debug spews from faults, the
viewer of that info does not care which function printed the log (for
example).

Instead, having a fast and readable debug dump is more valuable. So to
that end, add a special debug dump printing function that does not use
the normal log format. Instead, it prints only a brief prefix to use as
a grep search query. The new print out is listed below.

Since often the kernel logs are impressively long and obtuse, having a
clear debug search string can be helpful. With this log format, one can
simply do:

  $ grep __$CHIP__ kernel.log

And find any debug logs for the desired chip.

New log format - collected on a gv11b under L4T running
`nvgpu_submit_mmu_fault':

[   32.005793] nvgpu: 17000000.gv11b      gv11b_fb_mmu_fault_info_dump:311  [ERR]  [MMU FAULT] mmu engine id:  32, ch id:  511, fault addr: 0x1000, fault addr aperture: 0, fault type: invalid pde, access type: virt read,
[   32.006137] nvgpu: 17000000.gv11b      gv11b_fb_mmu_fault_info_dump:320  [ERR]  [MMU FAULT] protected mode: 0, client type: hub, client id:  host, gpc id if client type is gpc: 0,
[   32.006417] nvgpu: 17000000.gv11b                nvgpu_rc_mmu_fault:296  [ERR]  mmu fault id=0 id_type=1 act_eng_bitmask=00000000
[   32.007125] __gv11b__ Channel Status - chip gv11b
[   32.007128] __gv11b__ ---------------------------
[   32.007241] __gv11b__ 511-gv11b, TSG: 0, pid 955, refs: 2, deterministic:
[   32.007364] __gv11b__ channel status:  in use pending busy
[   32.007509] __gv11b__ RAMFC : TOP: 8000000000001000 PUT: 0000000000001030 GET: 0000000000001000 FETCH: 0000600000001000HEADER: 60400000 COUNT: 00000000SEMAPHORE: addr 0000000000000000payload 0000000000000000 execute 00000000
[   32.007601] __gv11b__
[   32.008696] __gv11b__
[   32.008700] __gv11b__ PBDMA Status - chip gv11b
[   32.008894] __gv11b__ -------------------------
[   32.013477] __gv11b__ pbdma 0:
[   32.017840] __gv11b__   id: -1 - [channel] next_id: - -1 [channel] | status: invalid
[   32.020992] __gv11b__   PBDMA_PUT 0000000000001030 PBDMA_GET 0000000000001000
[   32.029037] __gv11b__   GP_PUT    00000001  GP_GET  00000001  FETCH   00000001 HEADER 60400000
[   32.036386] __gv11b__   HDR       00000000  SHADOW0 00001000  SHADOW1 80003000
[   32.044787] __gv11b__ pbdma 1:
[   32.051964] __gv11b__   id: -1 - [channel] next_id: - -1 [channel] | status: invalid
[   32.055099] __gv11b__   PBDMA_PUT 0000000042003200 PBDMA_GET 00000050728bc914
[   32.062997] __gv11b__   GP_PUT    00000000  GP_GET  2080a000  FETCH   00000000 HEADER e1850010
[   32.070424] __gv11b__   HDR       00110000  SHADOW0 02000000  SHADOW1 10000004
[   32.078652] __gv11b__ pbdma 2:
[   32.085913] __gv11b__   id: -1 - [channel] next_id: - -1 [channel] | status: invalid
[   32.088973] __gv11b__   PBDMA_PUT 00000021040c0004 PBDMA_GET 0000000140020000
[   32.096502] __gv11b__   GP_PUT    00000000  GP_GET  8080a440  FETCH   00000000 HEADER 61400040
[   32.103679] __gv11b__   HDR       14000010  SHADOW0 00000000  SHADOW1 00000400
[   32.112336] __gv11b__
[   32.119860] __gv11b__ gv11b eng 0:
[   32.122119] __gv11b__ id: -1 (channel), next_id: -1 (channel), ctx status: invalid
[   32.125807] __gv11b__
[   32.135954] __gv11b__ gv11b eng 1:
[   32.135958] __gv11b__ id: -1 (channel), next_id: -1 (channel), ctx status: invalid
[   32.139457] __gv11b__
[   32.149945] __gv11b__ gv11b eng 2:
[   32.149950] __gv11b__ id: -1 (channel), next_id: -1 (channel), ctx status: invalid
[   32.153543] __gv11b__
[   32.163598] __gv11b__ gv11b eng 3:
[   32.163601] __gv11b__ id: -1 (channel), next_id: -1 (channel), ctx status: invalid
[   32.167278] __gv11b__
[   32.177076] __gv11b__
[   32.186145] nvgpu: 17000000.gv11b       nvgpu_tsg_set_ctx_mmu_error:492  [ERR]  TSG 0 generated a mmu fault
[   32.189443] nvgpu: 17000000.gv11b     nvgpu_set_err_notifier_locked:140  [ERR]  error notifier set to 31 for ch 511

JIRA NVGPU-5541

Change-Id: Iad60adfab5198ee11dd2ec595f2422ea541b7a2a
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2349166
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
2020-12-15 14:13:28 -06:00

444 lines
11 KiB
C

/*
* Copyright (C) 2017-2020 NVIDIA Corporation. All rights reserved.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include "debug_cde.h"
#include "debug_ce.h"
#include "debug_fifo.h"
#include "debug_gr.h"
#include "debug_allocator.h"
#include "debug_kmem.h"
#include "debug_pmu.h"
#include "debug_sched.h"
#include "debug_hal.h"
#include "debug_xve.h"
#include "debug_ltc.h"
#include "debug_bios.h"
#include "os_linux.h"
#include "platform_gk20a.h"
#include <nvgpu/gk20a.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/nvgpu_init.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <nvgpu/debug.h>
unsigned int gk20a_debug_trace_cmdbuf;
/*
 * Debug-dump sink that routes one formatted line to the kernel log via
 * the nvgpu dump printer. @ctx is the struct gk20a pointer.
 */
static inline void gk20a_debug_write_printk(void *ctx, const char *str)
{
	nvgpu_dbg_dump_impl((struct gk20a *)ctx, str);
}
/*
 * Debug-dump sink that appends one line (plus a newline) to a seq_file.
 * @ctx is the struct seq_file pointer.
 */
static inline void gk20a_debug_write_to_seqfile(void *ctx, const char *str)
{
	struct seq_file *s = ctx;

	seq_printf(s, "%s\n", str);
}
/*
 * Format one debug line into the context's scratch buffer and hand it
 * to the context's sink callback (printk or seq_file).
 *
 * Output longer than sizeof(o->buf) is silently truncated by vsnprintf.
 */
void gk20a_debug_output(struct nvgpu_debug_context *o, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) vsnprintf(o->buf, sizeof(o->buf), fmt, ap);
	va_end(ap);

	o->fn(o->ctx, o->buf);
}
/*
 * Emit the full GPU debug dump through context @o: all channel state,
 * then PBDMA status, then engine status, in that fixed order. Used by
 * the debugfs "status" node and by fault/error recovery paths.
 */
void gk20a_debug_show_dump(struct gk20a *g, struct nvgpu_debug_context *o)
{
nvgpu_channel_debug_dump_all(g, o);
g->ops.pbdma.dump_status(g, o);
g->ops.engine_status.dump_engine_status(g, o);
}
static int gk20a_gr_dump_regs(struct gk20a *g,
struct nvgpu_debug_context *o)
{
if (g->ops.gr.dump_gr_regs)
nvgpu_pg_elpg_protected_call(g, g->ops.gr.dump_gr_regs(g, o));
return 0;
}
void gk20a_gr_debug_dump(struct gk20a *g)
{
struct nvgpu_debug_context o = {
.fn = gk20a_debug_write_printk,
.ctx = g,
};
gk20a_gr_dump_regs(g, &o);
}
/*
 * seq_file show callback for the debugfs "gr_status" node: power on the
 * GPU, dump the GR registers into the seq_file, then drop the power
 * reference.
 *
 * Returns 0 on success, or the error from gk20a_busy() if the GPU could
 * not be powered on (previously masked as -EINVAL, which hid the cause
 * and was inconsistent with other error paths).
 */
static int gk20a_gr_debug_show(struct seq_file *s, void *unused)
{
	struct device *dev = s->private;
	struct gk20a *g = gk20a_get_platform(dev)->g;
	struct nvgpu_debug_context o = {
		.fn = gk20a_debug_write_to_seqfile,
		.ctx = s,
	};
	int err;

	err = gk20a_busy(g);
	if (err) {
		nvgpu_err(g, "failed to power on gpu: %d", err);
		/* Propagate the real failure code to user space. */
		return err;
	}

	gk20a_gr_dump_regs(g, &o);

	gk20a_idle(g);

	return 0;
}
void gk20a_debug_dump(struct gk20a *g)
{
struct gk20a_platform *platform = gk20a_get_platform(dev_from_gk20a(g));
struct nvgpu_debug_context o = {
.fn = gk20a_debug_write_printk,
.ctx = g,
};
/* HAL only initialized after 1st power-on */
if (g->ops.debug.show_dump)
g->ops.debug.show_dump(g, &o);
if (platform->dump_platform_dependencies)
platform->dump_platform_dependencies(dev_from_gk20a(g));
}
/*
 * seq_file show callback for the debugfs "status" node: power on the
 * GPU, run the HAL's full debug dump into the seq_file, then drop the
 * power reference.
 *
 * Returns 0 on success, or the error from gk20a_busy() on power-on
 * failure. Previously this returned -EFAULT, which wrongly signals a
 * bad user-space address rather than a power failure.
 */
static int gk20a_debug_show(struct seq_file *s, void *unused)
{
	struct device *dev = s->private;
	struct nvgpu_debug_context o = {
		.fn = gk20a_debug_write_to_seqfile,
		.ctx = s,
	};
	struct gk20a *g;
	int err;

	g = gk20a_get_platform(dev)->g;

	err = gk20a_busy(g);
	if (err) {
		nvgpu_err(g, "failed to power on gpu: %d", err);
		return err;
	}

	/* HAL only initialized after 1st power-on */
	if (g->ops.debug.show_dump)
		g->ops.debug.show_dump(g, &o);

	gk20a_idle(g);

	return 0;
}
/* debugfs open: bind the "gr_status" node to its seq_file renderer. */
static int gk20a_gr_debug_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;

	return single_open(file, gk20a_gr_debug_show, priv);
}
/* debugfs open: bind the "status" node to its seq_file renderer. */
static int gk20a_debug_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;

	return single_open(file, gk20a_debug_show, priv);
}
/* Read-only seq_file operations for the debugfs "gr_status" node. */
static const struct file_operations gk20a_gr_debug_fops = {
.open = gk20a_gr_debug_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/* Read-only seq_file operations for the debugfs "status" node. */
static const struct file_operations gk20a_debug_fops = {
.open = gk20a_debug_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*
 * debugfs read for "disable_bigpage": reports "Y\n" or "N\n" depending
 * on whether big pages are currently disabled.
 */
static ssize_t disable_bigpage_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
{
	struct gk20a *g = file->private_data;
	char buf[3];

	buf[0] = g->mm.disable_bigpage ? 'Y' : 'N';
	buf[1] = '\n';
	buf[2] = '\0';

	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
/*
 * debugfs write for "disable_bigpage": parse a boolean string from user
 * space, update the big-page disable flag, and re-derive the GPU
 * characteristics to reflect the new setting.
 *
 * Returns @count on success (unparseable input is silently ignored),
 * -EFAULT if the copy from user space fails, or the error from
 * nvgpu_init_gpu_characteristics().
 */
static ssize_t disable_bigpage_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	size_t buf_size;
	bool bv;
	struct gk20a *g = file->private_data;
	int err;

	buf_size = min(count, (sizeof(buf) - 1));
	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;
	/*
	 * copy_from_user() does not NUL-terminate; without this,
	 * strtobool() could read uninitialized stack bytes.
	 */
	buf[buf_size] = '\0';

	if (strtobool(buf, &bv) == 0) {
		g->mm.disable_bigpage = bv;
		err = nvgpu_init_gpu_characteristics(g);
		if (err != 0) {
			nvgpu_err(g, "failed to init GPU characteristics");
			/* Propagate the real error; -ENOSYS means "not implemented". */
			return err;
		}
	}

	return count;
}
static struct file_operations disable_bigpage_fops = {
.open = simple_open,
.read = disable_bigpage_read,
.write = disable_bigpage_write,
};
/*
 * seq_file renderer for the "railgate_residency" debugfs node.
 *
 * The pstats totals are only folded in on a state transition, so the
 * time spent in the *current* state is added on the fly here, based on
 * which state the rails are in right now.
 */
static int railgate_residency_show(struct seq_file *s, void *data)
{
	struct gk20a *g = s->private;
	struct gk20a_platform *platform = dev_get_drvdata(dev_from_gk20a(g));
	unsigned long in_state_ms;
	unsigned long gated_ms;
	unsigned long ungated_ms;

	if (platform->is_railgated(dev_from_gk20a(g))) {
		/* Currently gated: extend the gated total by the ongoing stay. */
		in_state_ms = jiffies_to_msecs(jiffies -
				g->pstats.last_rail_gate_complete);
		ungated_ms = g->pstats.total_rail_ungate_time_ms;
		gated_ms = g->pstats.total_rail_gate_time_ms + in_state_ms;
	} else {
		/* Currently ungated: extend the ungated total instead. */
		in_state_ms = jiffies_to_msecs(jiffies -
				g->pstats.last_rail_ungate_complete);
		gated_ms = g->pstats.total_rail_gate_time_ms;
		ungated_ms = g->pstats.total_rail_ungate_time_ms + in_state_ms;
	}

	seq_printf(s, "Time with Rails Gated: %lu ms\n"
		"Time with Rails UnGated: %lu ms\n"
		"Total railgating cycles: %lu\n",
		gated_ms,
		ungated_ms,
		g->pstats.railgating_cycle_count - 1);

	return 0;
}
/* debugfs open: bind "railgate_residency" to its seq_file renderer. */
static int railgate_residency_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;

	return single_open(file, railgate_residency_show, priv);
}
/* Read-only seq_file operations for the "railgate_residency" node. */
static const struct file_operations railgate_residency_fops = {
.open = railgate_residency_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int gk20a_railgating_debugfs_init(struct gk20a *g)
{
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
struct dentry *d;
d = debugfs_create_file(
"railgate_residency", S_IRUGO|S_IWUSR, l->debugfs, g,
&railgate_residency_fops);
if (!d)
return -ENOMEM;
return 0;
}
/*
 * debugfs read for "timeouts_enabled": reports "Y\n" or "N\n" depending
 * on whether nvgpu timeouts are currently enabled.
 */
static ssize_t timeouts_enabled_read(struct file *file,
		char __user *user_buf, size_t count, loff_t *ppos)
{
	struct gk20a *g = file->private_data;
	char buf[3];

	buf[0] = nvgpu_is_timeouts_enabled(g) ? 'Y' : 'N';
	buf[1] = '\n';
	buf[2] = '\0';

	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
/*
 * debugfs write for "timeouts_enabled": parse a boolean from user space
 * and enable/disable nvgpu timeouts accordingly. A user-level disable
 * takes a single reference on the timeouts-disabled refcount; a
 * subsequent enable drops it. Repeated writes of the same value are
 * no-ops, so the refcount never skews.
 *
 * Returns @count on success (unparseable input is silently ignored) or
 * -EFAULT if the copy from user space fails.
 */
static ssize_t timeouts_enabled_write(struct file *file,
		const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	size_t buf_size;
	bool timeouts_enabled;
	struct gk20a *g = file->private_data;

	buf_size = min(count, (sizeof(buf) - 1));
	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;
	/*
	 * copy_from_user() does not NUL-terminate; terminate explicitly
	 * so strtobool() never reads uninitialized stack bytes. The
	 * buffer is also sized (previously 3 bytes) so that full words
	 * such as "true"/"false" fit and parse.
	 */
	buf[buf_size] = '\0';

	if (strtobool(buf, &timeouts_enabled) == 0) {
		nvgpu_mutex_acquire(&g->dbg_sessions_lock);
		if (timeouts_enabled == false) {
			/* requesting to disable timeouts */
			if (g->timeouts_disabled_by_user == false) {
				nvgpu_atomic_inc(&g->timeouts_disabled_refcount);
				g->timeouts_disabled_by_user = true;
			}
		} else {
			/* requesting to enable timeouts */
			if (g->timeouts_disabled_by_user == true) {
				nvgpu_atomic_dec(&g->timeouts_disabled_refcount);
				g->timeouts_disabled_by_user = false;
			}
		}
		nvgpu_mutex_release(&g->dbg_sessions_lock);
	}

	return count;
}
/* Read/write file operations for the debugfs "timeouts_enabled" node. */
static const struct file_operations timeouts_enabled_fops = {
.open = simple_open,
.read = timeouts_enabled_read,
.write = timeouts_enabled_write,
};
/*
 * Create this GPU's debugfs directory (named after the device) and
 * populate it with the core nvgpu debug nodes, then let each subsystem
 * (gr, pmu, railgating, cde, ce, allocator, hal, fifo, sched, kmem,
 * ltc, and dGPU-only xve/bios) register its own entries.
 *
 * @debugfs_symlink: optional name for an alias symlink created at the
 * debugfs root, pointing at the per-device directory; may be NULL.
 *
 * Failures are non-fatal: if the top-level directory cannot be created
 * we simply return and the driver runs without debugfs support.
 */
void gk20a_debug_init(struct gk20a *g, const char *debugfs_symlink)
{
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
struct device *dev = dev_from_gk20a(g);
l->debugfs = debugfs_create_dir(dev_name(dev), NULL);
if (!l->debugfs)
return;
if (debugfs_symlink)
l->debugfs_alias =
debugfs_create_symlink(debugfs_symlink,
NULL, dev_name(dev));
/* Full GPU status dump and GR-register dump, both read-only. */
debugfs_create_file("status", S_IRUGO, l->debugfs,
dev, &gk20a_debug_fops);
debugfs_create_file("gr_status", S_IRUGO, l->debugfs,
dev, &gk20a_gr_debug_fops);
debugfs_create_u32("trace_cmdbuf", S_IRUGO|S_IWUSR,
l->debugfs, &gk20a_debug_trace_cmdbuf);
debugfs_create_u32("ch_wdt_init_limit_ms", S_IRUGO|S_IWUSR,
l->debugfs, &g->ch_wdt_init_limit_ms);
debugfs_create_bool("disable_syncpoints", S_IRUGO,
l->debugfs, &l->disable_syncpoints);
/* New debug logging API. */
debugfs_create_u64("log_mask", S_IRUGO|S_IWUSR,
l->debugfs, &g->log_mask);
debugfs_create_u32("log_trace", S_IRUGO|S_IWUSR,
l->debugfs, &g->log_trace);
l->debugfs_ltc_enabled =
debugfs_create_bool("ltc_enabled", S_IRUGO|S_IWUSR,
l->debugfs,
&g->mm.ltc_enabled_target);
debugfs_create_u32("poll_timeout_default_ms", S_IRUGO|S_IWUSR,
l->debugfs, &g->poll_timeout_default);
/* These two nodes have custom read/write handlers defined above. */
l->debugfs_timeouts_enabled =
debugfs_create_file("timeouts_enabled",
S_IRUGO|S_IWUSR,
l->debugfs,
g,
&timeouts_enabled_fops);
l->debugfs_disable_bigpage =
debugfs_create_file("disable_bigpage",
S_IRUGO|S_IWUSR,
l->debugfs,
g,
&disable_bigpage_fops);
/* TSG scheduling timeslice knobs, one per priority level. */
debugfs_create_u32("tsg_timeslice_low_priority_us", S_IRUGO|S_IWUSR,
l->debugfs, &g->tsg_timeslice_low_priority_us);
debugfs_create_u32("tsg_timeslice_medium_priority_us", S_IRUGO|S_IWUSR,
l->debugfs,
&g->tsg_timeslice_medium_priority_us);
debugfs_create_u32("tsg_timeslice_high_priority_us", S_IRUGO|S_IWUSR,
l->debugfs, &g->tsg_timeslice_high_priority_us);
l->debugfs_runlist_interleave =
debugfs_create_bool("runlist_interleave",
S_IRUGO|S_IWUSR,
l->debugfs,
&g->runlist_interleave);
/* Per-subsystem debugfs trees, rooted under l->debugfs. */
gr_gk20a_debugfs_init(g);
gk20a_pmu_debugfs_init(g);
gk20a_railgating_debugfs_init(g);
#ifdef CONFIG_NVGPU_SUPPORT_CDE
gk20a_cde_debugfs_init(g);
#endif
nvgpu_ce_debugfs_init(g);
nvgpu_alloc_debugfs_init(g);
nvgpu_hal_debugfs_init(g);
gk20a_fifo_debugfs_init(g);
gk20a_sched_debugfs_init(g);
#ifdef CONFIG_NVGPU_TRACK_MEM_USAGE
nvgpu_kmem_debugfs_init(g);
#endif
nvgpu_ltc_debugfs_init(g);
#ifdef CONFIG_NVGPU_DGPU
/* xve/bios nodes only exist on PCI (discrete) GPUs. */
if (g->pci_vendor_id) {
nvgpu_xve_debugfs_init(g);
nvgpu_bios_debugfs_init(g);
}
#endif
}
void gk20a_debug_deinit(struct gk20a *g)
{
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
if (!l->debugfs)
return;
gk20a_fifo_debugfs_deinit(g);
debugfs_remove_recursive(l->debugfs);
debugfs_remove(l->debugfs_alias);
}