gpu: nvgpu: Add a generic profiler

Add a generic profiler based on the channel kickoff profiler. This
provides a mechanism that lets engineers (more) easily profile
arbitrary software paths within nvgpu.
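
As a quick sketch of the intended flow: nvgpu_swprofile_open(),
nvgpu_swprofile_print_ranges() and nvgpu_swprofile_close() below are
the calls this change actually makes from the FIFO debugfs code; the
profiler type name, the snapshot call and the sample indices are
assumptions for illustration only.

/*
 * Illustrative sketch, not part of this change. Only open(), close()
 * and print_ranges() are taken from this patch; the snapshot call and
 * the SAMPLE_* indices are hypothetical.
 */
#include <nvgpu/swprofile.h>

static void profile_some_path(struct gk20a *g,
			      struct nvgpu_swprofiler *p,
			      struct nvgpu_debug_context *o)
{
	/* Allocate sample buffers and enable profiling ("enable" node). */
	if (nvgpu_swprofile_open(g, p) != 0)
		return;

	/* Hypothetical: timestamp interesting points along the path. */
	/* nvgpu_swprofile_snapshot(p, SAMPLE_BEGIN); */
	/* ... arbitrary software path being profiled ... */
	/* nvgpu_swprofile_snapshot(p, SAMPLE_END); */

	/* Dump percentile ranges, as the "stats" node now does. */
	nvgpu_swprofile_print_ranges(g, p, o);

	/* Drop the reference taken by open. */
	nvgpu_swprofile_close(p);
}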

Usage of this profiler is still primarily through debugfs; this patch
wires it up to the existing FIFO profiling nodes. Next up is a generic
debugfs interface for this profiler in the Linux code.
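
For example, the debugfs flow might look roughly like the userspace
sketch below; the debugfs mount point and the per-GPU directory name
are assumptions and vary by platform:

/* Enable profiling, run some work, then read back the stats node. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[512];
	ssize_t n;
	int fd;

	fd = open("/sys/kernel/debug/gpu.0/profile/enable", O_WRONLY);
	if (fd < 0)
		return 1;
	(void) write(fd, "1", 1);	/* non-zero value enables profiling */
	close(fd);

	/* ... submit work so the profiled path records samples ... */

	fd = open("/sys/kernel/debug/gpu.0/profile/stats", O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		(void) fwrite(buf, 1, (size_t) n, stdout);
	close(fd);
	return 0;
}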

The end goal for this is to profile the recovery code and generate
interesting statistics.

JIRA NVGPU-5606

Signed-off-by: Alex Waterman <alexw@nvidia.com>
Change-Id: I99783ec7e5143855845bde4e98760ff43350456d
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2355319
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Alex Waterman <alexw@nvidia.com>
Date:   2020-06-01 19:56:37 -05:00
Parent: 59eb714c48
Commit: 70ce67df2d
18 changed files with 579 additions and 292 deletions

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2019 NVIDIA Corporation. All rights reserved.
+ * Copyright (C) 2017-2020 NVIDIA Corporation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -9,7 +9,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
- *
  */
 
 #include "debug_fifo.h"
@@ -24,7 +23,9 @@
 #include <nvgpu/gr/ctx.h>
 #include <nvgpu/engines.h>
 #include <nvgpu/runlist.h>
-#include <nvgpu/profile.h>
+#include <nvgpu/swprofile.h>
+#include <nvgpu/fifo/swprofile.h>
 
-void __gk20a_fifo_profile_free(struct nvgpu_ref *ref);
@@ -147,41 +148,12 @@ static int gk20a_fifo_profile_enable(void *data, u64 val)
 	struct gk20a *g = (struct gk20a *) data;
 	struct nvgpu_fifo *f = &g->fifo;
 
-	nvgpu_mutex_acquire(&f->profile.lock);
 	if (val == 0) {
-		if (f->profile.enabled) {
-			f->profile.enabled = false;
-			nvgpu_ref_put(&f->profile.ref,
-				__gk20a_fifo_profile_free);
-		}
-	} else {
-		if (!f->profile.enabled) {
-			/* not kref init as it can have a running condition if
-			 * we enable/disable/enable while kickoff is happening
-			 */
-			if (!nvgpu_ref_get_unless_zero(&f->profile.ref)) {
-				f->profile.data = nvgpu_vzalloc(g,
-					FIFO_PROFILING_ENTRIES *
-					sizeof(struct nvgpu_profile));
-				f->profile.sorted = nvgpu_vzalloc(g,
-					FIFO_PROFILING_ENTRIES *
-					sizeof(u64));
-				if (!(f->profile.data && f->profile.sorted)) {
-					nvgpu_vfree(g, f->profile.data);
-					nvgpu_vfree(g, f->profile.sorted);
-					nvgpu_mutex_release(&f->profile.lock);
-					return -ENOMEM;
-				}
-				nvgpu_ref_init(&f->profile.ref);
-			}
-			atomic_set(&f->profile.get.atomic_var, 0);
-			f->profile.enabled = true;
-		}
+		nvgpu_swprofile_close(&f->kickoff_profiler);
+		return 0;
 	}
-	nvgpu_mutex_release(&f->profile.lock);
-	return 0;
+
+	return nvgpu_swprofile_open(g, &f->kickoff_profiler);
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(
@@ -191,96 +163,20 @@ DEFINE_SIMPLE_ATTRIBUTE(
 	"%llu\n"
 );
 
-static int __profile_cmp(const void *a, const void *b)
+static void gk20a_fifo_write_to_seqfile_no_nl(void *ctx, const char *str)
 {
-	return *((unsigned long long *) a) - *((unsigned long long *) b);
-}
-
-/*
- * This uses about 800b in the stack, but the function using it is not part
- * of a callstack where much memory is being used, so it is fine
- */
-#define PERCENTILE_WIDTH 5
-#define PERCENTILE_RANGES (100/PERCENTILE_WIDTH)
-
-static unsigned int __gk20a_fifo_create_stats(struct gk20a *g,
-		u64 *percentiles, u32 index_end, u32 index_start)
-{
-	unsigned int nelem = 0;
-	unsigned int index;
-	struct nvgpu_profile *profile;
-
-	for (index = 0; index < FIFO_PROFILING_ENTRIES; index++) {
-		profile = &g->fifo.profile.data[index];
-
-		if (profile->timestamp[index_end] >
-				profile->timestamp[index_start]) {
-			/* This is a valid element */
-			g->fifo.profile.sorted[nelem] =
-					profile->timestamp[index_end] -
-					profile->timestamp[index_start];
-			nelem++;
-		}
-	}
-
-	/* sort it */
-	sort(g->fifo.profile.sorted, nelem, sizeof(unsigned long long),
-		__profile_cmp, NULL);
-
-	/* build ranges */
-	for (index = 0; index < PERCENTILE_RANGES; index++) {
-		percentiles[index] = nelem < PERCENTILE_RANGES ? 0 :
-			g->fifo.profile.sorted[(PERCENTILE_WIDTH * (index + 1) *
-				nelem)/100 - 1];
-	}
-
-	return nelem;
+	seq_printf((struct seq_file *)ctx, str);
 }
 
 static int gk20a_fifo_profile_stats(struct seq_file *s, void *unused)
 {
 	struct gk20a *g = s->private;
-	unsigned int get, nelem, index;
-	/*
-	 * 800B in the stack, but function is declared statically and only
-	 * called from debugfs handler
-	 */
-	u64 percentiles_ioctl[PERCENTILE_RANGES];
-	u64 percentiles_kickoff[PERCENTILE_RANGES];
-	u64 percentiles_jobtracking[PERCENTILE_RANGES];
-	u64 percentiles_append[PERCENTILE_RANGES];
-	u64 percentiles_userd[PERCENTILE_RANGES];
-
-	if (!nvgpu_ref_get_unless_zero(&g->fifo.profile.ref)) {
-		seq_printf(s, "Profiling disabled\n");
-		return 0;
-	}
-
-	get = atomic_read(&g->fifo.profile.get.atomic_var);
-
-	__gk20a_fifo_create_stats(g, percentiles_ioctl,
-			PROFILE_IOCTL_EXIT, PROFILE_IOCTL_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_kickoff,
-			PROFILE_END, PROFILE_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_jobtracking,
-			PROFILE_JOB_TRACKING, PROFILE_IOCTL_ENTRY);
-	__gk20a_fifo_create_stats(g, percentiles_append,
-			PROFILE_APPEND, PROFILE_JOB_TRACKING);
-	nelem = __gk20a_fifo_create_stats(g, percentiles_userd,
-			PROFILE_END, PROFILE_APPEND);
-
-	seq_printf(s, "Number of kickoffs: %d\n", nelem);
-	seq_printf(s, "Perc \t ioctl(ns) \t kickoff(ns) \t pbcopy(ns) \t jobtrack(ns) \t userd(ns)\n");
-	for (index = 0; index < PERCENTILE_RANGES; index++)
-		seq_printf(s, "[%2dpc]\t%8lld\t%8lld\t%8lld\t%8lld\t%8lld\n",
-			PERCENTILE_WIDTH * (index+1),
-			percentiles_ioctl[index],
-			percentiles_kickoff[index],
-			percentiles_append[index],
-			percentiles_jobtracking[index],
-			percentiles_userd[index]);
-
-	nvgpu_ref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
+	struct nvgpu_debug_context o = {
+		.fn = gk20a_fifo_write_to_seqfile_no_nl,
+		.ctx = s,
+	};
+
+	nvgpu_swprofile_print_ranges(g, &g->fifo.kickoff_profiler, &o);
 
 	return 0;
 }
@@ -297,7 +193,6 @@ static const struct file_operations gk20a_fifo_profile_stats_debugfs_fops = {
 	.release = single_release,
 };
 
-
 void gk20a_fifo_debugfs_init(struct gk20a *g)
 {
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
@@ -318,11 +213,6 @@ void gk20a_fifo_debugfs_init(struct gk20a *g)
 	if (IS_ERR_OR_NULL(profile_root))
 		return;
 
-	nvgpu_mutex_init(&g->fifo.profile.lock);
-	g->fifo.profile.enabled = false;
-	atomic_set(&g->fifo.profile.get.atomic_var, 0);
-	atomic_set(&g->fifo.profile.ref.refcount.atomic_var, 0);
-
 	debugfs_create_file("enable", 0600, profile_root, g,
 			&gk20a_fifo_profile_enable_debugfs_fops);
@@ -330,54 +220,3 @@ void gk20a_fifo_debugfs_init(struct gk20a *g)
 			&gk20a_fifo_profile_stats_debugfs_fops);
 }
-
-void nvgpu_profile_snapshot(struct nvgpu_profile *profile, int idx)
-{
-	if (profile)
-		profile->timestamp[idx] = nvgpu_current_time_ns();
-}
-
-void __gk20a_fifo_profile_free(struct nvgpu_ref *ref)
-{
-	struct nvgpu_fifo *f = container_of(ref, struct nvgpu_fifo,
-			profile.ref);
-
-	nvgpu_vfree(f->g, f->profile.data);
-	nvgpu_vfree(f->g, f->profile.sorted);
-}
-
-/* Get the next element in the ring buffer of profile entries
- * and grab a reference to the structure
- */
-struct nvgpu_profile *nvgpu_profile_acquire(struct gk20a *g)
-{
-	struct nvgpu_fifo *f = &g->fifo;
-	struct nvgpu_profile *profile;
-	unsigned int index;
-
-	/* If kref is zero, profiling is not enabled */
-	if (!nvgpu_ref_get_unless_zero(&f->profile.ref))
-		return NULL;
-
-	index = atomic_inc_return(&f->profile.get.atomic_var);
-	profile = &f->profile.data[index % FIFO_PROFILING_ENTRIES];
-
-	return profile;
-}
-
-/* Free the reference to the structure. This allows deferred cleanups */
-void nvgpu_profile_release(struct gk20a *g,
-		struct nvgpu_profile *profile)
-{
-	nvgpu_ref_put(&g->fifo.profile.ref, __gk20a_fifo_profile_free);
-}
-
-void gk20a_fifo_debugfs_deinit(struct gk20a *g)
-{
-	struct nvgpu_fifo *f = &g->fifo;
-
-	nvgpu_mutex_acquire(&f->profile.lock);
-	if (f->profile.enabled) {
-		f->profile.enabled = false;
-		nvgpu_ref_put(&f->profile.ref, __gk20a_fifo_profile_free);
-	}
-	nvgpu_mutex_release(&f->profile.lock);
-}