gpu: nvgpu: add support for refcount tracking

If enabled, track actions (gets and puts) on channel reference counters.
Dump the most recent actions to syslog if
gk20a_wait_until_counter_is_N gets stuck while closing a channel.
GK20A_CHANNEL_REFCOUNT_TRACKING specifies the size of the action
history. The default (0) disables tracking completely, as it has some
runtime overhead.

Bug 1826754

Change-Id: I880b0efe8881044d02ae224c243a51cb6c2db8c1
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1262424
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Konsta Holtta
2017-01-04 20:59:01 +02:00
committed by mobile promotions
parent 318524ee2f
commit 5e68c6e971
2 changed files with 126 additions and 2 deletions

View File

@@ -1,7 +1,7 @@
/* /*
* GK20A Graphics channel * GK20A Graphics channel
* *
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -62,6 +62,7 @@ struct channel_priv {
static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f); static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f);
static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c); static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
static void gk20a_channel_dump_ref_actions(struct channel_gk20a *c);
static void free_priv_cmdbuf(struct channel_gk20a *c, static void free_priv_cmdbuf(struct channel_gk20a *c,
struct priv_cmd_entry *e); struct priv_cmd_entry *e);
@@ -886,6 +887,8 @@ static void gk20a_wait_until_counter_is_N(
"%s: channel %d, still waiting, %s left: %d, waiting for: %d", "%s: channel %d, still waiting, %s left: %d, waiting for: %d",
caller, ch->hw_chid, counter_name, caller, ch->hw_chid, counter_name,
atomic_read(counter), wait_value); atomic_read(counter), wait_value);
gk20a_channel_dump_ref_actions(ch);
} }
} }
@@ -1054,6 +1057,11 @@ unbind:
if (channel_gk20a_is_prealloc_enabled(ch)) if (channel_gk20a_is_prealloc_enabled(ch))
channel_gk20a_free_prealloc_resources(ch); channel_gk20a_free_prealloc_resources(ch);
#if GK20A_CHANNEL_REFCOUNT_TRACKING
memset(ch->ref_actions, 0, sizeof(ch->ref_actions));
ch->ref_actions_put = 0;
#endif
/* make sure we catch accesses of unopened channels in case /* make sure we catch accesses of unopened channels in case
* there's non-refcounted channel pointers hanging around */ * there's non-refcounted channel pointers hanging around */
ch->g = NULL; ch->g = NULL;
@@ -1063,6 +1071,71 @@ unbind:
free_channel(f, ch); free_channel(f, ch);
} }
/*
 * Dump the recorded refcount history of a channel to the kernel log.
 *
 * Walks the ref_actions ring buffer under ref_actions_lock, starting at the
 * next insertion point (i.e. the oldest possible entry) and ending at the most
 * recently written one. For every slot that holds a captured stack trace it
 * prints whether the action was a get or a put, how many steps back in the
 * ring it is, its age, the time elapsed since the previously printed action,
 * and then the saved stack trace itself.
 *
 * Compiles to an empty function when GK20A_CHANNEL_REFCOUNT_TRACKING is 0.
 */
static void gk20a_channel_dump_ref_actions(struct channel_gk20a *ch)
{
#if GK20A_CHANNEL_REFCOUNT_TRACKING
	size_t i, get;
	unsigned long now = jiffies;
	/*
	 * Starts at 0, so the "diff" printed for the first entry is measured
	 * from jiffies value 0 rather than from an earlier action.
	 */
	unsigned long prev_jiffies = 0;
	struct device *dev = dev_from_gk20a(ch->g);

	spin_lock(&ch->ref_actions_lock);

	dev_info(dev, "ch %d: refs %d. Actions, most recent last:\n",
			ch->hw_chid, atomic_read(&ch->ref_count));

	/* start at the oldest possible entry. put is next insertion point */
	get = ch->ref_actions_put;

	/*
	 * If the buffer is not full, this will first loop to the oldest entry,
	 * skipping not-yet-initialized entries. There is no ref_actions_get.
	 */
	for (i = 0; i < GK20A_CHANNEL_REFCOUNT_TRACKING; i++) {
		struct channel_gk20a_ref_action *act = &ch->ref_actions[get];

		/* a slot with no captured frames has never been written */
		if (act->trace.nr_entries) {
			/* jiffies_to_msecs() yields unsigned int, hence %u */
			dev_info(dev, "%s ref %zu steps ago (age %u ms, diff %u ms)\n",
				act->type == channel_gk20a_ref_action_get
					? "GET" : "PUT",
				GK20A_CHANNEL_REFCOUNT_TRACKING - 1 - i,
				jiffies_to_msecs(now - act->jiffies),
				jiffies_to_msecs(act->jiffies - prev_jiffies));

			print_stack_trace(&act->trace, 0);
			prev_jiffies = act->jiffies;
		}

		get = (get + 1) % GK20A_CHANNEL_REFCOUNT_TRACKING;
	}

	spin_unlock(&ch->ref_actions_lock);
#endif
}
/*
 * Record one refcount action (get or put) in the channel's history ring.
 *
 * Under ref_actions_lock, fills the slot at ref_actions_put with the action
 * type, a freshly captured stack trace and the current jiffies value, then
 * advances ref_actions_put, wrapping at GK20A_CHANNEL_REFCOUNT_TRACKING.
 *
 * Compiles to an empty function when GK20A_CHANNEL_REFCOUNT_TRACKING is 0.
 */
static void gk20a_channel_save_ref_source(struct channel_gk20a *ch,
		enum channel_gk20a_ref_action_type type)
{
#if GK20A_CHANNEL_REFCOUNT_TRACKING
	struct channel_gk20a_ref_action *slot;
	struct stack_trace *trace;

	spin_lock(&ch->ref_actions_lock);

	slot = &ch->ref_actions[ch->ref_actions_put];
	trace = &slot->trace;

	/* point the trace at this slot's own storage and capture into it */
	trace->entries = slot->trace_entries;
	trace->max_entries = GK20A_CHANNEL_REFCOUNT_TRACKING_STACKLEN;
	trace->nr_entries = 0;
	trace->skip = 3; /* onwards from the caller of this */
	save_stack_trace(trace);

	slot->type = type;
	slot->jiffies = jiffies;

	/* move the write index forward, wrapping at the end of the ring */
	ch->ref_actions_put++;
	ch->ref_actions_put %= GK20A_CHANNEL_REFCOUNT_TRACKING;

	spin_unlock(&ch->ref_actions_lock);
#endif
}
/* Try to get a reference to the channel. Return nonzero on success. If fails, /* Try to get a reference to the channel. Return nonzero on success. If fails,
* the channel is dead or being freed elsewhere and you must not touch it. * the channel is dead or being freed elsewhere and you must not touch it.
* *
@@ -1082,6 +1155,7 @@ struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch,
spin_lock(&ch->ref_obtain_lock); spin_lock(&ch->ref_obtain_lock);
if (likely(ch->referenceable)) { if (likely(ch->referenceable)) {
gk20a_channel_save_ref_source(ch, channel_gk20a_ref_action_get);
atomic_inc(&ch->ref_count); atomic_inc(&ch->ref_count);
ret = ch; ret = ch;
} else } else
@@ -1097,6 +1171,7 @@ struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch,
void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller) void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller)
{ {
gk20a_channel_save_ref_source(ch, channel_gk20a_ref_action_put);
trace_gk20a_channel_put(ch->hw_chid, caller); trace_gk20a_channel_put(ch->hw_chid, caller);
atomic_dec(&ch->ref_count); atomic_dec(&ch->ref_count);
wake_up_all(&ch->ref_count_dec_wq); wake_up_all(&ch->ref_count_dec_wq);
@@ -2861,6 +2936,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
atomic_set(&c->ref_count, 0); atomic_set(&c->ref_count, 0);
c->referenceable = false; c->referenceable = false;
init_waitqueue_head(&c->ref_count_dec_wq); init_waitqueue_head(&c->ref_count_dec_wq);
#if GK20A_CHANNEL_REFCOUNT_TRACKING
spin_lock_init(&c->ref_actions_lock);
#endif
mutex_init(&c->ioctl_lock); mutex_init(&c->ioctl_lock);
mutex_init(&c->error_notifier_mutex); mutex_init(&c->error_notifier_mutex);
spin_lock_init(&c->joblist.dynamic.lock); spin_lock_init(&c->joblist.dynamic.lock);

View File

@@ -1,7 +1,7 @@
/* /*
* GK20A graphics channel * GK20A graphics channel
* *
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -24,6 +24,7 @@
#include <linux/semaphore.h> #include <linux/semaphore.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/stacktrace.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <uapi/linux/nvgpu.h> #include <uapi/linux/nvgpu.h>
@@ -115,6 +116,40 @@ struct channel_gk20a_clean_up {
struct delayed_work wq; struct delayed_work wq;
}; };
/*
 * Track refcount actions, saving their stack traces. This number specifies how
 * many most recent actions are stored in a buffer. Set to 0 to disable. 128
 * should be enough to track moderately hard problems from the start.
 *
 * The value is used both as the element count of the per-channel ref_actions
 * array and as the condition of the #if guards around the tracking code, so a
 * value of 0 compiles the tracking out.
 */
#define GK20A_CHANNEL_REFCOUNT_TRACKING 0
/*
 * Stack depth for the saved actions: the number of return addresses stored
 * per recorded get/put (size of the trace_entries array).
 */
#define GK20A_CHANNEL_REFCOUNT_TRACKING_STACKLEN 8
/*
* Because the puts and gets are not linked together explicitly (although they
* should always come in pairs), it's not possible to tell which ref holder to
* delete from the list when doing a put. So, just store some number of most
* recent gets and puts in a ring buffer, to obtain a history.
*
* These are zeroed when a channel is closed, so a new one starts fresh.
*/
/* Kind of refcount action stored in a channel's ref_actions history. */
enum channel_gk20a_ref_action_type {
	/* recorded when a reference to the channel is taken */
	channel_gk20a_ref_action_get = 0,
	/* recorded when a reference to the channel is dropped */
	channel_gk20a_ref_action_put = 1,
};
/*
 * One recorded refcount action: a single slot of a channel's ref_actions
 * ring buffer.
 */
struct channel_gk20a_ref_action {
/* whether this action was a get or a put */
enum channel_gk20a_ref_action_type type;
/* value of the jiffies counter when the action was recorded */
unsigned long jiffies;
/*
 * Many of these traces will be similar. Simpler to just capture
 * duplicates than to have a separate database for the entries.
 */
struct stack_trace trace;
/* storage for the captured return addresses; trace.entries points here */
unsigned long trace_entries[GK20A_CHANNEL_REFCOUNT_TRACKING_STACKLEN];
};
/* this is the priv element of struct nvhost_channel */ /* this is the priv element of struct nvhost_channel */
struct channel_gk20a { struct channel_gk20a {
struct gk20a *g; /* set only when channel is active */ struct gk20a *g; /* set only when channel is active */
@@ -125,6 +160,17 @@ struct channel_gk20a {
bool referenceable; bool referenceable;
atomic_t ref_count; atomic_t ref_count;
wait_queue_head_t ref_count_dec_wq; wait_queue_head_t ref_count_dec_wq;
#if GK20A_CHANNEL_REFCOUNT_TRACKING
/*
* Ring buffer for most recent refcount gets and puts. Protected by
* ref_actions_lock when getting or putting refs (i.e., adding
* entries), and when reading entries.
*/
struct channel_gk20a_ref_action ref_actions[
GK20A_CHANNEL_REFCOUNT_TRACKING];
size_t ref_actions_put; /* index of next write */
spinlock_t ref_actions_lock;
#endif
struct gk20a_semaphore_int *hw_sema; struct gk20a_semaphore_int *hw_sema;