gpu: nvgpu: support per-channel wdt timeouts

Replace the padding field in nvgpu_channel_wdt_args with a timeout value in
milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to signal
that this new field is valid. When the flag is included in wdt_status, the
field sets a per-channel timeout that overrides the per-GPU default.

Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug
dump when a timed-out channel gets recovered by the watchdog; printing the
dump to a serial console can easily take several seconds. (Note that the
ctxsw timeout has its own, separate NVGPU_TIMEOUT_FLAG_DISABLE_DUMP for
NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX.)

The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either
NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT must now
be set in wdt_status; previously, other values were silently ignored.
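
As an illustration only (not part of this change), a userspace caller
could now configure the watchdog roughly as in the sketch below. The
channel fd and the <linux/nvgpu.h> include path are assumptions; the
ioctl name, the flag names and the timeout_ms field are the ones this
patch defines.

  #include <sys/ioctl.h>
  #include <linux/nvgpu.h>   /* assumed location of the nvgpu uapi header */

  /* Hypothetical helper: enable the watchdog on an already-open channel
   * fd with a 2000 ms per-channel timeout and no debug dump on recovery. */
  static int set_channel_wdt(int channel_fd)
  {
          struct nvgpu_channel_wdt_args args = {
                  /* ENABLE_WDT or DISABLE_WDT is now mandatory */
                  .wdt_status = NVGPU_IOCTL_CHANNEL_ENABLE_WDT |
                                NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT |
                                NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP,
                  .timeout_ms = 2000,
          };

          return ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_WDT, &args);
  }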

The debugfs-controlled global default ch_wdt_timeout_ms now takes effect
only for newly opened channels, instead of applying to existing channels in
real time. Also, a value of zero no longer disables the watchdog; the
explicit disable flag exists for that purpose.
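
Similarly, since zero is now an ordinary timeout value, turning the
watchdog off from userspace means sending the explicit disable bit; a
minimal sketch under the same assumptions (and includes) as above:

  /* Hypothetical helper: timeout_ms = 0 no longer disables the watchdog,
   * the explicit flag does. */
  static int disable_channel_wdt(int channel_fd)
  {
          struct nvgpu_channel_wdt_args args = {
                  .wdt_status = NVGPU_IOCTL_CHANNEL_DISABLE_WDT,
          };

          return ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_WDT, &args);
  }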

gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no
engines were found. Correct this.

Bug 1982826
Bug 1985845
Jira NVGPU-73

Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1510898
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
7 changed files with 50 additions and 30 deletions

@@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	 */
 	need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
 			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
-			c->wdt_enabled ||
+			c->timeout.enabled ||
 			(g->can_railgate && !c->deterministic) ||
 			!skip_buffer_refcounting;
 
@@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	 */
 	need_deferred_cleanup = !c->deterministic ||
 			need_sync_framework ||
-			c->wdt_enabled ||
+			c->timeout.enabled ||
 			(g->can_railgate &&
 			 !c->deterministic) ||
 			!skip_buffer_refcounting;

@@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch,
 static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
 		struct nvgpu_channel_wdt_args *args)
 {
-	if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
-		ch->wdt_enabled = false;
-	else if (args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
-		ch->wdt_enabled = true;
+	u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
+			NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
+
+	if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
+		ch->timeout.enabled = false;
+	else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
+		ch->timeout.enabled = true;
+	else
+		return -EINVAL;
+
+	if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT)
+		ch->timeout.limit_ms = args->timeout_ms;
+
+	ch->timeout.debug_dump = (args->wdt_status &
+			NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
 
 	return 0;
 }

@@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g,
 		err = -ENOMEM;
 		goto end;
 	}
-	ce_ctx->ch->wdt_enabled = false;
+	ce_ctx->ch->timeout.enabled = false;
 
 	/* bind the channel to the vm */
 	err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);

@@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
 static struct channel_gk20a_job *channel_gk20a_joblist_peek(
 		struct channel_gk20a *c);
 
-static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
-
 /* allocate GPU channel */
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
 {
@@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	/* By default, channel is regular (non-TSG) channel */
 	ch->tsgid = NVGPU_INVALID_TSG_ID;
 
-	/* reset timeout counter and update timestamp */
+	/* clear ctxsw timeout counter and update timestamp */
 	ch->timeout_accumulated_ms = 0;
 	ch->timeout_gpfifo_get = 0;
 	/* set gr host default timeout */
 	ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
 	ch->timeout_debug_dump = true;
 	ch->has_timedout = false;
-	ch->wdt_enabled = true;
+
+	/* init kernel watchdog timeout */
+	ch->timeout.enabled = true;
+	ch->timeout.limit_ms = g->ch_wdt_timeout_ms;
+	ch->timeout.debug_dump = true;
+
 	ch->obj_class = 0;
 	ch->subctx_id = 0;
 	ch->runqueue_sel = 0;
@@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 		}
 	}
 
-	if (!c->g->timeouts_enabled || !c->wdt_enabled)
+	if (!c->g->timeouts_enabled || !c->timeout.enabled)
 		acquire_timeout = 0;
 	else
-		acquire_timeout = gk20a_get_channel_watchdog_timeout(c);
+		acquire_timeout = c->timeout.limit_ms;
 
 	err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va,
 			c->gpfifo.entry_num,
@@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
-static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
-{
-	return ch->g->ch_wdt_timeout_ms;
-}
-
 u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
@@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
 	ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch);
 	ch->timeout.running = true;
 	nvgpu_timeout_init(ch->g, &ch->timeout.timer,
-			gk20a_get_channel_watchdog_timeout(ch),
+			ch->timeout.limit_ms,
 			NVGPU_TIMER_CPU_TIMER);
 }
@@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
  */
 static void gk20a_channel_timeout_start(struct channel_gk20a *ch)
 {
-	if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch))
+	if (!ch->g->timeouts_enabled)
 		return;
 
-	if (!ch->wdt_enabled)
+	if (!ch->timeout.enabled)
 		return;
 
 	nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
@@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 	nvgpu_err(g, "Job on channel %d timed out",
 		  ch->chid);
 
-	gk20a_debug_dump(g);
-	gk20a_gr_debug_dump(g);
+	/* force reset calls gk20a_debug_dump but not this */
+	if (ch->timeout.debug_dump)
+		gk20a_gr_debug_dump(g);
 
 	g->ops.fifo.force_reset_ch(ch,
-			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true);
+			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
+			ch->timeout.debug_dump);
 }
 
 /**

@@ -96,11 +96,17 @@ struct channel_gk20a_joblist {
 };
 
 struct channel_gk20a_timeout {
+	/* lock protects the running timer state */
 	struct nvgpu_raw_spinlock lock;
 	struct nvgpu_timeout timer;
 	bool running;
 	u32 gp_get;
 	u64 pb_get;
+
+	/* lock not needed */
+	u32 limit_ms;
+	bool enabled;
+	bool debug_dump;
 };
 
 /*
@@ -167,7 +173,6 @@ struct channel_gk20a {
 	struct nvgpu_semaphore_int *hw_sema;
 
 	int chid;
-	bool wdt_enabled;
 	nvgpu_atomic_t bound;
 	bool vpr;
 	bool deterministic;
@@ -203,7 +208,9 @@ struct channel_gk20a {
 	u32 timeout_accumulated_ms;
 	u32 timeout_gpfifo_get;
 
+	/* kernel watchdog to kill stuck jobs */
 	struct channel_gk20a_timeout timeout;
+
 	/* for job cleanup handling in the background worker */
 	struct nvgpu_list_node worker_item;

@@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 	else {
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 
-		if (gk20a_fifo_error_tsg(g, tsg))
+		if (gk20a_fifo_error_tsg(g, tsg) && verbose)
 			gk20a_debug_dump(g);
 
 		gk20a_fifo_abort_tsg(g, tsgid, false);

@@ -1577,13 +1577,15 @@ struct nvgpu_cycle_stats_snapshot_args {
 #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH	1
 #define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH	2
 
-/* disable watchdog per-channel */
+/* configure watchdog per-channel */
 struct nvgpu_channel_wdt_args {
 	__u32 wdt_status;
-	__u32 padding;
+	__u32 timeout_ms;
 };
-#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT	1
-#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT	2
+#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT			(1 << 0)
+#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT			(1 << 1)
+#define NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT	(1 << 2)
+#define NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP	(1 << 3)
 
 /*
  * Interleaving channels in a runlist is an approach to improve