diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index 09425c040..57d201e3f 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -459,6 +459,114 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 };
 
 #ifdef CONFIG_NVGPU_CHANNEL_WDT
+void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump)
+{
+	ch->wdt_debug_dump = dump;
+}
+
+static struct nvgpu_channel_wdt_state nvgpu_channel_collect_wdt_state(
+		struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+	struct nvgpu_channel_wdt_state state = { 0, 0 };
+
+	/*
+	 * Note: just checking for nvgpu_channel_wdt_enabled() is not enough at
+	 * the moment because system suspend puts g->regs away but doesn't stop
+	 * the worker thread that runs the watchdog. This might need to be
+	 * cleared up in the future.
+	 */
+	if (nvgpu_channel_wdt_running(ch->wdt)) {
+		/*
+		 * Read the state only if the wdt is on to avoid unnecessary
+		 * accesses. The kernel mem for userd may not even exist; this
+		 * channel could be in usermode submit mode.
+		 */
+		state.gp_get = g->ops.userd.gp_get(g, ch);
+		state.pb_get = g->ops.userd.pb_get(g, ch);
+	}
+
+	return state;
+}
+
+static void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch)
+{
+	struct nvgpu_channel_wdt_state state = nvgpu_channel_collect_wdt_state(ch);
+
+	/*
+	 * FIXME: channel recovery can race the submit path and can start even
+	 * after this, but this check is the best we can do for now.
+	 */
+	if (!nvgpu_channel_check_unserviceable(ch)) {
+		nvgpu_channel_wdt_start(ch->wdt, &state);
+	}
+}
+
+
+void nvgpu_channel_restart_all_wdts(struct gk20a *g)
+{
+	struct nvgpu_fifo *f = &g->fifo;
+	u32 chid;
+
+	for (chid = 0; chid < f->num_channels; chid++) {
+		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
+
+		if (ch != NULL) {
+			if ((ch->wdt != NULL) &&
+					!nvgpu_channel_check_unserviceable(ch)) {
+				struct nvgpu_channel_wdt_state state =
+					nvgpu_channel_collect_wdt_state(ch);
+
+				nvgpu_channel_wdt_rewind(ch->wdt, &state);
+			}
+			nvgpu_channel_put(ch);
+		}
+	}
+}
+
+static void nvgpu_channel_recover_from_wdt(struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+
+	nvgpu_log_fn(g, " ");
+
+	if (nvgpu_channel_check_unserviceable(ch)) {
+		/* channel is already recovered */
+		nvgpu_info(g, "chid: %d unserviceable but wdt was ON", ch->chid);
+		return;
+	}
+
+	nvgpu_err(g, "Job on channel %d timed out", ch->chid);
+
+	/* force reset calls gk20a_debug_dump but not this */
+	if (ch->wdt_debug_dump) {
+		gk20a_gr_debug_dump(g);
+	}
+
+#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
+	if (g->ops.tsg.force_reset(ch,
+		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
+		ch->wdt_debug_dump) != 0) {
+		nvgpu_err(g, "failed tsg force reset for chid: %d", ch->chid);
+	}
+#endif
+}
+
+/*
+ * Test the watchdog progress. If the channel is stuck, reset it.
+ *
+ * The gpu is implicitly on at this point because the watchdog can only run on
+ * channels that have submitted jobs pending for cleanup.
+ */
+static void nvgpu_channel_check_wdt(struct nvgpu_channel *ch)
+{
+	struct nvgpu_channel_wdt_state state = nvgpu_channel_collect_wdt_state(ch);
+
+	if (nvgpu_channel_wdt_check(ch->wdt, &state)) {
+		nvgpu_channel_recover_from_wdt(ch);
+	}
+}
+
 static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
 {
 	struct nvgpu_channel_worker *ch_worker =
@@ -486,7 +594,7 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)
 
 		if (ch != NULL) {
 			if (!nvgpu_channel_check_unserviceable(ch)) {
-				nvgpu_channel_wdt_check(ch->wdt, ch);
+				nvgpu_channel_check_wdt(ch);
 			}
 			nvgpu_channel_put(ch);
 		}
@@ -521,6 +629,8 @@ static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
 	return ch_worker->watchdog_interval;
 }
 
+#else
+static void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch) {}
 #endif /* CONFIG_NVGPU_CHANNEL_WDT */
 
 static inline struct nvgpu_channel *
@@ -635,7 +745,7 @@ int nvgpu_channel_add_job(struct nvgpu_channel *c,
 		job->num_mapped_buffers = num_mapped_buffers;
 		job->mapped_buffers = mapped_buffers;
 
-		nvgpu_channel_wdt_start(c->wdt, c);
+		nvgpu_channel_launch_wdt(c);
 
 		nvgpu_channel_joblist_lock(c);
 		nvgpu_channel_joblist_add(c, job);
@@ -1456,11 +1566,12 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
 	ch->unserviceable = true;
 
 #ifdef CONFIG_NVGPU_CHANNEL_WDT
-	ch->wdt = nvgpu_channel_wdt_alloc(ch);
+	ch->wdt = nvgpu_channel_wdt_alloc(g);
 	if (ch->wdt == NULL) {
 		nvgpu_err(g, "wdt alloc failed");
 		goto clean_up;
 	}
+	ch->wdt_debug_dump = true;
 #endif
 
 	ch->obj_class = 0;
diff --git a/drivers/gpu/nvgpu/common/fifo/watchdog.c b/drivers/gpu/nvgpu/common/fifo/watchdog.c
index 02f55cfa1..aad6f8ab1 100644
--- a/drivers/gpu/nvgpu/common/fifo/watchdog.c
+++ b/drivers/gpu/nvgpu/common/fifo/watchdog.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 
 struct nvgpu_channel_wdt {
 	struct gk20a *g;
@@ -33,18 +34,15 @@ struct nvgpu_channel_wdt {
 	struct nvgpu_spinlock lock;
 	struct nvgpu_timeout timer;
 	bool running;
-	u32 gp_get;
-	u64 pb_get;
+	struct nvgpu_channel_wdt_state ch_state;
 
 	/* lock not needed */
 	u32 limit_ms;
 	bool enabled;
-	bool debug_dump;
 };
 
-struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch)
+struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g)
 {
-	struct gk20a *g = ch->g;
 	struct nvgpu_channel_wdt *wdt = nvgpu_kzalloc(g, sizeof(*wdt));
 
 	if (wdt == NULL) {
@@ -55,7 +53,6 @@ struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch)
 	nvgpu_spinlock_init(&wdt->lock);
 	wdt->enabled = true;
 	wdt->limit_ms = g->ch_wdt_init_limit_ms;
-	wdt->debug_dump = true;
 
 	return wdt;
 }
@@ -90,22 +87,12 @@ u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt)
 	return wdt->limit_ms;
 }
 
-void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump)
-{
-	wdt->debug_dump = dump;
-}
-
 static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+		struct nvgpu_channel_wdt_state *state)
 {
 	struct gk20a *g = wdt->g;
 	int ret;
 
-	if (nvgpu_channel_check_unserviceable(ch)) {
-		wdt->running = false;
-		return;
-	}
-
 	ret = nvgpu_timeout_init(g, &wdt->timer,
 			wdt->limit_ms,
 			NVGPU_TIMER_CPU_TIMER);
@@ -114,8 +101,7 @@ static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
 		return;
 	}
 
-	wdt->gp_get = g->ops.userd.gp_get(g, ch);
-	wdt->pb_get = g->ops.userd.pb_get(g, ch);
+	wdt->ch_state = *state;
 	wdt->running = true;
 }
 
@@ -129,13 +115,9 @@ static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
  * If the timeout is already running, do nothing. This should be called when
  * new jobs are submitted. The timeout will stop when the last tracked job
  * finishes, making the channel idle.
- *
- * The channel's gpfifo read pointer will be used to determine if the job has
- * actually stuck at that time. After the timeout duration has expired, a
- * worker thread will consider the channel stuck and recover it if stuck.
  */
 void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+		struct nvgpu_channel_wdt_state *state)
 {
 	if (!nvgpu_is_timeouts_enabled(wdt->g)) {
 		return;
@@ -151,7 +133,7 @@ void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
 		nvgpu_spinlock_release(&wdt->lock);
 		return;
 	}
-	nvgpu_channel_wdt_init(wdt, ch);
+	nvgpu_channel_wdt_init(wdt, state);
 
 	nvgpu_spinlock_release(&wdt->lock);
 }
@@ -203,103 +185,69 @@ void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt)
  * timeouts. Stopped timeouts can only be started (which is technically a
  * rewind too) or continued (where the stop is actually pause).
  */
-static void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
 {
 	nvgpu_spinlock_acquire(&wdt->lock);
 	if (wdt->running) {
-		nvgpu_channel_wdt_init(wdt, ch);
+		nvgpu_channel_wdt_init(wdt, state);
 	}
 	nvgpu_spinlock_release(&wdt->lock);
 }
 
 /**
- * Rewind the timeout on each non-dormant channel.
+ * Check if the watchdog is running.
  *
- * Reschedule the timeout of each active channel for which timeouts are running
- * as if something was happened on each channel right now. This should be
- * called when a global hang is detected that could cause a false positive on
- * other innocent channels.
+ * A running watchdog means one that is requested to run and expire in the
+ * future. The state of a running watchdog has to be checked periodically to
+ * see if it's expired.
  */
-void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g)
+bool nvgpu_channel_wdt_running(struct nvgpu_channel_wdt *wdt)
 {
-	struct nvgpu_fifo *f = &g->fifo;
-	u32 chid;
+	bool running;
 
-	for (chid = 0; chid < f->num_channels; chid++) {
-		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
+	nvgpu_spinlock_acquire(&wdt->lock);
+	running = wdt->running;
+	nvgpu_spinlock_release(&wdt->lock);
 
-		if (ch != NULL) {
-			if ((ch->wdt != NULL) &&
-				(!nvgpu_channel_check_unserviceable(ch))) {
-				nvgpu_channel_wdt_rewind(ch->wdt, ch);
-			}
-			nvgpu_channel_put(ch);
-		}
-	}
+	return running;
 }
 
 /**
- * Check if a timed out channel has hung and recover it if it has.
+ * Check if a channel has been stuck for the watchdog limit.
  *
  * Test if this channel has really got stuck at this point by checking if its
- * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
- * when the watchdog was started and it's timed out, force-reset the channel.
- *
- * The gpu is implicitly on at this point, because the watchdog can only run on
- * channels that have submitted jobs pending for cleanup.
+ * {gp,pb}_get have advanced or not. If progress was detected, start the timer
+ * from zero again. If no {gp,pb}_get action happened in the watchdog time
+ * limit, return true. Else return false.
  */
-static void nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+static bool nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
 {
 	struct gk20a *g = wdt->g;
-	u32 gp_get;
-	u32 new_gp_get;
-	u64 pb_get;
-	u64 new_pb_get;
+	struct nvgpu_channel_wdt_state previous_state;
 
 	nvgpu_log_fn(g, " ");
 
-	if (nvgpu_channel_check_unserviceable(ch)) {
-		/* channel is already recovered */
-		if (nvgpu_channel_wdt_stop(wdt) == true) {
-			nvgpu_info(g, "chid: %d unserviceable but wdt was ON",
-				ch->chid);
-		}
-		return;
-	}
-
 	/* Get status but keep timer running */
 	nvgpu_spinlock_acquire(&wdt->lock);
-	gp_get = wdt->gp_get;
-	pb_get = wdt->pb_get;
+	previous_state = wdt->ch_state;
 	nvgpu_spinlock_release(&wdt->lock);
 
-	new_gp_get = g->ops.userd.gp_get(g, ch);
-	new_pb_get = g->ops.userd.pb_get(g, ch);
-
-	if (new_gp_get != gp_get || new_pb_get != pb_get) {
+	if (nvgpu_memcmp((const u8 *)state,
+			(const u8 *)&previous_state,
+			sizeof(*state)) != 0) {
 		/* Channel has advanced, timer keeps going but resets */
-		nvgpu_channel_wdt_rewind(wdt, ch);
-	} else if (!nvgpu_timeout_peek_expired(&wdt->timer)) {
-		/* Seems stuck but waiting to time out */
-	} else {
-		nvgpu_err(g, "Job on channel %d timed out", ch->chid);
-
-		/* force reset calls gk20a_debug_dump but not this */
-		if (wdt->debug_dump) {
-			gk20a_gr_debug_dump(g);
-		}
-
-#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
-		if (g->ops.tsg.force_reset(ch,
-			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
-			wdt->debug_dump) != 0) {
-			nvgpu_err(g, "failed tsg force reset for chid: %d",
-				ch->chid);
-		}
-#endif
+		nvgpu_channel_wdt_rewind(wdt, state);
+		return false;
 	}
+
+	if (!nvgpu_timeout_peek_expired(&wdt->timer)) {
+		/* Seems stuck but waiting to time out */
+		return false;
+	}
+
+	return true;
 }
 
 /**
@@ -313,8 +261,8 @@ static void nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
  * The timeout is stopped (disabled) after the last job in a row finishes
  * and marks the channel idle.
  */
-void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
 {
 	bool running;
 
@@ -323,6 +271,8 @@ void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
 	nvgpu_spinlock_release(&wdt->lock);
 
 	if (running) {
-		nvgpu_channel_wdt_handler(wdt, ch);
+		return nvgpu_channel_wdt_handler(wdt, state);
+	} else {
+		return false;
 	}
 }
diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c
index f48d34c96..a8e57b903 100644
--- a/drivers/gpu/nvgpu/common/rc/rc.c
+++ b/drivers/gpu/nvgpu/common/rc/rc.c
@@ -26,7 +26,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -71,10 +70,12 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 
 #ifdef CONFIG_NVGPU_RECOVERY
 	/*
-	 * Cancel all channels' wdt since ctxsw timeout might
-	 * trigger multiple watchdogs at a time
+	 * Cancel all channels' wdt since ctxsw timeout causes the runlist to
+	 * get stuck and might falsely trigger multiple watchdogs at a time. We
+	 * won't detect proper wdt timeouts that would have happened, but if
+	 * they're stuck, they will trigger the wdt soon enough again.
 	 */
-	nvgpu_channel_wdt_restart_all_channels(g);
+	nvgpu_channel_restart_all_wdts(g);
 
 	nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true,
 			debug_dump, RC_TYPE_CTXSW_TIMEOUT);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 27fee066e..6217dac39 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -377,6 +377,7 @@ struct nvgpu_channel {
 
 	/* kernel watchdog to kill stuck jobs */
 	struct nvgpu_channel_wdt *wdt;
+	bool wdt_debug_dump;
 
 	/** Fence allocator in case of deterministic submit. */
 	struct nvgpu_allocator fence_allocator;
@@ -1161,4 +1162,27 @@ int nvgpu_channel_deferred_reset_engines(struct gk20a *g,
 		struct nvgpu_channel *ch);
 #endif
 
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+/**
+ * @brief Rewind the timeout on each non-dormant channel.
+ *
+ * Reschedule the timeout of each active channel for which timeouts are running
+ * as if something had happened on each channel right now. This should be
+ * called when a global hang is detected that could cause a false positive on
+ * other innocent channels.
+ */
+void nvgpu_channel_restart_all_wdts(struct gk20a *g);
+/**
+ * @brief Enable or disable full debug dump on wdt error.
+ *
+ * Set the policy on whether or not to do the verbose channel and gr debug dump
+ * when the channel gets recovered as a result of a watchdog timeout.
+ */
+void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump);
+#else
+static inline void nvgpu_channel_restart_all_wdts(struct gk20a *g) {}
+static inline void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch,
+		bool dump) {}
+#endif
+
 #endif
diff --git a/drivers/gpu/nvgpu/include/nvgpu/watchdog.h b/drivers/gpu/nvgpu/include/nvgpu/watchdog.h
index 4f2c8e78a..cf4ed157a 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/watchdog.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/watchdog.h
@@ -23,14 +23,19 @@
 #ifndef NVGPU_WATCHDOG_H
 #define NVGPU_WATCHDOG_H
 
-#ifdef CONFIG_NVGPU_CHANNEL_WDT
+#include
 
 struct gk20a;
-struct nvgpu_channel;
-struct nvgpu_worker;
 struct nvgpu_channel_wdt;
 
-struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch);
+struct nvgpu_channel_wdt_state {
+	u64 gp_get;
+	u64 pb_get;
+};
+
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+
+struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g);
 void nvgpu_channel_wdt_destroy(struct nvgpu_channel_wdt *wdt);
 
 void nvgpu_channel_wdt_enable(struct nvgpu_channel_wdt *wdt);
@@ -39,21 +44,21 @@ bool nvgpu_channel_wdt_enabled(struct nvgpu_channel_wdt *wdt);
 void nvgpu_channel_wdt_set_limit(struct nvgpu_channel_wdt *wdt, u32 limit_ms);
 u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt);
-void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump);
 
 void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch);
-void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt);
+		struct nvgpu_channel_wdt_state *state);
 bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt);
-void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch);
-
-void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g);
+void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt);
+void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state);
+bool nvgpu_channel_wdt_running(struct nvgpu_channel_wdt *wdt);
+bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state);
 
 #else /* CONFIG_NVGPU_CHANNEL_WDT */
 
 static inline struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(
-		struct nvgpu_channel *ch)
+		struct gk20a *g)
 {
 	return NULL;
 }
 
@@ -71,21 +76,19 @@ static inline u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt)
 {
 	return 0U;
 }
-static inline void nvgpu_channel_wdt_set_debug_dump(
-		struct nvgpu_channel_wdt *wdt,
-		bool dump) {}
-
 static inline void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch) {}
-static inline void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) {}
+		struct nvgpu_channel_wdt_state *state) {}
 static inline bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt)
 {
 	return false;
 }
-static inline void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch) {}
-
-static inline void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g) {}
+static inline void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) {}
+static inline void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state) {}
+static inline bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state) {
+	return false;
+}
 
 #endif /* CONFIG_NVGPU_CHANNEL_WDT */
 
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
index 2d41e129c..2611a67a7 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
@@ -322,7 +322,7 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
 	if (set_timeout)
 		nvgpu_channel_wdt_set_limit(ch->wdt, args->timeout_ms);
 
-	nvgpu_channel_wdt_set_debug_dump(ch->wdt, !disable_dump);
+	nvgpu_channel_set_wdt_debug_dump(ch, !disable_dump);
 
 	return 0;
 #else
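
Reviewer's note (not part of the patch): the heart of this refactor is that the watchdog core no longer touches the channel at all. It only stores and compares opaque nvgpu_channel_wdt_state snapshots of {gp,pb}_get, while the channel layer decides when to take a snapshot and what to do on expiry. The sketch below is a minimal, self-contained C model of that snapshot-compare loop using only libc; the wdt_model_* names are hypothetical stand-ins for illustration, not the driver's nvgpu_channel_wdt_* API, and the millisecond bookkeeping merely stands in for nvgpu_timeout.

/* Standalone model of the snapshot-compare watchdog flow (illustrative only). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wdt_model_state {		/* mirrors nvgpu_channel_wdt_state */
	uint64_t gp_get;
	uint64_t pb_get;
};

struct wdt_model {
	bool running;
	unsigned int elapsed_ms;	/* stands in for nvgpu_timeout */
	unsigned int limit_ms;
	struct wdt_model_state ch_state;
};

static void wdt_model_start(struct wdt_model *wdt,
			    const struct wdt_model_state *s)
{
	wdt->ch_state = *s;
	wdt->elapsed_ms = 0;
	wdt->running = true;
}

/* Returns true when the channel made no progress for longer than the limit. */
static bool wdt_model_check(struct wdt_model *wdt,
			    const struct wdt_model_state *s,
			    unsigned int delta_ms)
{
	if (!wdt->running)
		return false;

	if (memcmp(s, &wdt->ch_state, sizeof(*s)) != 0) {
		/* Progress detected: rewind the timer and keep running. */
		wdt_model_start(wdt, s);
		return false;
	}

	wdt->elapsed_ms += delta_ms;
	return wdt->elapsed_ms >= wdt->limit_ms;
}

int main(void)
{
	struct wdt_model wdt = { .limit_ms = 5000 };
	struct wdt_model_state s = { .gp_get = 1, .pb_get = 100 };

	wdt_model_start(&wdt, &s);

	s.pb_get = 200;					/* channel advanced */
	printf("%d\n", wdt_model_check(&wdt, &s, 3000));	/* 0: rewound */
	printf("%d\n", wdt_model_check(&wdt, &s, 3000));	/* 0: stuck, not yet expired */
	printf("%d\n", wdt_model_check(&wdt, &s, 3000));	/* 1: stuck past the limit */
	return 0;
}

Built with a C11 compiler, the three checks print 0, 0, 1: progress rewinds the timer, a stuck channel first waits out the limit, then reports expiry, which is the point where the patched driver's nvgpu_channel_check_wdt() would call nvgpu_channel_recover_from_wdt().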
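A second design point worth calling out: nvgpu_channel_wdt_handler() now compares the two snapshots with nvgpu_memcmp() over the whole struct rather than field by field. That is equivalent to a field-wise comparison only because nvgpu_channel_wdt_state consists of two naturally aligned u64 members and therefore has no padding bytes; a future field that introduced padding could make the comparison read indeterminate bytes and report spurious "progress". A hypothetical compile-time guard (not in the patch) could document that assumption:

/* Hypothetical guard, not part of the patch: the memcmp()-based progress
 * check assumes the wdt state struct has no padding bytes. */
#include <assert.h>
#include <stdint.h>

struct nvgpu_channel_wdt_state_model {	/* stand-in for the real struct */
	uint64_t gp_get;
	uint64_t pb_get;
};

static_assert(sizeof(struct nvgpu_channel_wdt_state_model) ==
	      2U * sizeof(uint64_t),
	      "wdt state must stay padding-free for memcmp()-based compare");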