mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-23 01:50:07 +03:00
The channel code needs the watchdog code and vice versa. Cut this circular dependency with a few simplifications so that the watchdog wouldn't depend on so much. When calling watchdog APIs that cause stores or comparisons of channel progress, provide a snapshot of the current progress instead of a whole channel pointer. struct nvgpu_channel_wdt_state is added as an interface for this to track gp_get and pb_get. When periodically checking the watchdog state, make the channel code ask whether a hang has been detected and abort the channel from within channel code instead of asking the watchdog to abort the channel. The debug dump verbosity flag is also moved back to the channel data. Move the functionality to restart all channels' watchdogs to channel code from watchdog code. Looping over active channels is not a good feature for the watchdog; it's better for the channel handling to just use the watchdog as a tracking tool. Move a few unserviceable checks up in the stack to the callers of the wdt code. They're a kludge but this will do for now and demonstrates what needs to be eventually fixed. This does not leave much code in the watchdog unit. Now the purpose of the watchdog is to only isolate the logic to couple a timer and progress snapshots with careful locking to start and stop the tracking. Jira NVGPU-5582 Change-Id: I7c728542ff30d88b1414500210be3fbaf61e6e8a Signed-off-by: Konsta Hölttä <kholtta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2369820 Reviewed-by: automaticguardword <automaticguardword@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: Alex Waterman <alexw@nvidia.com> Reviewed-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
279 lines
7.4 KiB
C
279 lines
7.4 KiB
C
/*
|
|
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
* DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#include <nvgpu/gk20a.h>
|
|
#include <nvgpu/channel.h>
|
|
#include <nvgpu/watchdog.h>
|
|
#include <nvgpu/error_notifier.h>
|
|
#include <nvgpu/watchdog.h>
|
|
#include <nvgpu/string.h>
|
|
|
|
struct nvgpu_channel_wdt {
|
|
struct gk20a *g;
|
|
|
|
/* lock protects the running timer state */
|
|
struct nvgpu_spinlock lock;
|
|
struct nvgpu_timeout timer;
|
|
bool running;
|
|
struct nvgpu_channel_wdt_state ch_state;
|
|
|
|
/* lock not needed */
|
|
u32 limit_ms;
|
|
bool enabled;
|
|
};
|
|
|
|
struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g)
|
|
{
|
|
struct nvgpu_channel_wdt *wdt = nvgpu_kzalloc(g, sizeof(*wdt));
|
|
|
|
if (wdt == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
wdt->g = g;
|
|
nvgpu_spinlock_init(&wdt->lock);
|
|
wdt->enabled = true;
|
|
wdt->limit_ms = g->ch_wdt_init_limit_ms;
|
|
|
|
return wdt;
|
|
}
|
|
|
|
void nvgpu_channel_wdt_destroy(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
nvgpu_kfree(wdt->g, wdt);
|
|
}
|
|
|
|
void nvgpu_channel_wdt_enable(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
wdt->enabled = true;
|
|
}
|
|
|
|
void nvgpu_channel_wdt_disable(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
wdt->enabled = false;
|
|
}
|
|
|
|
bool nvgpu_channel_wdt_enabled(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
return wdt->enabled;
|
|
}
|
|
|
|
/**
 * Set the watchdog limit, in milliseconds.
 *
 * The value is only stored here; it is handed to nvgpu_timeout_init() the
 * next time the timer is (re)initialized.
 */
void nvgpu_channel_wdt_set_limit(struct nvgpu_channel_wdt *wdt, u32 limit_ms)
{
	wdt->limit_ms = limit_ms;
}
|
|
|
|
u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
return wdt->limit_ms;
|
|
}
|
|
|
|
static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
|
|
struct nvgpu_channel_wdt_state *state)
|
|
{
|
|
struct gk20a *g = wdt->g;
|
|
int ret;
|
|
|
|
ret = nvgpu_timeout_init(g, &wdt->timer,
|
|
wdt->limit_ms,
|
|
NVGPU_TIMER_CPU_TIMER);
|
|
if (ret != 0) {
|
|
nvgpu_err(g, "timeout_init failed: %d", ret);
|
|
return;
|
|
}
|
|
|
|
wdt->ch_state = *state;
|
|
wdt->running = true;
|
|
}
|
|
|
|
/**
|
|
* Start a timeout counter (watchdog) on this channel.
|
|
*
|
|
* Trigger a watchdog to recover the channel after the per-platform timeout
|
|
* duration (but strictly no earlier) if the channel hasn't advanced within
|
|
* that time.
|
|
*
|
|
* If the timeout is already running, do nothing. This should be called when
|
|
* new jobs are submitted. The timeout will stop when the last tracked job
|
|
* finishes, making the channel idle.
|
|
*/
|
|
void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
|
|
struct nvgpu_channel_wdt_state *state)
|
|
{
|
|
if (!nvgpu_is_timeouts_enabled(wdt->g)) {
|
|
return;
|
|
}
|
|
|
|
if (!wdt->enabled) {
|
|
return;
|
|
}
|
|
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
|
|
if (wdt->running) {
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
return;
|
|
}
|
|
nvgpu_channel_wdt_init(wdt, state);
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
}
|
|
|
|
/**
|
|
* Stop a running timeout counter (watchdog) on this channel.
|
|
*
|
|
* Make the watchdog consider the channel not running, so that it won't get
|
|
* recovered even if no progress is detected. Progress is not tracked if the
|
|
* watchdog is turned off.
|
|
*
|
|
* No guarantees are made about concurrent execution of the timeout handler.
|
|
* (This should be called from an update handler running in the same thread
|
|
* with the watchdog.)
|
|
*/
|
|
bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
bool was_running;
|
|
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
was_running = wdt->running;
|
|
wdt->running = false;
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
return was_running;
|
|
}
|
|
|
|
/**
|
|
* Continue a previously stopped timeout
|
|
*
|
|
* Enable the timeout again but don't reinitialize its timer.
|
|
*
|
|
* No guarantees are made about concurrent execution of the timeout handler.
|
|
* (This should be called from an update handler running in the same thread
|
|
* with the watchdog.)
|
|
*/
|
|
void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
wdt->running = true;
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
}
|
|
|
|
/**
|
|
* Reset the counter of a timeout that is in effect.
|
|
*
|
|
* If this channel has an active timeout, act as if something happened on the
|
|
* channel right now.
|
|
*
|
|
* Rewinding a stopped counter is irrelevant; this is a no-op for non-running
|
|
* timeouts. Stopped timeouts can only be started (which is technically a
|
|
* rewind too) or continued (where the stop is actually pause).
|
|
*/
|
|
void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
|
|
struct nvgpu_channel_wdt_state *state)
|
|
{
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
if (wdt->running) {
|
|
nvgpu_channel_wdt_init(wdt, state);
|
|
}
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
}
|
|
|
|
/**
|
|
* Check if the watchdog is running.
|
|
*
|
|
* A running watchdog means one that is requested to run and expire in the
|
|
* future. The state of a running watchdog has to be checked periodically to
|
|
* see if it's expired.
|
|
*/
|
|
bool nvgpu_channel_wdt_running(struct nvgpu_channel_wdt *wdt)
|
|
{
|
|
bool running;
|
|
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
running = wdt->running;
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
|
|
return running;
|
|
}
|
|
|
|
/**
|
|
* Check if a channel has been stuck for the watchdog limit.
|
|
*
|
|
* Test if this channel has really got stuck at this point by checking if its
|
|
* {gp,pb}_get have advanced or not. If progress was detected, start the timer
|
|
* from zero again. If no {gp,pb}_get action happened in the watchdog time
|
|
* limit, return true. Else return false.
|
|
*/
|
|
static bool nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
|
|
struct nvgpu_channel_wdt_state *state)
|
|
{
|
|
struct gk20a *g = wdt->g;
|
|
struct nvgpu_channel_wdt_state previous_state;
|
|
|
|
nvgpu_log_fn(g, " ");
|
|
|
|
/* Get status but keep timer running */
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
previous_state = wdt->ch_state;
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
|
|
if (nvgpu_memcmp((const u8 *)state,
|
|
(const u8 *)&previous_state,
|
|
sizeof(*state)) != 0) {
|
|
/* Channel has advanced, timer keeps going but resets */
|
|
nvgpu_channel_wdt_rewind(wdt, state);
|
|
return false;
|
|
}
|
|
|
|
if (!nvgpu_timeout_peek_expired(&wdt->timer)) {
|
|
/* Seems stuck but waiting to time out */
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Test if the per-channel watchdog is on; check the timeout in that case.
|
|
*
|
|
* Each channel has an expiration time based watchdog. The timer is
|
|
* (re)initialized in two situations: when a new job is submitted on an idle
|
|
* channel and when the timeout is checked but progress is detected. The
|
|
* watchdog timeout limit is a coarse sliding window.
|
|
*
|
|
* The timeout is stopped (disabled) after the last job in a row finishes
|
|
* and marks the channel idle.
|
|
*/
|
|
bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
|
|
struct nvgpu_channel_wdt_state *state)
|
|
{
|
|
bool running;
|
|
|
|
nvgpu_spinlock_acquire(&wdt->lock);
|
|
running = wdt->running;
|
|
nvgpu_spinlock_release(&wdt->lock);
|
|
|
|
if (running) {
|
|
return nvgpu_channel_wdt_handler(wdt, state);
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|