mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: move wdt code out of channel.c
Cut and paste the existing channel watchdog functions to another file for better isolation of units. Jira NVGPU-5494 Change-Id: Id437f0939e69a4a8b495eaee164c4d7a9f283fa9 Signed-off-by: Konsta Hölttä <kholtta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2345934 Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
Alex Waterman
parent
22987182a3
commit
21e02878f4
@@ -330,7 +330,9 @@ fifo:
|
|||||||
channel:
|
channel:
|
||||||
safe: yes
|
safe: yes
|
||||||
sources: [ common/fifo/channel.c,
|
sources: [ common/fifo/channel.c,
|
||||||
|
common/fifo/watchdog.c,
|
||||||
include/nvgpu/channel.h,
|
include/nvgpu/channel.h,
|
||||||
|
include/nvgpu/watchdog.h,
|
||||||
include/nvgpu/gops_channel.h,
|
include/nvgpu/gops_channel.h,
|
||||||
include/nvgpu/gops_ramfc.h,
|
include/nvgpu/gops_ramfc.h,
|
||||||
include/nvgpu/gops_ramin.h,
|
include/nvgpu/gops_ramin.h,
|
||||||
|
|||||||
@@ -535,6 +535,7 @@ nvgpu-y += \
|
|||||||
common/fifo/engines.o \
|
common/fifo/engines.o \
|
||||||
common/fifo/pbdma_status.o \
|
common/fifo/pbdma_status.o \
|
||||||
common/fifo/userd.o \
|
common/fifo/userd.o \
|
||||||
|
common/fifo/watchdog.o \
|
||||||
common/fence/fence.o \
|
common/fence/fence.o \
|
||||||
common/ecc.o \
|
common/ecc.o \
|
||||||
common/log_common.o \
|
common/log_common.o \
|
||||||
|
|||||||
@@ -243,6 +243,7 @@ CONFIG_NVGPU_SW_SEMAPHORE := 1
|
|||||||
NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_SW_SEMAPHORE
|
NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_SW_SEMAPHORE
|
||||||
|
|
||||||
# Enable Channel WDT for safety build until we switch to user mode submits only
|
# Enable Channel WDT for safety build until we switch to user mode submits only
|
||||||
|
CONFIG_NVGPU_CHANNEL_WDT := 1
|
||||||
NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_CHANNEL_WDT
|
NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_CHANNEL_WDT
|
||||||
|
|
||||||
# Enable Kernel Mode submit for safety build until we switch to user mode
|
# Enable Kernel Mode submit for safety build until we switch to user mode
|
||||||
|
|||||||
@@ -396,6 +396,10 @@ srcs += common/fifo/submit.c \
|
|||||||
common/sync/channel_sync_syncpt.c
|
common/sync/channel_sync_syncpt.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CONFIG_NVGPU_CHANNEL_WDT),1)
|
||||||
|
srcs += common/fifo/watchdog.c
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CONFIG_NVGPU_SW_SEMAPHORE),1)
|
ifeq ($(CONFIG_NVGPU_SW_SEMAPHORE),1)
|
||||||
srcs += common/semaphore/semaphore_sea.c \
|
srcs += common/semaphore/semaphore_sea.c \
|
||||||
common/semaphore/semaphore_pool.c \
|
common/semaphore/semaphore_pool.c \
|
||||||
|
|||||||
@@ -54,6 +54,7 @@
|
|||||||
#include <nvgpu/channel_sync_semaphore.h>
|
#include <nvgpu/channel_sync_semaphore.h>
|
||||||
#include <nvgpu/channel_user_syncpt.h>
|
#include <nvgpu/channel_user_syncpt.h>
|
||||||
#include <nvgpu/runlist.h>
|
#include <nvgpu/runlist.h>
|
||||||
|
#include <nvgpu/watchdog.h>
|
||||||
#include <nvgpu/fifo/userd.h>
|
#include <nvgpu/fifo/userd.h>
|
||||||
#include <nvgpu/nvhost.h>
|
#include <nvgpu/nvhost.h>
|
||||||
#include <nvgpu/fence.h>
|
#include <nvgpu/fence.h>
|
||||||
@@ -454,257 +455,6 @@ u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch)
|
|||||||
return nvgpu_channel_get_gpfifo_free_count(ch);
|
return nvgpu_channel_get_gpfifo_free_count(ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_NVGPU_CHANNEL_WDT
|
|
||||||
|
|
||||||
static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
struct gk20a *g = ch->g;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
if (nvgpu_channel_check_unserviceable(ch)) {
|
|
||||||
ch->wdt.running = false;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = nvgpu_timeout_init(g, &ch->wdt.timer,
|
|
||||||
ch->wdt.limit_ms,
|
|
||||||
NVGPU_TIMER_CPU_TIMER);
|
|
||||||
if (ret != 0) {
|
|
||||||
nvgpu_err(g, "timeout_init failed: %d", ret);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ch->wdt.gp_get = g->ops.userd.gp_get(g, ch);
|
|
||||||
ch->wdt.pb_get = g->ops.userd.pb_get(g, ch);
|
|
||||||
ch->wdt.running = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Start a timeout counter (watchdog) on this channel.
|
|
||||||
*
|
|
||||||
* Trigger a watchdog to recover the channel after the per-platform timeout
|
|
||||||
* duration (but strictly no earlier) if the channel hasn't advanced within
|
|
||||||
* that time.
|
|
||||||
*
|
|
||||||
* If the timeout is already running, do nothing. This should be called when
|
|
||||||
* new jobs are submitted. The timeout will stop when the last tracked job
|
|
||||||
* finishes, making the channel idle.
|
|
||||||
*
|
|
||||||
* The channel's gpfifo read pointer will be used to determine if the job is
|
|
||||||
* actually stuck at that time. After the timeout duration has expired, a
|
|
||||||
* worker thread will consider the channel stuck and recover it if stuck.
|
|
||||||
*/
|
|
||||||
static void nvgpu_channel_wdt_start(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
if (!nvgpu_is_timeouts_enabled(ch->g)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!ch->wdt.enabled) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
|
||||||
|
|
||||||
if (ch->wdt.running) {
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
nvgpu_channel_wdt_init(ch);
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Stop a running timeout counter (watchdog) on this channel.
|
|
||||||
*
|
|
||||||
* Make the watchdog consider the channel not running, so that it won't get
|
|
||||||
* recovered even if no progress is detected. Progress is not tracked if the
|
|
||||||
* watchdog is turned off.
|
|
||||||
*
|
|
||||||
* No guarantees are made about concurrent execution of the timeout handler.
|
|
||||||
* (This should be called from an update handler running in the same thread
|
|
||||||
* with the watchdog.)
|
|
||||||
*/
|
|
||||||
static bool nvgpu_channel_wdt_stop(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
bool was_running;
|
|
||||||
|
|
||||||
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
|
||||||
was_running = ch->wdt.running;
|
|
||||||
ch->wdt.running = false;
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
return was_running;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Continue a previously stopped timeout
|
|
||||||
*
|
|
||||||
* Enable the timeout again but don't reinitialize its timer.
|
|
||||||
*
|
|
||||||
* No guarantees are made about concurrent execution of the timeout handler.
|
|
||||||
* (This should be called from an update handler running in the same thread
|
|
||||||
* with the watchdog.)
|
|
||||||
*/
|
|
||||||
static void nvgpu_channel_wdt_continue(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
|
||||||
ch->wdt.running = true;
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reset the counter of a timeout that is in effect.
|
|
||||||
*
|
|
||||||
* If this channel has an active timeout, act as if something happened on the
|
|
||||||
* channel right now.
|
|
||||||
*
|
|
||||||
* Rewinding a stopped counter is irrelevant; this is a no-op for non-running
|
|
||||||
* timeouts. Stopped timeouts can only be started (which is technically a
|
|
||||||
* rewind too) or continued (where the stop is actually pause).
|
|
||||||
*/
|
|
||||||
static void nvgpu_channel_wdt_rewind(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
|
||||||
if (ch->wdt.running) {
|
|
||||||
nvgpu_channel_wdt_init(ch);
|
|
||||||
}
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Rewind the timeout on each non-dormant channel.
|
|
||||||
*
|
|
||||||
* Reschedule the timeout of each active channel for which timeouts are running
|
|
||||||
* as if something had happened on each channel right now. This should be
|
|
||||||
* called when a global hang is detected that could cause a false positive on
|
|
||||||
* other innocent channels.
|
|
||||||
*/
|
|
||||||
void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g)
|
|
||||||
{
|
|
||||||
struct nvgpu_fifo *f = &g->fifo;
|
|
||||||
u32 chid;
|
|
||||||
|
|
||||||
for (chid = 0; chid < f->num_channels; chid++) {
|
|
||||||
struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
|
|
||||||
|
|
||||||
if (ch != NULL) {
|
|
||||||
if (!nvgpu_channel_check_unserviceable(ch)) {
|
|
||||||
nvgpu_channel_wdt_rewind(ch);
|
|
||||||
}
|
|
||||||
nvgpu_channel_put(ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check if a timed out channel has hung and recover it if it has.
|
|
||||||
*
|
|
||||||
* Test if this channel has really got stuck at this point by checking if its
|
|
||||||
* {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
|
|
||||||
* when the watchdog was started and it's timed out, force-reset the channel.
|
|
||||||
*
|
|
||||||
* The gpu is implicitly on at this point, because the watchdog can only run on
|
|
||||||
* channels that have submitted jobs pending for cleanup.
|
|
||||||
*/
|
|
||||||
static void nvgpu_channel_wdt_handler(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
struct gk20a *g = ch->g;
|
|
||||||
u32 gp_get;
|
|
||||||
u32 new_gp_get;
|
|
||||||
u64 pb_get;
|
|
||||||
u64 new_pb_get;
|
|
||||||
|
|
||||||
nvgpu_log_fn(g, " ");
|
|
||||||
|
|
||||||
if (nvgpu_channel_check_unserviceable(ch)) {
|
|
||||||
/* channel is already recovered */
|
|
||||||
if (nvgpu_channel_wdt_stop(ch) == true) {
|
|
||||||
nvgpu_info(g, "chid: %d unserviceable but wdt was ON",
|
|
||||||
ch->chid);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Get status but keep timer running */
|
|
||||||
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
|
||||||
gp_get = ch->wdt.gp_get;
|
|
||||||
pb_get = ch->wdt.pb_get;
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
|
|
||||||
new_gp_get = g->ops.userd.gp_get(g, ch);
|
|
||||||
new_pb_get = g->ops.userd.pb_get(g, ch);
|
|
||||||
|
|
||||||
if (new_gp_get != gp_get || new_pb_get != pb_get) {
|
|
||||||
/* Channel has advanced, timer keeps going but resets */
|
|
||||||
nvgpu_channel_wdt_rewind(ch);
|
|
||||||
} else if (!nvgpu_timeout_peek_expired(&ch->wdt.timer)) {
|
|
||||||
/* Seems stuck but waiting to time out */
|
|
||||||
} else {
|
|
||||||
nvgpu_err(g, "Job on channel %d timed out",
|
|
||||||
ch->chid);
|
|
||||||
|
|
||||||
/* force reset calls gk20a_debug_dump but not this */
|
|
||||||
if (ch->wdt.debug_dump) {
|
|
||||||
gk20a_gr_debug_dump(g);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
|
|
||||||
if (g->ops.tsg.force_reset(ch,
|
|
||||||
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
|
|
||||||
ch->wdt.debug_dump) != 0) {
|
|
||||||
nvgpu_err(g, "failed tsg force reset for chid: %d",
|
|
||||||
ch->chid);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test if the per-channel watchdog is on; check the timeout in that case.
|
|
||||||
*
|
|
||||||
* Each channel has an expiration time based watchdog. The timer is
|
|
||||||
* (re)initialized in two situations: when a new job is submitted on an idle
|
|
||||||
* channel and when the timeout is checked but progress is detected. The
|
|
||||||
* watchdog timeout limit is a coarse sliding window.
|
|
||||||
*
|
|
||||||
* The timeout is stopped (disabled) after the last job in a row finishes
|
|
||||||
* and marks the channel idle.
|
|
||||||
*/
|
|
||||||
static void nvgpu_channel_wdt_check(struct nvgpu_channel *ch)
|
|
||||||
{
|
|
||||||
bool running;
|
|
||||||
|
|
||||||
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
|
||||||
running = ch->wdt.running;
|
|
||||||
nvgpu_spinlock_release(&ch->wdt.lock);
|
|
||||||
|
|
||||||
if (running) {
|
|
||||||
nvgpu_channel_wdt_handler(ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Loop every living channel, check timeouts and handle stuck channels.
|
|
||||||
*/
|
|
||||||
static void nvgpu_channel_poll_wdt(struct gk20a *g)
|
|
||||||
{
|
|
||||||
unsigned int chid;
|
|
||||||
|
|
||||||
|
|
||||||
for (chid = 0; chid < g->fifo.num_channels; chid++) {
|
|
||||||
struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
|
|
||||||
|
|
||||||
if (ch != NULL) {
|
|
||||||
if (!nvgpu_channel_check_unserviceable(ch)) {
|
|
||||||
nvgpu_channel_wdt_check(ch);
|
|
||||||
}
|
|
||||||
nvgpu_channel_put(ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* CONFIG_NVGPU_CHANNEL_WDT */
|
|
||||||
|
|
||||||
static inline struct nvgpu_channel_worker *
|
static inline struct nvgpu_channel_worker *
|
||||||
nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
|
nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
|
||||||
{
|
{
|
||||||
@@ -713,7 +463,6 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
|
|||||||
};
|
};
|
||||||
|
|
||||||
#ifdef CONFIG_NVGPU_CHANNEL_WDT
|
#ifdef CONFIG_NVGPU_CHANNEL_WDT
|
||||||
|
|
||||||
static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
|
static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
|
||||||
{
|
{
|
||||||
struct nvgpu_channel_worker *ch_worker =
|
struct nvgpu_channel_worker *ch_worker =
|
||||||
@@ -729,6 +478,25 @@ static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loop every living channel, check timeouts and handle stuck channels.
|
||||||
|
*/
|
||||||
|
static void nvgpu_channel_poll_wdt(struct gk20a *g)
|
||||||
|
{
|
||||||
|
unsigned int chid;
|
||||||
|
|
||||||
|
for (chid = 0; chid < g->fifo.num_channels; chid++) {
|
||||||
|
struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
|
||||||
|
|
||||||
|
if (ch != NULL) {
|
||||||
|
if (!nvgpu_channel_check_unserviceable(ch)) {
|
||||||
|
nvgpu_channel_wdt_check(ch);
|
||||||
|
}
|
||||||
|
nvgpu_channel_put(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void nvgpu_channel_worker_poll_wakeup_post_process_item(
|
static void nvgpu_channel_worker_poll_wakeup_post_process_item(
|
||||||
struct nvgpu_worker *worker)
|
struct nvgpu_worker *worker)
|
||||||
{
|
{
|
||||||
@@ -757,9 +525,15 @@ static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
|
|||||||
|
|
||||||
return ch_worker->watchdog_interval;
|
return ch_worker->watchdog_interval;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* CONFIG_NVGPU_CHANNEL_WDT */
|
#endif /* CONFIG_NVGPU_CHANNEL_WDT */
|
||||||
|
|
||||||
|
static inline struct nvgpu_channel *
|
||||||
|
nvgpu_channel_from_worker_item(struct nvgpu_list_node *node)
|
||||||
|
{
|
||||||
|
return (struct nvgpu_channel *)
|
||||||
|
((uintptr_t)node - offsetof(struct nvgpu_channel, worker_item));
|
||||||
|
};
|
||||||
|
|
||||||
static void nvgpu_channel_worker_poll_wakeup_process_item(
|
static void nvgpu_channel_worker_poll_wakeup_process_item(
|
||||||
struct nvgpu_list_node *work_item)
|
struct nvgpu_list_node *work_item)
|
||||||
{
|
{
|
||||||
|
|||||||
253
drivers/gpu/nvgpu/common/fifo/watchdog.c
Normal file
253
drivers/gpu/nvgpu/common/fifo/watchdog.c
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <nvgpu/gk20a.h>
|
||||||
|
#include <nvgpu/channel.h>
|
||||||
|
#include <nvgpu/error_notifier.h>
|
||||||
|
#include <nvgpu/watchdog.h>
|
||||||
|
|
||||||
|
static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
struct gk20a *g = ch->g;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
if (nvgpu_channel_check_unserviceable(ch)) {
|
||||||
|
ch->wdt.running = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = nvgpu_timeout_init(g, &ch->wdt.timer,
|
||||||
|
ch->wdt.limit_ms,
|
||||||
|
NVGPU_TIMER_CPU_TIMER);
|
||||||
|
if (ret != 0) {
|
||||||
|
nvgpu_err(g, "timeout_init failed: %d", ret);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ch->wdt.gp_get = g->ops.userd.gp_get(g, ch);
|
||||||
|
ch->wdt.pb_get = g->ops.userd.pb_get(g, ch);
|
||||||
|
ch->wdt.running = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Start a timeout counter (watchdog) on this channel.
|
||||||
|
*
|
||||||
|
* Trigger a watchdog to recover the channel after the per-platform timeout
|
||||||
|
* duration (but strictly no earlier) if the channel hasn't advanced within
|
||||||
|
* that time.
|
||||||
|
*
|
||||||
|
* If the timeout is already running, do nothing. This should be called when
|
||||||
|
* new jobs are submitted. The timeout will stop when the last tracked job
|
||||||
|
* finishes, making the channel idle.
|
||||||
|
*
|
||||||
|
* The channel's gpfifo read pointer will be used to determine if the job is
|
||||||
|
* actually stuck at that time. After the timeout duration has expired, a
|
||||||
|
* worker thread will consider the channel stuck and recover it if stuck.
|
||||||
|
*/
|
||||||
|
void nvgpu_channel_wdt_start(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
if (!nvgpu_is_timeouts_enabled(ch->g)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ch->wdt.enabled) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
||||||
|
|
||||||
|
if (ch->wdt.running) {
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
nvgpu_channel_wdt_init(ch);
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stop a running timeout counter (watchdog) on this channel.
|
||||||
|
*
|
||||||
|
* Make the watchdog consider the channel not running, so that it won't get
|
||||||
|
* recovered even if no progress is detected. Progress is not tracked if the
|
||||||
|
* watchdog is turned off.
|
||||||
|
*
|
||||||
|
* No guarantees are made about concurrent execution of the timeout handler.
|
||||||
|
* (This should be called from an update handler running in the same thread
|
||||||
|
* with the watchdog.)
|
||||||
|
*/
|
||||||
|
bool nvgpu_channel_wdt_stop(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
bool was_running;
|
||||||
|
|
||||||
|
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
||||||
|
was_running = ch->wdt.running;
|
||||||
|
ch->wdt.running = false;
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
return was_running;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Continue a previously stopped timeout
|
||||||
|
*
|
||||||
|
* Enable the timeout again but don't reinitialize its timer.
|
||||||
|
*
|
||||||
|
* No guarantees are made about concurrent execution of the timeout handler.
|
||||||
|
* (This should be called from an update handler running in the same thread
|
||||||
|
* with the watchdog.)
|
||||||
|
*/
|
||||||
|
void nvgpu_channel_wdt_continue(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
||||||
|
ch->wdt.running = true;
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset the counter of a timeout that is in effect.
|
||||||
|
*
|
||||||
|
* If this channel has an active timeout, act as if something happened on the
|
||||||
|
* channel right now.
|
||||||
|
*
|
||||||
|
* Rewinding a stopped counter is irrelevant; this is a no-op for non-running
|
||||||
|
* timeouts. Stopped timeouts can only be started (which is technically a
|
||||||
|
* rewind too) or continued (where the stop is actually pause).
|
||||||
|
*/
|
||||||
|
static void nvgpu_channel_wdt_rewind(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
||||||
|
if (ch->wdt.running) {
|
||||||
|
nvgpu_channel_wdt_init(ch);
|
||||||
|
}
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rewind the timeout on each non-dormant channel.
|
||||||
|
*
|
||||||
|
* Reschedule the timeout of each active channel for which timeouts are running
|
||||||
|
* as if something had happened on each channel right now. This should be
|
||||||
|
* called when a global hang is detected that could cause a false positive on
|
||||||
|
* other innocent channels.
|
||||||
|
*/
|
||||||
|
void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g)
|
||||||
|
{
|
||||||
|
struct nvgpu_fifo *f = &g->fifo;
|
||||||
|
u32 chid;
|
||||||
|
|
||||||
|
for (chid = 0; chid < f->num_channels; chid++) {
|
||||||
|
struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
|
||||||
|
|
||||||
|
if (ch != NULL) {
|
||||||
|
if (!nvgpu_channel_check_unserviceable(ch)) {
|
||||||
|
nvgpu_channel_wdt_rewind(ch);
|
||||||
|
}
|
||||||
|
nvgpu_channel_put(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a timed out channel has hung and recover it if it has.
|
||||||
|
*
|
||||||
|
* Test if this channel has really got stuck at this point by checking if its
|
||||||
|
* {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
|
||||||
|
* when the watchdog was started and it's timed out, force-reset the channel.
|
||||||
|
*
|
||||||
|
* The gpu is implicitly on at this point, because the watchdog can only run on
|
||||||
|
* channels that have submitted jobs pending for cleanup.
|
||||||
|
*/
|
||||||
|
static void nvgpu_channel_wdt_handler(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
struct gk20a *g = ch->g;
|
||||||
|
u32 gp_get;
|
||||||
|
u32 new_gp_get;
|
||||||
|
u64 pb_get;
|
||||||
|
u64 new_pb_get;
|
||||||
|
|
||||||
|
nvgpu_log_fn(g, " ");
|
||||||
|
|
||||||
|
if (nvgpu_channel_check_unserviceable(ch)) {
|
||||||
|
/* channel is already recovered */
|
||||||
|
if (nvgpu_channel_wdt_stop(ch) == true) {
|
||||||
|
nvgpu_info(g, "chid: %d unserviceable but wdt was ON",
|
||||||
|
ch->chid);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get status but keep timer running */
|
||||||
|
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
||||||
|
gp_get = ch->wdt.gp_get;
|
||||||
|
pb_get = ch->wdt.pb_get;
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
|
||||||
|
new_gp_get = g->ops.userd.gp_get(g, ch);
|
||||||
|
new_pb_get = g->ops.userd.pb_get(g, ch);
|
||||||
|
|
||||||
|
if (new_gp_get != gp_get || new_pb_get != pb_get) {
|
||||||
|
/* Channel has advanced, timer keeps going but resets */
|
||||||
|
nvgpu_channel_wdt_rewind(ch);
|
||||||
|
} else if (!nvgpu_timeout_peek_expired(&ch->wdt.timer)) {
|
||||||
|
/* Seems stuck but waiting to time out */
|
||||||
|
} else {
|
||||||
|
nvgpu_err(g, "Job on channel %d timed out",
|
||||||
|
ch->chid);
|
||||||
|
|
||||||
|
/* force reset calls gk20a_debug_dump but not this */
|
||||||
|
if (ch->wdt.debug_dump) {
|
||||||
|
gk20a_gr_debug_dump(g);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
|
||||||
|
if (g->ops.tsg.force_reset(ch,
|
||||||
|
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
|
||||||
|
ch->wdt.debug_dump) != 0) {
|
||||||
|
nvgpu_err(g, "failed tsg force reset for chid: %d",
|
||||||
|
ch->chid);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test if the per-channel watchdog is on; check the timeout in that case.
|
||||||
|
*
|
||||||
|
* Each channel has an expiration time based watchdog. The timer is
|
||||||
|
* (re)initialized in two situations: when a new job is submitted on an idle
|
||||||
|
* channel and when the timeout is checked but progress is detected. The
|
||||||
|
* watchdog timeout limit is a coarse sliding window.
|
||||||
|
*
|
||||||
|
* The timeout is stopped (disabled) after the last job in a row finishes
|
||||||
|
* and marks the channel idle.
|
||||||
|
*/
|
||||||
|
void nvgpu_channel_wdt_check(struct nvgpu_channel *ch)
|
||||||
|
{
|
||||||
|
bool running;
|
||||||
|
|
||||||
|
nvgpu_spinlock_acquire(&ch->wdt.lock);
|
||||||
|
running = ch->wdt.running;
|
||||||
|
nvgpu_spinlock_release(&ch->wdt.lock);
|
||||||
|
|
||||||
|
if (running) {
|
||||||
|
nvgpu_channel_wdt_handler(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -26,6 +26,7 @@
|
|||||||
#include <nvgpu/engines.h>
|
#include <nvgpu/engines.h>
|
||||||
#include <nvgpu/debug.h>
|
#include <nvgpu/debug.h>
|
||||||
#include <nvgpu/channel.h>
|
#include <nvgpu/channel.h>
|
||||||
|
#include <nvgpu/watchdog.h>
|
||||||
#include <nvgpu/tsg.h>
|
#include <nvgpu/tsg.h>
|
||||||
#include <nvgpu/error_notifier.h>
|
#include <nvgpu/error_notifier.h>
|
||||||
#include <nvgpu/nvgpu_err.h>
|
#include <nvgpu/nvgpu_err.h>
|
||||||
|
|||||||
@@ -578,12 +578,6 @@ struct nvgpu_channel {
|
|||||||
|
|
||||||
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
|
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
|
||||||
|
|
||||||
static inline struct nvgpu_channel *
|
|
||||||
nvgpu_channel_from_worker_item(struct nvgpu_list_node *node)
|
|
||||||
{
|
|
||||||
return (struct nvgpu_channel *)
|
|
||||||
((uintptr_t)node - offsetof(struct nvgpu_channel, worker_item));
|
|
||||||
};
|
|
||||||
int nvgpu_channel_worker_init(struct gk20a *g);
|
int nvgpu_channel_worker_init(struct gk20a *g);
|
||||||
void nvgpu_channel_worker_deinit(struct gk20a *g);
|
void nvgpu_channel_worker_deinit(struct gk20a *g);
|
||||||
void nvgpu_channel_update(struct nvgpu_channel *c);
|
void nvgpu_channel_update(struct nvgpu_channel *c);
|
||||||
@@ -617,10 +611,6 @@ bool nvgpu_channel_is_prealloc_enabled(struct nvgpu_channel *c);
|
|||||||
bool nvgpu_channel_update_and_check_ctxsw_timeout(struct nvgpu_channel *ch,
|
bool nvgpu_channel_update_and_check_ctxsw_timeout(struct nvgpu_channel *ch,
|
||||||
u32 timeout_delta_ms, bool *progress);
|
u32 timeout_delta_ms, bool *progress);
|
||||||
|
|
||||||
#ifdef CONFIG_NVGPU_CHANNEL_WDT
|
|
||||||
void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* CONFIG_NVGPU_KERNEL_MODE_SUBMIT */
|
#endif /* CONFIG_NVGPU_KERNEL_MODE_SUBMIT */
|
||||||
|
|
||||||
static inline bool nvgpu_channel_is_deterministic(struct nvgpu_channel *c)
|
static inline bool nvgpu_channel_is_deterministic(struct nvgpu_channel *c)
|
||||||
|
|||||||
37
drivers/gpu/nvgpu/include/nvgpu/watchdog.h
Normal file
37
drivers/gpu/nvgpu/include/nvgpu/watchdog.h
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NVGPU_WATCHDOG_H
|
||||||
|
#define NVGPU_WATCHDOG_H
|
||||||
|
|
||||||
|
struct gk20a;
|
||||||
|
struct nvgpu_channel;
|
||||||
|
struct nvgpu_worker;
|
||||||
|
|
||||||
|
void nvgpu_channel_wdt_start(struct nvgpu_channel *ch);
|
||||||
|
void nvgpu_channel_wdt_continue(struct nvgpu_channel *ch);
|
||||||
|
bool nvgpu_channel_wdt_stop(struct nvgpu_channel *ch);
|
||||||
|
void nvgpu_channel_wdt_check(struct nvgpu_channel *ch);
|
||||||
|
|
||||||
|
void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g);
|
||||||
|
|
||||||
|
#endif
|
||||||
Reference in New Issue
Block a user