mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: decouple channel watchdog dependencies
The channel code needs the watchdog code and vice versa. Cut this
circular dependency with a few simplifications so that the watchdog
doesn't depend on so much.

When calling watchdog APIs that cause stores or comparisons of channel
progress, provide a snapshot of the current progress instead of a whole
channel pointer. struct nvgpu_channel_wdt_state is added as an
interface for this to track gp_get and pb_get.

When periodically checking the watchdog state, make the channel code
ask whether a hang has been detected and abort the channel from within
channel code instead of asking the watchdog to abort the channel. The
debug dump verbosity flag is also moved back to the channel data.

Move the functionality to restart all channels' watchdogs to channel
code from watchdog code. Looping over active channels is not a good
feature for the watchdog; it's better for the channel handling to just
use the watchdog as a tracking tool.

Move a few unserviceable checks up in the stack to the callers of the
wdt code. They're a kludge, but this will do for now and demonstrates
what needs to be eventually fixed.

This does not leave much code in the watchdog unit. Now the purpose of
the watchdog is only to isolate the logic that couples a timer and
progress snapshots with careful locking to start and stop the tracking.

Jira NVGPU-5582

Change-Id: I7c728542ff30d88b1414500210be3fbaf61e6e8a
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2369820
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
commit e8201d6ce3
parent 281006ae7d
committed by Alex Waterman
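The snapshot interface is the heart of the change. Below is a minimal standalone sketch of the pattern using stub types and names (not the real nvgpu code): the watchdog side only stores and compares opaque progress snapshots and reports whether progress stalled; collecting snapshots and acting on a hang stays on the channel side.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wdt_state {	/* stand-in for nvgpu_channel_wdt_state */
	uint64_t gp_get;
	uint64_t pb_get;
};

struct wdt {
	bool running;
	struct wdt_state saved;	/* snapshot taken when (re)armed */
};

/* The watchdog side: store a snapshot... */
static void wdt_start(struct wdt *w, const struct wdt_state *s)
{
	w->saved = *s;
	w->running = true;
}

/* ...and later report whether it stopped advancing; never abort anything. */
static bool wdt_hung(const struct wdt *w, const struct wdt_state *s)
{
	/* two u64 fields, no padding, so memcmp equals member comparison */
	return w->running && memcmp(s, &w->saved, sizeof(*s)) == 0;
}

int main(void)
{
	struct wdt w = { false, { 0, 0 } };
	struct wdt_state s = { 1, 100 };	/* channel-side snapshot */

	wdt_start(&w, &s);
	s.gp_get = 2;	/* the GPU consumed another gpfifo entry */
	printf("hung: %d\n", wdt_hung(&w, &s));	/* 0: progress detected */
	return 0;
}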
@@ -459,6 +459,114 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 };
 
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump)
+{
+	ch->wdt_debug_dump = dump;
+}
+
+static struct nvgpu_channel_wdt_state nvgpu_channel_collect_wdt_state(
+		struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+	struct nvgpu_channel_wdt_state state = { 0, 0 };
+
+	/*
+	 * Note: just checking for nvgpu_channel_wdt_enabled() is not enough at
+	 * the moment because system suspend puts g->regs away but doesn't stop
+	 * the worker thread that runs the watchdog. This might need to be
+	 * cleared up in the future.
+	 */
+	if (nvgpu_channel_wdt_running(ch->wdt)) {
+		/*
+		 * Read the state only if the wdt is on to avoid unnecessary
+		 * accesses. The kernel mem for userd may not even exist; this
+		 * channel could be in usermode submit mode.
+		 */
+		state.gp_get = g->ops.userd.gp_get(g, ch);
+		state.pb_get = g->ops.userd.pb_get(g, ch);
+	}
+
+	return state;
+}
+
+static void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch)
+{
+	struct nvgpu_channel_wdt_state state =
+		nvgpu_channel_collect_wdt_state(ch);
+
+	/*
+	 * FIXME: channel recovery can race the submit path and can start even
+	 * after this, but this check is the best we can do for now.
+	 */
+	if (!nvgpu_channel_check_unserviceable(ch)) {
+		nvgpu_channel_wdt_start(ch->wdt, &state);
+	}
+}
+
+void nvgpu_channel_restart_all_wdts(struct gk20a *g)
+{
+	struct nvgpu_fifo *f = &g->fifo;
+	u32 chid;
+
+	for (chid = 0; chid < f->num_channels; chid++) {
+		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
+
+		if (ch != NULL) {
+			if ((ch->wdt != NULL) &&
+					!nvgpu_channel_check_unserviceable(ch)) {
+				struct nvgpu_channel_wdt_state state =
+					nvgpu_channel_collect_wdt_state(ch);
+
+				nvgpu_channel_wdt_rewind(ch->wdt, &state);
+			}
+			nvgpu_channel_put(ch);
+		}
+	}
+}
+
+static void nvgpu_channel_recover_from_wdt(struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+
+	nvgpu_log_fn(g, " ");
+
+	if (nvgpu_channel_check_unserviceable(ch)) {
+		/* channel is already recovered */
+		nvgpu_info(g, "chid: %d unserviceable but wdt was ON", ch->chid);
+		return;
+	}
+
+	nvgpu_err(g, "Job on channel %d timed out", ch->chid);
+
+	/* force reset calls gk20a_debug_dump but not this */
+	if (ch->wdt_debug_dump) {
+		gk20a_gr_debug_dump(g);
+	}
+
+#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
+	if (g->ops.tsg.force_reset(ch,
+			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
+			ch->wdt_debug_dump) != 0) {
+		nvgpu_err(g, "failed tsg force reset for chid: %d", ch->chid);
+	}
+#endif
+}
+
+/*
+ * Test the watchdog progress. If the channel is stuck, reset it.
+ *
+ * The gpu is implicitly on at this point because the watchdog can only run on
+ * channels that have submitted jobs pending for cleanup.
+ */
+static void nvgpu_channel_check_wdt(struct nvgpu_channel *ch)
+{
+	struct nvgpu_channel_wdt_state state =
+		nvgpu_channel_collect_wdt_state(ch);
+
+	if (nvgpu_channel_wdt_check(ch->wdt, &state)) {
+		nvgpu_channel_recover_from_wdt(ch);
+	}
+}
+
 static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
 {
 	struct nvgpu_channel_worker *ch_worker =
@@ -486,7 +594,7 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)
 
 		if (ch != NULL) {
 			if (!nvgpu_channel_check_unserviceable(ch)) {
-				nvgpu_channel_wdt_check(ch->wdt, ch);
+				nvgpu_channel_check_wdt(ch);
 			}
 			nvgpu_channel_put(ch);
 		}
@@ -521,6 +629,8 @@ static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
 
 	return ch_worker->watchdog_interval;
 }
+#else
+static void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch) {}
 #endif /* CONFIG_NVGPU_CHANNEL_WDT */
 
 static inline struct nvgpu_channel *
@@ -635,7 +745,7 @@ int nvgpu_channel_add_job(struct nvgpu_channel *c,
 		job->num_mapped_buffers = num_mapped_buffers;
 		job->mapped_buffers = mapped_buffers;
 
-		nvgpu_channel_wdt_start(c->wdt, c);
+		nvgpu_channel_launch_wdt(c);
 
 		nvgpu_channel_joblist_lock(c);
 		nvgpu_channel_joblist_add(c, job);
@@ -1456,11 +1566,12 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
 	ch->unserviceable = true;
 
 #ifdef CONFIG_NVGPU_CHANNEL_WDT
-	ch->wdt = nvgpu_channel_wdt_alloc(ch);
+	ch->wdt = nvgpu_channel_wdt_alloc(g);
 	if (ch->wdt == NULL) {
 		nvgpu_err(g, "wdt alloc failed");
 		goto clean_up;
 	}
+	ch->wdt_debug_dump = true;
 #endif
 
 	ch->obj_class = 0;
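The alloc signature change above is the construction-time half of the decoupling. A toy illustration of the dependency direction, with stand-in declarations rather than the real headers:

/*
 * A constructor that takes the whole channel forces the watchdog unit to
 * know the channel type, closing the dependency cycle. Taking only the
 * device handle keeps the arrow pointing from channel code to watchdog code.
 */
struct gk20a;			/* device handle; both units may depend on it */
struct nvgpu_channel_wdt;	/* opaque to callers */

/* before: the watchdog must see struct nvgpu_channel */
/* struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch); */

/* after: the watchdog sees only the device */
struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g);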
@@ -25,6 +25,7 @@
 #include <nvgpu/watchdog.h>
 #include <nvgpu/error_notifier.h>
+#include <nvgpu/string.h>
 
 struct nvgpu_channel_wdt {
 	struct gk20a *g;
@@ -33,18 +34,15 @@ struct nvgpu_channel_wdt {
 	struct nvgpu_spinlock lock;
 	struct nvgpu_timeout timer;
 	bool running;
-	u32 gp_get;
-	u64 pb_get;
+	struct nvgpu_channel_wdt_state ch_state;
 
 	/* lock not needed */
 	u32 limit_ms;
 	bool enabled;
-	bool debug_dump;
 };
 
-struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch)
+struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g)
 {
-	struct gk20a *g = ch->g;
 	struct nvgpu_channel_wdt *wdt = nvgpu_kzalloc(g, sizeof(*wdt));
 
 	if (wdt == NULL) {
@@ -55,7 +53,6 @@ struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch)
 	nvgpu_spinlock_init(&wdt->lock);
 	wdt->enabled = true;
 	wdt->limit_ms = g->ch_wdt_init_limit_ms;
-	wdt->debug_dump = true;
 
 	return wdt;
 }
@@ -90,22 +87,12 @@ u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt)
 	return wdt->limit_ms;
 }
 
-void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump)
-{
-	wdt->debug_dump = dump;
-}
-
 static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+		struct nvgpu_channel_wdt_state *state)
 {
 	struct gk20a *g = wdt->g;
 	int ret;
 
-	if (nvgpu_channel_check_unserviceable(ch)) {
-		wdt->running = false;
-		return;
-	}
-
 	ret = nvgpu_timeout_init(g, &wdt->timer,
 			wdt->limit_ms,
 			NVGPU_TIMER_CPU_TIMER);
@@ -114,8 +101,7 @@ static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
 		return;
 	}
 
-	wdt->gp_get = g->ops.userd.gp_get(g, ch);
-	wdt->pb_get = g->ops.userd.pb_get(g, ch);
+	wdt->ch_state = *state;
 	wdt->running = true;
 }
@@ -129,13 +115,9 @@ static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt,
  * If the timeout is already running, do nothing. This should be called when
  * new jobs are submitted. The timeout will stop when the last tracked job
  * finishes, making the channel idle.
- *
- * The channel's gpfifo read pointer will be used to determine if the job has
- * actually stuck at that time. After the timeout duration has expired, a
- * worker thread will consider the channel stuck and recover it if stuck.
  */
 void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+		struct nvgpu_channel_wdt_state *state)
 {
 	if (!nvgpu_is_timeouts_enabled(wdt->g)) {
 		return;
@@ -151,7 +133,7 @@ void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
 		nvgpu_spinlock_release(&wdt->lock);
 		return;
 	}
-	nvgpu_channel_wdt_init(wdt, ch);
+	nvgpu_channel_wdt_init(wdt, state);
 	nvgpu_spinlock_release(&wdt->lock);
 }
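The start/rewind pair above has deliberately asymmetric semantics under the lock. A hedged standalone sketch of that state machine, with a pthread mutex standing in for nvgpu_spinlock and a plain counter standing in for nvgpu_timeout:

#include <pthread.h>
#include <stdbool.h>

struct wdt {
	pthread_mutex_t lock;
	bool running;
	unsigned long deadline;	/* stand-in for nvgpu_timeout */
};

static void wdt_arm(struct wdt *w, unsigned long now, unsigned long limit)
{
	w->deadline = now + limit;
	w->running = true;
}

/* start arms an idle watchdog and leaves a running one alone */
static void wdt_start(struct wdt *w, unsigned long now, unsigned long limit)
{
	pthread_mutex_lock(&w->lock);
	if (!w->running)	/* already running: keep the original deadline */
		wdt_arm(w, now, limit);
	pthread_mutex_unlock(&w->lock);
}

/* rewind re-arms a running watchdog and leaves an idle one alone */
static void wdt_rewind(struct wdt *w, unsigned long now, unsigned long limit)
{
	pthread_mutex_lock(&w->lock);
	if (w->running)		/* not running: a stop stays a stop */
		wdt_arm(w, now, limit);
	pthread_mutex_unlock(&w->lock);
}

int main(void)
{
	struct wdt w = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

	wdt_rewind(&w, 0, 100);	/* not running: stays off */
	wdt_start(&w, 0, 100);	/* arms: deadline 100 */
	wdt_start(&w, 50, 100);	/* already running: deadline stays 100 */
	wdt_rewind(&w, 50, 100);	/* running: deadline moves to 150 */
	return 0;
}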
@@ -203,103 +185,69 @@ void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt)
  * timeouts. Stopped timeouts can only be started (which is technically a
  * rewind too) or continued (where the stop is actually pause).
  */
-static void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
 {
 	nvgpu_spinlock_acquire(&wdt->lock);
 	if (wdt->running) {
-		nvgpu_channel_wdt_init(wdt, ch);
+		nvgpu_channel_wdt_init(wdt, state);
 	}
 	nvgpu_spinlock_release(&wdt->lock);
 }
 
 /**
- * Rewind the timeout on each non-dormant channel.
+ * Check if the watchdog is running.
  *
- * Reschedule the timeout of each active channel for which timeouts are running
- * as if something was happened on each channel right now. This should be
- * called when a global hang is detected that could cause a false positive on
- * other innocent channels.
+ * A running watchdog means one that is requested to run and expire in the
+ * future. The state of a running watchdog has to be checked periodically to
+ * see if it's expired.
 */
-void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g)
+bool nvgpu_channel_wdt_running(struct nvgpu_channel_wdt *wdt)
 {
-	struct nvgpu_fifo *f = &g->fifo;
-	u32 chid;
+	bool running;
 
-	for (chid = 0; chid < f->num_channels; chid++) {
-		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
-
-		if (ch != NULL) {
-			if ((ch->wdt != NULL) &&
-			    (!nvgpu_channel_check_unserviceable(ch))) {
-				nvgpu_channel_wdt_rewind(ch->wdt, ch);
-			}
-			nvgpu_channel_put(ch);
-		}
-	}
+	nvgpu_spinlock_acquire(&wdt->lock);
+	running = wdt->running;
+	nvgpu_spinlock_release(&wdt->lock);
+
+	return running;
 }
 
 /**
- * Check if a timed out channel has hung and recover it if it has.
+ * Check if a channel has been stuck for the watchdog limit.
  *
  * Test if this channel has really got stuck at this point by checking if its
- * {gp,pb}_get has advanced or not. If no {gp,pb}_get action happened since
- * when the watchdog was started and it's timed out, force-reset the channel.
- *
- * The gpu is implicitly on at this point, because the watchdog can only run on
- * channels that have submitted jobs pending for cleanup.
+ * {gp,pb}_get have advanced or not. If progress was detected, start the timer
+ * from zero again. If no {gp,pb}_get action happened in the watchdog time
+ * limit, return true. Else return false.
 */
-static void nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+static bool nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
 {
 	struct gk20a *g = wdt->g;
-	u32 gp_get;
-	u32 new_gp_get;
-	u64 pb_get;
-	u64 new_pb_get;
+	struct nvgpu_channel_wdt_state previous_state;
 
 	nvgpu_log_fn(g, " ");
 
-	if (nvgpu_channel_check_unserviceable(ch)) {
-		/* channel is already recovered */
-		if (nvgpu_channel_wdt_stop(wdt) == true) {
-			nvgpu_info(g, "chid: %d unserviceable but wdt was ON",
-					ch->chid);
-		}
-		return;
-	}
-
 	/* Get status but keep timer running */
 	nvgpu_spinlock_acquire(&wdt->lock);
-	gp_get = wdt->gp_get;
-	pb_get = wdt->pb_get;
+	previous_state = wdt->ch_state;
 	nvgpu_spinlock_release(&wdt->lock);
 
-	new_gp_get = g->ops.userd.gp_get(g, ch);
-	new_pb_get = g->ops.userd.pb_get(g, ch);
-
-	if (new_gp_get != gp_get || new_pb_get != pb_get) {
+	if (nvgpu_memcmp((const u8 *)state,
+			(const u8 *)&previous_state,
+			sizeof(*state)) != 0) {
 		/* Channel has advanced, timer keeps going but resets */
-		nvgpu_channel_wdt_rewind(wdt, ch);
-	} else if (!nvgpu_timeout_peek_expired(&wdt->timer)) {
-		/* Seems stuck but waiting to time out */
-	} else {
-		nvgpu_err(g, "Job on channel %d timed out", ch->chid);
-
-		/* force reset calls gk20a_debug_dump but not this */
-		if (wdt->debug_dump) {
-			gk20a_gr_debug_dump(g);
-		}
-
-#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
-		if (g->ops.tsg.force_reset(ch,
-			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
-			wdt->debug_dump) != 0) {
-			nvgpu_err(g, "failed tsg force reset for chid: %d",
-				ch->chid);
-		}
-#endif
+		nvgpu_channel_wdt_rewind(wdt, state);
+		return false;
 	}
+
+	if (!nvgpu_timeout_peek_expired(&wdt->timer)) {
+		/* Seems stuck but waiting to time out */
+		return false;
 	}
+
+	return true;
 }
 
 /**
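The handler above compares snapshots byte-wise with nvgpu_memcmp. That is equivalent to comparing the fields only because nvgpu_channel_wdt_state is two u64 members, so the struct has no padding bytes. A hedged standalone restatement of that invariant, with a stand-in struct name:

#include <stdint.h>
#include <string.h>

/* Stand-in for nvgpu_channel_wdt_state: two u64s, hence no padding. */
struct state {
	uint64_t gp_get;
	uint64_t pb_get;
};

/* If padding existed, its bytes would be indeterminate and memcmp()
 * could report a difference even when both fields compare equal. */
_Static_assert(sizeof(struct state) == 2 * sizeof(uint64_t),
		"struct state must have no padding for memcmp comparison");

static int state_advanced(const struct state *prev, const struct state *cur)
{
	return memcmp(prev, cur, sizeof(*prev)) != 0;
}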
@@ -313,8 +261,8 @@ static void nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt,
  * The timeout is stopped (disabled) after the last job in a row finishes
  * and marks the channel idle.
 */
-void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch)
+bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
 {
 	bool running;
 
@@ -323,6 +271,8 @@ void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
 	nvgpu_spinlock_release(&wdt->lock);
 
 	if (running) {
-		nvgpu_channel_wdt_handler(wdt, ch);
+		return nvgpu_channel_wdt_handler(wdt, state);
+	} else {
+		return false;
 	}
 }
@@ -26,7 +26,6 @@
 #include <nvgpu/engines.h>
 #include <nvgpu/debug.h>
 #include <nvgpu/channel.h>
-#include <nvgpu/watchdog.h>
 #include <nvgpu/tsg.h>
 #include <nvgpu/error_notifier.h>
 #include <nvgpu/nvgpu_err.h>
@@ -71,10 +70,12 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 
 #ifdef CONFIG_NVGPU_RECOVERY
 	/*
-	 * Cancel all channels' wdt since ctxsw timeout might
-	 * trigger multiple watchdogs at a time
+	 * Cancel all channels' wdt since ctxsw timeout causes the runlist to
+	 * get stuck and might falsely trigger multiple watchdogs at a time. We
+	 * won't detect proper wdt timeouts that would have happened, but if
+	 * they're stuck, they will trigger the wdt soon enough again.
 	 */
-	nvgpu_channel_wdt_restart_all_channels(g);
+	nvgpu_channel_restart_all_wdts(g);
 
 	nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump,
 			RC_TYPE_CTXSW_TIMEOUT);
@@ -377,6 +377,7 @@ struct nvgpu_channel {
 
 	/* kernel watchdog to kill stuck jobs */
 	struct nvgpu_channel_wdt *wdt;
+	bool wdt_debug_dump;
 
 	/** Fence allocator in case of deterministic submit. */
 	struct nvgpu_allocator fence_allocator;
@@ -1161,4 +1162,27 @@ int nvgpu_channel_deferred_reset_engines(struct gk20a *g,
 		struct nvgpu_channel *ch);
 #endif
 
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+/**
+ * @brief Rewind the timeout on each non-dormant channel.
+ *
+ * Reschedule the timeout of each active channel for which timeouts are running
+ * as if something had happened on each channel right now. This should be
+ * called when a global hang is detected that could cause a false positive on
+ * other innocent channels.
+ */
+void nvgpu_channel_restart_all_wdts(struct gk20a *g);
+
+/**
+ * @brief Enable or disable full debug dump on wdt error.
+ *
+ * Set the policy on whether or not to do the verbose channel and gr debug dump
+ * when the channel gets recovered as a result of a watchdog timeout.
+ */
+void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump);
+#else
+static inline void nvgpu_channel_restart_all_wdts(struct gk20a *g) {}
+static inline void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch,
+		bool dump) {}
+#endif
+
 #endif
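For completeness, a hypothetical caller sketch for the two entry points declared above; everything here other than the two nvgpu_channel_* prototypes is invented for illustration.

#include <stdbool.h>

struct gk20a;
struct nvgpu_channel;

/* Prototypes as declared in the hunk above. */
void nvgpu_channel_restart_all_wdts(struct gk20a *g);
void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump);

/* On a global stall (e.g. ctxsw timeout), every channel would look stuck;
 * rewinding all running watchdogs avoids falsely recovering innocent
 * channels. (Hypothetical handler name.) */
static void example_global_hang_handler(struct gk20a *g)
{
	nvgpu_channel_restart_all_wdts(g);
}

/* Opt a channel out of the verbose gr debug dump on wdt recovery.
 * (Hypothetical setup function name.) */
static void example_quiet_channel_setup(struct nvgpu_channel *ch)
{
	nvgpu_channel_set_wdt_debug_dump(ch, false);
}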
@@ -23,14 +23,19 @@
 #ifndef NVGPU_WATCHDOG_H
 #define NVGPU_WATCHDOG_H
 
-#ifdef CONFIG_NVGPU_CHANNEL_WDT
+#include <nvgpu/types.h>
 
 struct gk20a;
 struct nvgpu_channel;
 struct nvgpu_worker;
 struct nvgpu_channel_wdt;
 
-struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch);
+struct nvgpu_channel_wdt_state {
+	u64 gp_get;
+	u64 pb_get;
+};
+
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+
+struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g);
 void nvgpu_channel_wdt_destroy(struct nvgpu_channel_wdt *wdt);
 
 void nvgpu_channel_wdt_enable(struct nvgpu_channel_wdt *wdt);
@@ -39,21 +44,21 @@ bool nvgpu_channel_wdt_enabled(struct nvgpu_channel_wdt *wdt);
 
 void nvgpu_channel_wdt_set_limit(struct nvgpu_channel_wdt *wdt, u32 limit_ms);
 u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt);
-void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump);
 
 void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch);
-void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt);
+		struct nvgpu_channel_wdt_state *state);
 bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt);
-void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch);
-
-void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g);
+void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt);
+void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state);
+bool nvgpu_channel_wdt_running(struct nvgpu_channel_wdt *wdt);
+bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state);
 
 #else /* CONFIG_NVGPU_CHANNEL_WDT */
 
 static inline struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(
-		struct nvgpu_channel *ch)
+		struct gk20a *g)
 {
 	return NULL;
 }
@@ -71,21 +76,19 @@ static inline u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt)
 {
 	return 0U;
 }
-static inline void nvgpu_channel_wdt_set_debug_dump(
-		struct nvgpu_channel_wdt *wdt,
-		bool dump) {}
 
 static inline void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch) {}
-static inline void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) {}
+		struct nvgpu_channel_wdt_state *state) {}
 static inline bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt)
 {
 	return false;
 }
-static inline void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch) {}
-
-static inline void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g) {}
+static inline void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) {}
+static inline void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state) {}
+static inline bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state)
+{
+	return false;
+}
 
 #endif /* CONFIG_NVGPU_CHANNEL_WDT */
@@ -322,7 +322,7 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
 	if (set_timeout)
 		nvgpu_channel_wdt_set_limit(ch->wdt, args->timeout_ms);
 
-	nvgpu_channel_wdt_set_debug_dump(ch->wdt, !disable_dump);
+	nvgpu_channel_set_wdt_debug_dump(ch, !disable_dump);
 
 	return 0;
 #else