mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: support per-channel wdt timeouts
Replace the padding in nvgpu_channel_wdt_args with a timeout value in milliseconds, and add NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT to signify the existence of this new field. When the new flag is included in the value of wdt_status, the field is used to set a per-channel timeout to override the per-GPU default. Add NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP to disable the long debug dump when a timed out channel gets recovered by the watchdog. Printing the dump to the serial console can easily take several seconds. (Note that there is also a separate NVGPU_TIMEOUT_FLAG_DISABLE_DUMP flag, which controls the dump for the ctxsw timeout and is used with NVGPU_IOCTL_CHANNEL_SET_TIMEOUT_EX.) The behaviour of NVGPU_IOCTL_CHANNEL_WDT is changed so that either NVGPU_IOCTL_CHANNEL_ENABLE_WDT or NVGPU_IOCTL_CHANNEL_DISABLE_WDT has to be set. The old behaviour was that other values were silently ignored. The usage of the global default debugfs-controlled ch_wdt_timeout_ms is changed so that its value takes effect only for newly opened channels instead of in real time. Also, a zero value no longer means that the watchdog is disabled; there is a separate flag for that after all. gk20a_fifo_recover_tsg used to ignore the value of "verbose" when no engines were found. Correct this. Bug 1982826 Bug 1985845 Jira NVGPU-73 Change-Id: Iea6213a646a66cb7c631ed7d7c91d8c2ba8a92a4 Signed-off-by: Konsta Holtta <kholtta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1510898 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
4f9368522e
commit
cb6ed949e2
@@ -753,7 +753,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
|
|||||||
*/
|
*/
|
||||||
need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
|
need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
|
||||||
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
|
(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
|
||||||
c->wdt_enabled ||
|
c->timeout.enabled ||
|
||||||
(g->can_railgate && !c->deterministic) ||
|
(g->can_railgate && !c->deterministic) ||
|
||||||
!skip_buffer_refcounting;
|
!skip_buffer_refcounting;
|
||||||
|
|
||||||
@@ -791,7 +791,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
|
|||||||
*/
|
*/
|
||||||
need_deferred_cleanup = !c->deterministic ||
|
need_deferred_cleanup = !c->deterministic ||
|
||||||
need_sync_framework ||
|
need_sync_framework ||
|
||||||
c->wdt_enabled ||
|
c->timeout.enabled ||
|
||||||
(g->can_railgate &&
|
(g->can_railgate &&
|
||||||
!c->deterministic) ||
|
!c->deterministic) ||
|
||||||
!skip_buffer_refcounting;
|
!skip_buffer_refcounting;
|
||||||
|
|||||||
@@ -319,10 +319,21 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch,
|
|||||||
static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
|
static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
|
||||||
struct nvgpu_channel_wdt_args *args)
|
struct nvgpu_channel_wdt_args *args)
|
||||||
{
|
{
|
||||||
if (args->wdt_status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
|
u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
|
||||||
ch->wdt_enabled = false;
|
NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
|
||||||
else if (args->wdt_status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
|
|
||||||
ch->wdt_enabled = true;
|
if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT)
|
||||||
|
ch->timeout.enabled = false;
|
||||||
|
else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT)
|
||||||
|
ch->timeout.enabled = true;
|
||||||
|
else
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT)
|
||||||
|
ch->timeout.limit_ms = args->timeout_ms;
|
||||||
|
|
||||||
|
ch->timeout.debug_dump = (args->wdt_status &
|
||||||
|
NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -443,7 +443,7 @@ u32 gk20a_ce_create_context(struct gk20a *g,
|
|||||||
err = -ENOMEM;
|
err = -ENOMEM;
|
||||||
goto end;
|
goto end;
|
||||||
}
|
}
|
||||||
ce_ctx->ch->wdt_enabled = false;
|
ce_ctx->ch->timeout.enabled = false;
|
||||||
|
|
||||||
/* bind the channel to the vm */
|
/* bind the channel to the vm */
|
||||||
err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
|
err = __gk20a_vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
|
||||||
|
|||||||
@@ -61,8 +61,6 @@ static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
|
|||||||
static struct channel_gk20a_job *channel_gk20a_joblist_peek(
|
static struct channel_gk20a_job *channel_gk20a_joblist_peek(
|
||||||
struct channel_gk20a *c);
|
struct channel_gk20a *c);
|
||||||
|
|
||||||
static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
|
|
||||||
|
|
||||||
/* allocate GPU channel */
|
/* allocate GPU channel */
|
||||||
static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
|
static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
|
||||||
{
|
{
|
||||||
@@ -696,14 +694,19 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
|
|||||||
/* By default, channel is regular (non-TSG) channel */
|
/* By default, channel is regular (non-TSG) channel */
|
||||||
ch->tsgid = NVGPU_INVALID_TSG_ID;
|
ch->tsgid = NVGPU_INVALID_TSG_ID;
|
||||||
|
|
||||||
/* reset timeout counter and update timestamp */
|
/* clear ctxsw timeout counter and update timestamp */
|
||||||
ch->timeout_accumulated_ms = 0;
|
ch->timeout_accumulated_ms = 0;
|
||||||
ch->timeout_gpfifo_get = 0;
|
ch->timeout_gpfifo_get = 0;
|
||||||
/* set gr host default timeout */
|
/* set gr host default timeout */
|
||||||
ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
|
ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
|
||||||
ch->timeout_debug_dump = true;
|
ch->timeout_debug_dump = true;
|
||||||
ch->has_timedout = false;
|
ch->has_timedout = false;
|
||||||
ch->wdt_enabled = true;
|
|
||||||
|
/* init kernel watchdog timeout */
|
||||||
|
ch->timeout.enabled = true;
|
||||||
|
ch->timeout.limit_ms = g->ch_wdt_timeout_ms;
|
||||||
|
ch->timeout.debug_dump = true;
|
||||||
|
|
||||||
ch->obj_class = 0;
|
ch->obj_class = 0;
|
||||||
ch->subctx_id = 0;
|
ch->subctx_id = 0;
|
||||||
ch->runqueue_sel = 0;
|
ch->runqueue_sel = 0;
|
||||||
@@ -1166,10 +1169,10 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!c->g->timeouts_enabled || !c->wdt_enabled)
|
if (!c->g->timeouts_enabled || !c->timeout.enabled)
|
||||||
acquire_timeout = 0;
|
acquire_timeout = 0;
|
||||||
else
|
else
|
||||||
acquire_timeout = gk20a_get_channel_watchdog_timeout(c);
|
acquire_timeout = c->timeout.limit_ms;
|
||||||
|
|
||||||
err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va,
|
err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va,
|
||||||
c->gpfifo.entry_num,
|
c->gpfifo.entry_num,
|
||||||
@@ -1265,11 +1268,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
|
|||||||
ch->timeout_accumulated_ms > ch->timeout_ms_max;
|
ch->timeout_accumulated_ms > ch->timeout_ms_max;
|
||||||
}
|
}
|
||||||
|
|
||||||
static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
|
|
||||||
{
|
|
||||||
return ch->g->ch_wdt_timeout_ms;
|
|
||||||
}
|
|
||||||
|
|
||||||
u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
|
u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
|
||||||
{
|
{
|
||||||
update_gp_get(c->g, c);
|
update_gp_get(c->g, c);
|
||||||
@@ -1282,7 +1280,7 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
|
|||||||
ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch);
|
ch->timeout.pb_get = ch->g->ops.fifo.userd_pb_get(ch->g, ch);
|
||||||
ch->timeout.running = true;
|
ch->timeout.running = true;
|
||||||
nvgpu_timeout_init(ch->g, &ch->timeout.timer,
|
nvgpu_timeout_init(ch->g, &ch->timeout.timer,
|
||||||
gk20a_get_channel_watchdog_timeout(ch),
|
ch->timeout.limit_ms,
|
||||||
NVGPU_TIMER_CPU_TIMER);
|
NVGPU_TIMER_CPU_TIMER);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1303,10 +1301,10 @@ static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
|
|||||||
*/
|
*/
|
||||||
static void gk20a_channel_timeout_start(struct channel_gk20a *ch)
|
static void gk20a_channel_timeout_start(struct channel_gk20a *ch)
|
||||||
{
|
{
|
||||||
if (!ch->g->timeouts_enabled || !gk20a_get_channel_watchdog_timeout(ch))
|
if (!ch->g->timeouts_enabled)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!ch->wdt_enabled)
|
if (!ch->timeout.enabled)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
|
nvgpu_raw_spinlock_acquire(&ch->timeout.lock);
|
||||||
@@ -1425,11 +1423,13 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
|
|||||||
nvgpu_err(g, "Job on channel %d timed out",
|
nvgpu_err(g, "Job on channel %d timed out",
|
||||||
ch->chid);
|
ch->chid);
|
||||||
|
|
||||||
gk20a_debug_dump(g);
|
/* force reset calls gk20a_debug_dump but not this */
|
||||||
|
if (ch->timeout.debug_dump)
|
||||||
gk20a_gr_debug_dump(g);
|
gk20a_gr_debug_dump(g);
|
||||||
|
|
||||||
g->ops.fifo.force_reset_ch(ch,
|
g->ops.fifo.force_reset_ch(ch,
|
||||||
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true);
|
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
|
||||||
|
ch->timeout.debug_dump);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -96,11 +96,17 @@ struct channel_gk20a_joblist {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct channel_gk20a_timeout {
|
struct channel_gk20a_timeout {
|
||||||
|
/* lock protects the running timer state */
|
||||||
struct nvgpu_raw_spinlock lock;
|
struct nvgpu_raw_spinlock lock;
|
||||||
struct nvgpu_timeout timer;
|
struct nvgpu_timeout timer;
|
||||||
bool running;
|
bool running;
|
||||||
u32 gp_get;
|
u32 gp_get;
|
||||||
u64 pb_get;
|
u64 pb_get;
|
||||||
|
|
||||||
|
/* lock not needed */
|
||||||
|
u32 limit_ms;
|
||||||
|
bool enabled;
|
||||||
|
bool debug_dump;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -167,7 +173,6 @@ struct channel_gk20a {
|
|||||||
struct nvgpu_semaphore_int *hw_sema;
|
struct nvgpu_semaphore_int *hw_sema;
|
||||||
|
|
||||||
int chid;
|
int chid;
|
||||||
bool wdt_enabled;
|
|
||||||
nvgpu_atomic_t bound;
|
nvgpu_atomic_t bound;
|
||||||
bool vpr;
|
bool vpr;
|
||||||
bool deterministic;
|
bool deterministic;
|
||||||
@@ -203,7 +208,9 @@ struct channel_gk20a {
|
|||||||
u32 timeout_accumulated_ms;
|
u32 timeout_accumulated_ms;
|
||||||
u32 timeout_gpfifo_get;
|
u32 timeout_gpfifo_get;
|
||||||
|
|
||||||
|
/* kernel watchdog to kill stuck jobs */
|
||||||
struct channel_gk20a_timeout timeout;
|
struct channel_gk20a_timeout timeout;
|
||||||
|
|
||||||
/* for job cleanup handling in the background worker */
|
/* for job cleanup handling in the background worker */
|
||||||
struct nvgpu_list_node worker_item;
|
struct nvgpu_list_node worker_item;
|
||||||
|
|
||||||
|
|||||||
@@ -1808,7 +1808,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
|
|||||||
else {
|
else {
|
||||||
struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
|
struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
|
||||||
|
|
||||||
if (gk20a_fifo_error_tsg(g, tsg))
|
if (gk20a_fifo_error_tsg(g, tsg) && verbose)
|
||||||
gk20a_debug_dump(g);
|
gk20a_debug_dump(g);
|
||||||
|
|
||||||
gk20a_fifo_abort_tsg(g, tsgid, false);
|
gk20a_fifo_abort_tsg(g, tsgid, false);
|
||||||
|
|||||||
@@ -1577,13 +1577,15 @@ struct nvgpu_cycle_stats_snapshot_args {
|
|||||||
#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH 1
|
#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH 1
|
||||||
#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH 2
|
#define NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH 2
|
||||||
|
|
||||||
/* disable watchdog per-channel */
|
/* configure watchdog per-channel */
|
||||||
struct nvgpu_channel_wdt_args {
|
struct nvgpu_channel_wdt_args {
|
||||||
__u32 wdt_status;
|
__u32 wdt_status;
|
||||||
__u32 padding;
|
__u32 timeout_ms;
|
||||||
};
|
};
|
||||||
#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1
|
#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT (1 << 0)
|
||||||
#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2
|
#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT (1 << 1)
|
||||||
|
#define NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT (1 << 2)
|
||||||
|
#define NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP (1 << 3)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Interleaving channels in a runlist is an approach to improve
|
* Interleaving channels in a runlist is an approach to improve
|
||||||
|
|||||||
Reference in New Issue
Block a user