mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: Add NVGPU_CHANNEL_WDT flag
The channel watchdog (wdt) feature is guarded by the NVGPU_CHANNEL_WDT compile-time flag so that it can be compiled out for safety builds.

Jira NVGPU-3012

Change-Id: I0ca54af9d7b1b8e01f4090442341eaaadca8e339
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2114480
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
committed by: mobile promotions
parent: bf561f38f7
commit: 1dea88c6c7
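Every hunk below applies the same compile-time guard: watchdog state, functions, and call sites are wrapped in #ifdef NVGPU_CHANNEL_WDT, so a build that omits the define drops the feature entirely. A minimal, self-contained sketch of the pattern; the names here are illustrative, not the driver's own:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative channel type; the real struct nvgpu_channel is far larger. */
struct channel {
#ifdef NVGPU_CHANNEL_WDT
        bool wdt_enabled;       /* exists only when the feature is compiled in */
#endif
        int chid;
};

static void channel_init(struct channel *ch)
{
#ifdef NVGPU_CHANNEL_WDT
        ch->wdt_enabled = true; /* feature defaults on in normal builds */
#endif
        ch->chid = 0;
}

int main(void)
{
        struct channel ch;

        channel_init(&ch);
#ifdef NVGPU_CHANNEL_WDT
        printf("wdt compiled in, enabled=%d\n", ch.wdt_enabled);
#else
        printf("wdt compiled out\n");
#endif
        return 0;
}

Compiling with -DNVGPU_CHANNEL_WDT (as the Makefile hunks below add) keeps the feature; omitting the flag removes both the state and, because the struct member disappears too, any call site left unguarded fails at compile time rather than misbehaving silently.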
@@ -27,6 +27,7 @@ endif
 
 ccflags-y += -DNVGPU_ENGINE
 ccflags-y += -DNVGPU_USERD
+ccflags-y += -DNVGPU_CHANNEL_WDT
 
 obj-$(CONFIG_GK20A) := nvgpu.o
 
@@ -31,6 +31,9 @@ NV_COMPONENT_CFLAGS += -DNVGPU_DEBUGGER
 # Enable USERD for safety build until we switch to user mode submits only
 NV_COMPONENT_CFLAGS += -DNVGPU_USERD
 
+# Enable Channel WDT for safety build until we switch to user mode submits only
+NV_COMPONENT_CFLAGS += -DNVGPU_CHANNEL_WDT
+
 # Enable iGPU LS PMU for safety build until devctl whitelisting is done
 NVGPU_LS_PMU := 1
 NV_COMPONENT_CFLAGS += -DNVGPU_LS_PMU
@@ -545,7 +545,10 @@ u32 nvgpu_ce_create_context(struct gk20a *g,
                nvgpu_err(g, "ce: gk20a channel not available");
                goto end;
        }
 
+#ifdef NVGPU_CHANNEL_WDT
        ce_ctx->ch->wdt.enabled = false;
+#endif
+
        /* bind the channel to the vm */
        err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
@@ -724,10 +724,12 @@ struct nvgpu_channel *gk20a_open_new_channel(struct gk20a *g,
        ch->ctxsw_timeout_debug_dump = true;
        ch->unserviceable = false;
 
+#ifdef NVGPU_CHANNEL_WDT
        /* init kernel watchdog timeout */
        ch->wdt.enabled = true;
        ch->wdt.limit_ms = g->ch_wdt_init_limit_ms;
        ch->wdt.debug_dump = true;
+#endif
 
        ch->obj_class = 0;
        ch->subctx_id = 0;
@@ -1220,7 +1222,7 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
        u32 gpfifo_size, gpfifo_entry_size;
        u64 gpfifo_gpu_va;
        int err = 0;
-       u64 pbdma_acquire_timeout;
+       u64 pbdma_acquire_timeout = 0ULL;
 
        gpfifo_size = args->num_gpfifo_entries;
        gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
@@ -1332,11 +1334,14 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
                }
        }
 
-       if (!nvgpu_is_timeouts_enabled(c->g) || !c->wdt.enabled) {
-               pbdma_acquire_timeout = 0;
-       } else {
+#ifdef NVGPU_CHANNEL_WDT
+       if (c->wdt.enabled && nvgpu_is_timeouts_enabled(c->g)) {
                pbdma_acquire_timeout = c->wdt.limit_ms;
        }
+#else
+       if (nvgpu_is_timeouts_enabled(c->g)) {
+               pbdma_acquire_timeout = g->ch_wdt_init_limit_ms;
+       }
+#endif
 
        err = g->ops.ramfc.setup(c, gpfifo_gpu_va,
                c->gpfifo.entry_num, pbdma_acquire_timeout,
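The net effect of these two hunks: pbdma_acquire_timeout now starts at 0ULL (acquire never times out) and is raised only when timeouts are enabled, using the per-channel wdt limit when the watchdog is compiled in and the global ch_wdt_init_limit_ms otherwise. A sketch of the resulting selection, with stand-in parameters rather than the driver's structs:

/* Sketch only: simplified stand-ins for the driver's types and helpers. */
static unsigned long long pick_pbdma_acquire_timeout(int timeouts_enabled,
                int wdt_enabled, unsigned long long wdt_limit_ms,
                unsigned long long global_limit_ms)
{
        unsigned long long timeout = 0ULL;      /* default: never times out */

#ifdef NVGPU_CHANNEL_WDT
        (void)global_limit_ms;
        if (wdt_enabled && timeouts_enabled)
                timeout = wdt_limit_ms;         /* per-channel watchdog limit */
#else
        (void)wdt_enabled;
        (void)wdt_limit_ms;
        if (timeouts_enabled)
                timeout = global_limit_ms;      /* global fallback, no per-channel state */
#endif
        return timeout;
}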
@@ -1510,6 +1515,8 @@ u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch)
        return nvgpu_channel_get_gpfifo_free_count(ch);
 }
 
+#ifdef NVGPU_CHANNEL_WDT
+
 static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch)
 {
        struct gk20a *g = ch->g;
@@ -1755,6 +1762,8 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)
        }
 }
 
+#endif
+
 static inline struct nvgpu_channel_worker *
 nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 {
@@ -1762,6 +1771,7 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
                ((uintptr_t)worker - offsetof(struct nvgpu_channel_worker, worker));
 };
 
+#ifdef NVGPU_CHANNEL_WDT
 static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
 {
@@ -1797,6 +1807,18 @@ static void nvgpu_channel_worker_poll_wakeup_post_process_item(
                }
        }
 }
 
+static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
+       struct nvgpu_worker *worker)
+{
+       struct nvgpu_channel_worker *ch_worker =
+               nvgpu_channel_worker_from_worker(worker);
+
+       return ch_worker->watchdog_interval;
+}
+
+#endif
+
 static void nvgpu_channel_worker_poll_wakeup_process_item(
        struct nvgpu_list_node *work_item)
 {
@@ -1812,25 +1834,18 @@ static void nvgpu_channel_worker_poll_wakeup_process_item(
        nvgpu_channel_put(ch);
 }
 
-static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
-       struct nvgpu_worker *worker)
-{
-       struct nvgpu_channel_worker *ch_worker =
-               nvgpu_channel_worker_from_worker(worker);
-
-       return ch_worker->watchdog_interval;
-}
-
 static const struct nvgpu_worker_ops channel_worker_ops = {
+#ifdef NVGPU_CHANNEL_WDT
        .pre_process = nvgpu_channel_worker_poll_init,
-       .wakeup_early_exit = NULL,
        .wakeup_post_process =
                nvgpu_channel_worker_poll_wakeup_post_process_item,
+       .wakeup_timeout =
+               nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
+#endif
+       .wakeup_early_exit = NULL,
        .wakeup_process_item =
                nvgpu_channel_worker_poll_wakeup_process_item,
        .wakeup_condition = NULL,
-       .wakeup_timeout =
-               nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
 };
 
 /**
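After this hunk, the wdt-specific hooks of channel_worker_ops are populated only under the flag; designated initializers leave unnamed members NULL, which is why the worker core must tolerate absent callbacks in a safety build. A compact sketch of the idiom, with an invented ops table rather than the real nvgpu_worker_ops:

#include <stddef.h>

/* Illustrative ops table; the real nvgpu_worker_ops has more hooks. */
struct worker_ops {
        void (*pre_process)(void);
        void (*wakeup_process_item)(void);
        unsigned int (*wakeup_timeout)(void);
};

#ifdef NVGPU_CHANNEL_WDT
static void wdt_pre_process(void) { /* arm the watchdog timer */ }
static unsigned int wdt_timeout(void) { return 100U; /* poll interval, ms */ }
#endif

static void process_item(void) { /* job cleanup, needed in every build */ }

static const struct worker_ops ops = {
#ifdef NVGPU_CHANNEL_WDT
        .pre_process = wdt_pre_process,
        .wakeup_timeout = wdt_timeout,
#endif
        .wakeup_process_item = process_item,
        /* members not named here are implicitly NULL */
};

This also explains why the diff spells out .wakeup_early_exit = NULL and .wakeup_condition = NULL below the #endif: those hooks are intentionally unused in every configuration.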
@@ -1938,7 +1953,9 @@ int gk20a_channel_add_job(struct nvgpu_channel *c,
        job->num_mapped_buffers = num_mapped_buffers;
        job->mapped_buffers = mapped_buffers;
 
+#ifdef NVGPU_CHANNEL_WDT
        nvgpu_channel_wdt_start(c);
+#endif
 
        if (!pre_alloc_enabled) {
                channel_gk20a_joblist_lock(c);
@@ -1985,7 +2002,9 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
        struct nvgpu_channel_job *job;
        struct gk20a *g;
        bool job_finished = false;
+#ifdef NVGPU_CHANNEL_WDT
        bool watchdog_on = false;
+#endif
 
        c = nvgpu_channel_get(c);
        if (c == NULL) {
@@ -2000,6 +2019,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
        vm = c->vm;
        g = c->g;
 
+#ifdef NVGPU_CHANNEL_WDT
        /*
         * If !clean_all, we're in a condition where watchdog isn't supported
         * anyway (this would be a no-op).
@@ -2007,6 +2027,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
        if (clean_all) {
                watchdog_on = nvgpu_channel_wdt_stop(c);
        }
+#endif
 
        /* Synchronize with abort cleanup that needs the jobs. */
        nvgpu_mutex_acquire(&c->joblist.cleanup_lock);
@@ -2035,6 +2056,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 
                completed = nvgpu_fence_is_expired(job->post_fence);
                if (!completed) {
+#ifdef NVGPU_CHANNEL_WDT
                        /*
                         * The watchdog eventually sees an updated gp_get if
                         * something happened in this loop. A new job can have
@@ -2045,6 +2067,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
                        if (clean_all && watchdog_on) {
                                nvgpu_channel_wdt_continue(c);
                        }
+#endif
                        break;
                }
 
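The hunks around gk20a_channel_clean_up_jobs show the watchdog being paused while completed jobs are reaped (nvgpu_channel_wdt_stop) and re-armed only when an unfinished job is found (nvgpu_channel_wdt_continue), so cleanup latency is not misread as a hung channel. A self-contained sketch of that stop/continue protocol; the types and helpers are stand-ins, not the driver's API:

#include <stdbool.h>
#include <stddef.h>

struct job { bool complete; struct job *next; };
struct channel { struct job *jobs; bool wdt_running; };

static bool wdt_stop(struct channel *ch)
{
        bool was_running = ch->wdt_running;

        ch->wdt_running = false;        /* pause; remember prior state */
        return was_running;
}

static void wdt_continue(struct channel *ch)
{
        ch->wdt_running = true;         /* re-arm without resetting limits */
}

void clean_up_jobs(struct channel *ch, bool clean_all)
{
        bool watchdog_on = false;

        if (clean_all)
                watchdog_on = wdt_stop(ch);

        while (ch->jobs != NULL) {
                if (!ch->jobs->complete) {
                        /* work remains: resume the paused watchdog so a
                         * genuinely stuck job is still caught */
                        if (clean_all && watchdog_on)
                                wdt_continue(ch);
                        break;
                }
                ch->jobs = ch->jobs->next;      /* reap the finished job */
        }
}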
@@ -2298,7 +2321,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
        nvgpu_spinlock_init(&c->ref_actions_lock);
 #endif
        nvgpu_spinlock_init(&c->joblist.dynamic.lock);
+#ifdef NVGPU_CHANNEL_WDT
        nvgpu_spinlock_init(&c->wdt.lock);
+#endif
 
        nvgpu_init_list_node(&c->joblist.dynamic.jobs);
        nvgpu_init_list_node(&c->dbg_s_list);
@@ -402,11 +402,14 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
         */
        need_job_tracking = (flag_fence_wait ||
                        flag_fence_get ||
-                       c->wdt.enabled ||
                        (nvgpu_is_enabled(g, NVGPU_CAN_RAILGATE)
                         && !c->deterministic) ||
                        !skip_buffer_refcounting);
 
+#ifdef NVGPU_CHANNEL_WDT
+       need_job_tracking = need_job_tracking || c->wdt.enabled;
+#endif
+
        if (need_job_tracking) {
                bool need_sync_framework = false;
 
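Rather than leaving c->wdt.enabled buried in the multi-line boolean expression, this hunk computes the unconditional terms first and then ORs in the guarded term, keeping the #ifdef out of the expression body. A sketch of the equivalent two-step form, with parameters standing in for the channel and device state:

#include <stdbool.h>

/* Sketch only: parameters stand in for the channel and device state. */
static bool compute_need_job_tracking(bool flag_fence_wait, bool flag_fence_get,
                bool can_railgate, bool deterministic,
                bool skip_buffer_refcounting, bool wdt_enabled)
{
        /* Unconditional terms first... */
        bool need_job_tracking = flag_fence_wait ||
                        flag_fence_get ||
                        (can_railgate && !deterministic) ||
                        !skip_buffer_refcounting;

#ifdef NVGPU_CHANNEL_WDT
        /* ...then OR in the guarded term, so it vanishes cleanly when the
         * feature is compiled out. */
        need_job_tracking = need_job_tracking || wdt_enabled;
#else
        (void)wdt_enabled;
#endif
        return need_job_tracking;
}

The need_deferred_cleanup hunk that follows applies the same transformation.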
@@ -439,9 +442,12 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
         */
        need_deferred_cleanup = !c->deterministic ||
                        need_sync_framework ||
-                       c->wdt.enabled ||
                        !skip_buffer_refcounting;
 
+#ifdef NVGPU_CHANNEL_WDT
+       need_deferred_cleanup = need_deferred_cleanup || c->wdt.enabled;
+#endif
+
        /*
         * For deterministic channels, we don't allow deferred clean_up
         * processing to occur. In cases we hit this, we fail the submit
@@ -62,11 +62,15 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 {
        nvgpu_tsg_set_error_notifier(g, tsg,
                NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 
+#ifdef NVGPU_CHANNEL_WDT
        /*
         * Cancel all channels' wdt since ctxsw timeout might
         * trigger multiple watchdogs at a time
         */
        nvgpu_channel_wdt_restart_all_channels(g);
+#endif
 
        nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump,
                RC_TYPE_CTXSW_TIMEOUT);
 }
@@ -211,6 +211,8 @@ struct nvgpu_channel_joblist {
        struct nvgpu_mutex cleanup_lock;
 };
 
+#ifdef NVGPU_CHANNEL_WDT
+
 struct nvgpu_channel_wdt {
        /* lock protects the running timer state */
        struct nvgpu_spinlock lock;
@@ -225,6 +227,8 @@ struct nvgpu_channel_wdt {
        bool debug_dump;
 };
 
+#endif
+
 /*
  * Track refcount actions, saving their stack traces. This number specifies how
  * many most recent actions are stored in a buffer. Set to 0 to disable. 128
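Guarding the struct nvgpu_channel_wdt definition itself, and not just the wdt member further down, means any translation unit that touches watchdog state without the flag fails to build, which is exactly the failure mode a safety build wants. A minimal sketch of the same layering, with illustrative names:

#include <stdbool.h>

/* Sketch: feature struct and its embedding guarded by the same flag. */
#ifdef NVGPU_CHANNEL_WDT
struct channel_wdt {
        bool enabled;
        unsigned int limit_ms;
};
#endif

struct channel {
        int chid;
#ifdef NVGPU_CHANNEL_WDT
        struct channel_wdt wdt; /* absent entirely in safety builds */
#endif
};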
@@ -318,8 +322,10 @@ struct nvgpu_channel {
        struct nvgpu_cond notifier_wq;
        struct nvgpu_cond semaphore_wq;
 
+#ifdef NVGPU_CHANNEL_WDT
        /* kernel watchdog to kill stuck jobs */
        struct nvgpu_channel_wdt wdt;
+#endif
 
        /* for job cleanup handling in the background worker */
        struct nvgpu_list_node worker_item;
@@ -2075,9 +2075,12 @@ struct gk20a {
        struct nvgpu_ltc *ltc;
 
        struct nvgpu_channel_worker {
-               u32 watchdog_interval;
                struct nvgpu_worker worker;
+
+#ifdef NVGPU_CHANNEL_WDT
+               u32 watchdog_interval;
                struct nvgpu_timeout timeout;
+#endif
        } channel_worker;
 
        struct nvgpu_clk_arb_worker {
@@ -1338,7 +1338,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
                goto err_get_gk20a_channel;
        }
 
+#ifdef NVGPU_CHANNEL_WDT
        ch->wdt.enabled = false;
+#endif
 
        /* bind the channel to the vm */
        err = g->ops.mm.vm_bind_channel(g->mm.cde.vm, ch);
@@ -285,6 +285,7 @@ int gk20a_channel_free_cycle_stats_snapshot(struct nvgpu_channel *ch)
 static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
                struct nvgpu_channel_wdt_args *args)
 {
+#ifdef NVGPU_CHANNEL_WDT
        u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
                        NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
@@ -302,6 +303,9 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
                        NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
 
        return 0;
+#else
+       return -EINVAL;
+#endif
 }
 
 static void gk20a_channel_free_error_notifiers(struct nvgpu_channel *ch)
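With the feature compiled out, the wdt ioctl path above keeps its entry point but reports -EINVAL rather than silently accepting the request. A sketch of how a userspace caller might handle that; the ioctl request code and argument struct here are invented for illustration and are not the real nvgpu UAPI:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Illustrative only: the request code and argument layout are made up;
 * the real definitions live in the nvgpu UAPI headers. */
struct wdt_args { unsigned int wdt_status; };
#define EXAMPLE_CHANNEL_WDT _IOW('H', 0x14, struct wdt_args)

int set_channel_wdt(int ch_fd, unsigned int status)
{
        struct wdt_args args = { .wdt_status = status };

        if (ioctl(ch_fd, EXAMPLE_CHANNEL_WDT, &args) != 0) {
                if (errno == EINVAL) {
                        /* kernel built without NVGPU_CHANNEL_WDT (or bad
                         * flags); treat the watchdog as unavailable */
                        fprintf(stderr, "channel wdt not supported\n");
                }
                return -1;
        }
        return 0;
}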