gpu: nvgpu: Add NVGPU_CHANNEL_WDT flag

The channel watchdog (WDT) code is now guarded by the NVGPU_CHANNEL_WDT
compile-time flag so that it can be compiled out for safety builds.
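
To make the mechanism concrete, here is a minimal standalone sketch (not driver code; the struct and program are invented for illustration) of how code guarded by the flag disappears when the build does not pass -DNVGPU_CHANNEL_WDT:

/* demo.c - illustration only, not part of nvgpu.
 * With the flag:    cc -DNVGPU_CHANNEL_WDT demo.c -o demo
 * Without the flag: cc demo.c -o demo
 */
#include <stdio.h>

struct channel {
	int id;
#ifdef NVGPU_CHANNEL_WDT
	int wdt_enabled;	/* watchdog state exists only when compiled in */
#endif
};

int main(void)
{
	struct channel ch = { .id = 7 };

#ifdef NVGPU_CHANNEL_WDT
	ch.wdt_enabled = 1;
	printf("channel %d: watchdog compiled in (enabled=%d)\n",
	       ch.id, ch.wdt_enabled);
#else
	printf("channel %d: watchdog compiled out\n", ch.id);
#endif
	return 0;
}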

Jira NVGPU-3012

Change-Id: I0ca54af9d7b1b8e01f4090442341eaaadca8e339
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2114480
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:       Debarshi Dutta
Authored:     2019-05-08 14:52:51 +05:30
Committed by: mobile promotions
Parent:       bf561f38f7
Commit:       1dea88c6c7
10 changed files with 76 additions and 19 deletions

View File

@@ -27,6 +27,7 @@ endif
ccflags-y += -DNVGPU_ENGINE
ccflags-y += -DNVGPU_USERD
ccflags-y += -DNVGPU_CHANNEL_WDT
obj-$(CONFIG_GK20A) := nvgpu.o

View File

@@ -31,6 +31,9 @@ NV_COMPONENT_CFLAGS += -DNVGPU_DEBUGGER
# Enable USERD for safety build until we switch to user mode submits only
NV_COMPONENT_CFLAGS += -DNVGPU_USERD
# Enable Channel WDT for safety build until we switch to user mode submits only
NV_COMPONENT_CFLAGS += -DNVGPU_CHANNEL_WDT
# Enable iGPU LS PMU for safety build until devctl whitelisting is done
NVGPU_LS_PMU := 1
NV_COMPONENT_CFLAGS += -DNVGPU_LS_PMU

View File

@@ -545,7 +545,10 @@ u32 nvgpu_ce_create_context(struct gk20a *g,
nvgpu_err(g, "ce: gk20a channel not available");
goto end;
}
#ifdef NVGPU_CHANNEL_WDT
ce_ctx->ch->wdt.enabled = false;
#endif
/* bind the channel to the vm */
err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);

View File

@@ -724,10 +724,12 @@ struct nvgpu_channel *gk20a_open_new_channel(struct gk20a *g,
ch->ctxsw_timeout_debug_dump = true;
ch->unserviceable = false;
#ifdef NVGPU_CHANNEL_WDT
/* init kernel watchdog timeout */
ch->wdt.enabled = true;
ch->wdt.limit_ms = g->ch_wdt_init_limit_ms;
ch->wdt.debug_dump = true;
#endif
ch->obj_class = 0;
ch->subctx_id = 0;
@@ -1220,7 +1222,7 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
u32 gpfifo_size, gpfifo_entry_size;
u64 gpfifo_gpu_va;
int err = 0;
u64 pbdma_acquire_timeout;
u64 pbdma_acquire_timeout = 0ULL;
gpfifo_size = args->num_gpfifo_entries;
gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
@@ -1332,11 +1334,14 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
}
}
if (!nvgpu_is_timeouts_enabled(c->g) || !c->wdt.enabled) {
pbdma_acquire_timeout = 0;
} else {
#ifdef NVGPU_CHANNEL_WDT
if (c->wdt.enabled && nvgpu_is_timeouts_enabled(c->g)) {
pbdma_acquire_timeout = c->wdt.limit_ms;
}
#else
if (nvgpu_is_timeouts_enabled(c->g)) {
pbdma_acquire_timeout = g->ch_wdt_init_limit_ms;
}
#endif
err = g->ops.ramfc.setup(c, gpfifo_gpu_va,
c->gpfifo.entry_num, pbdma_acquire_timeout,
@@ -1510,6 +1515,8 @@ u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch)
return nvgpu_channel_get_gpfifo_free_count(ch);
}
#ifdef NVGPU_CHANNEL_WDT
static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch)
{
struct gk20a *g = ch->g;
@@ -1755,6 +1762,8 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)
}
}
#endif
static inline struct nvgpu_channel_worker *
nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
{
@@ -1762,6 +1771,7 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
((uintptr_t)worker - offsetof(struct nvgpu_channel_worker, worker));
};
#ifdef NVGPU_CHANNEL_WDT
static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
{
@@ -1797,6 +1807,18 @@ static void nvgpu_channel_worker_poll_wakeup_post_process_item(
}
}
}
static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
struct nvgpu_worker *worker)
{
struct nvgpu_channel_worker *ch_worker =
nvgpu_channel_worker_from_worker(worker);
return ch_worker->watchdog_interval;
}
#endif
static void nvgpu_channel_worker_poll_wakeup_process_item(
struct nvgpu_list_node *work_item)
{
@@ -1812,25 +1834,18 @@ static void nvgpu_channel_worker_poll_wakeup_process_item(
nvgpu_channel_put(ch);
}
static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
struct nvgpu_worker *worker)
{
struct nvgpu_channel_worker *ch_worker =
nvgpu_channel_worker_from_worker(worker);
return ch_worker->watchdog_interval;
}
static const struct nvgpu_worker_ops channel_worker_ops = {
#ifdef NVGPU_CHANNEL_WDT
.pre_process = nvgpu_channel_worker_poll_init,
.wakeup_early_exit = NULL,
.wakeup_post_process =
nvgpu_channel_worker_poll_wakeup_post_process_item,
.wakeup_timeout =
nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
#endif
.wakeup_early_exit = NULL,
.wakeup_process_item =
nvgpu_channel_worker_poll_wakeup_process_item,
.wakeup_condition = NULL,
.wakeup_timeout =
nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
};
/**
@@ -1938,7 +1953,9 @@ int gk20a_channel_add_job(struct nvgpu_channel *c,
job->num_mapped_buffers = num_mapped_buffers;
job->mapped_buffers = mapped_buffers;
#ifdef NVGPU_CHANNEL_WDT
nvgpu_channel_wdt_start(c);
#endif
if (!pre_alloc_enabled) {
channel_gk20a_joblist_lock(c);
@@ -1985,7 +2002,9 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
struct nvgpu_channel_job *job;
struct gk20a *g;
bool job_finished = false;
#ifdef NVGPU_CHANNEL_WDT
bool watchdog_on = false;
#endif
c = nvgpu_channel_get(c);
if (c == NULL) {
@@ -2000,6 +2019,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
vm = c->vm;
g = c->g;
#ifdef NVGPU_CHANNEL_WDT
/*
* If !clean_all, we're in a condition where watchdog isn't supported
* anyway (this would be a no-op).
@@ -2007,6 +2027,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
if (clean_all) {
watchdog_on = nvgpu_channel_wdt_stop(c);
}
#endif
/* Synchronize with abort cleanup that needs the jobs. */
nvgpu_mutex_acquire(&c->joblist.cleanup_lock);
@@ -2035,6 +2056,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
completed = nvgpu_fence_is_expired(job->post_fence);
if (!completed) {
#ifdef NVGPU_CHANNEL_WDT
/*
* The watchdog eventually sees an updated gp_get if
* something happened in this loop. A new job can have
@@ -2045,6 +2067,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
if (clean_all && watchdog_on) {
nvgpu_channel_wdt_continue(c);
}
#endif
break;
}
@@ -2298,7 +2321,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
nvgpu_spinlock_init(&c->ref_actions_lock);
#endif
nvgpu_spinlock_init(&c->joblist.dynamic.lock);
#ifdef NVGPU_CHANNEL_WDT
nvgpu_spinlock_init(&c->wdt.lock);
#endif
nvgpu_init_list_node(&c->joblist.dynamic.jobs);
nvgpu_init_list_node(&c->dbg_s_list);
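
For orientation, the guarded code in this file implements a simple life cycle: the watchdog is armed when a job is added, rearmed while jobs still make progress, and stopped when cleanup runs; if no progress is seen within the limit, the poll worker dumps state and triggers recovery. A condensed, illustrative-only sketch follows (names and types are invented for the example and are not the nvgpu API):

#include <stdbool.h>
#include <stdio.h>

struct wdt_state {
	bool enabled;		/* per-channel policy, cf. ch->wdt.enabled */
	bool running;		/* a job is in flight and being watched */
	unsigned int limit_ms;	/* cf. g->ch_wdt_init_limit_ms */
};

static void wdt_start(struct wdt_state *w)	/* on job submit */
{
	if (w->enabled)
		w->running = true;
}

static bool wdt_stop(struct wdt_state *w)	/* before job cleanup */
{
	bool was_running = w->running;

	w->running = false;
	return was_running;
}

static void wdt_poll(struct wdt_state *w, bool progress_made)
{
	/* The real worker compares gp_get against the last value it saw;
	 * a boolean stands in for that progress check here. */
	if (w->running && !progress_made)
		printf("no progress within %u ms: dump state, recover\n",
		       w->limit_ms);
}

int main(void)
{
	struct wdt_state w = { .enabled = true, .limit_ms = 5000 };

	wdt_start(&w);		/* cf. nvgpu_channel_wdt_start() in add_job */
	wdt_poll(&w, false);	/* the poll worker fires after limit_ms idle */
	if (wdt_stop(&w))	/* cf. nvgpu_channel_wdt_stop() in cleanup */
		printf("watchdog stopped while it was running\n");
	return 0;
}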

View File

@@ -402,11 +402,14 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
*/
need_job_tracking = (flag_fence_wait ||
flag_fence_get ||
c->wdt.enabled ||
(nvgpu_is_enabled(g, NVGPU_CAN_RAILGATE)
&& !c->deterministic) ||
!skip_buffer_refcounting);
#ifdef NVGPU_CHANNEL_WDT
need_job_tracking = need_job_tracking || c->wdt.enabled;
#endif
if (need_job_tracking) {
bool need_sync_framework = false;
@@ -439,9 +442,12 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
*/
need_deferred_cleanup = !c->deterministic ||
need_sync_framework ||
c->wdt.enabled ||
!skip_buffer_refcounting;
#ifdef NVGPU_CHANNEL_WDT
need_deferred_cleanup = need_deferred_cleanup || c->wdt.enabled;
#endif
/*
* For deterministic channels, we don't allow deferred clean_up
* processing to occur. In cases we hit this, we fail the submit
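
The truncated comment above carries the key constraint in this path: deterministic channels must not depend on deferred (background) cleanup, so a submit that would require it is rejected. A standalone sketch of that decision, with the watchdog term folded in only when the flag is compiled in (invented helper, not the driver's function):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int check_submit(bool deterministic, bool need_sync_framework,
			bool wdt_enabled, bool skip_buffer_refcounting)
{
	bool need_deferred_cleanup = !deterministic || need_sync_framework ||
				     !skip_buffer_refcounting;

#ifdef NVGPU_CHANNEL_WDT
	need_deferred_cleanup = need_deferred_cleanup || wdt_enabled;
#else
	(void)wdt_enabled;
#endif

	if (deterministic && need_deferred_cleanup)
		return -EINVAL;	/* deterministic submit cannot be guaranteed */
	return 0;
}

int main(void)
{
	printf("deterministic channel with wdt: %d\n",
	       check_submit(true, false, true, true));
	printf("ordinary channel:               %d\n",
	       check_submit(false, false, true, true));
	return 0;
}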

View File

@@ -62,11 +62,15 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
{
nvgpu_tsg_set_error_notifier(g, tsg,
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
#ifdef NVGPU_CHANNEL_WDT
/*
* Cancel all channels' wdt since ctxsw timeout might
* trigger multiple watchdogs at a time
*/
nvgpu_channel_wdt_restart_all_channels(g);
#endif
nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump,
RC_TYPE_CTXSW_TIMEOUT);
}
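
The comment in this hunk explains the intent: a single context-switch timeout can leave several per-channel watchdogs about to expire for the same underlying hang, so they are all restarted and the ctxsw-timeout path performs the one recovery. A standalone sketch of that idea (invented names, not the nvgpu API):

#include <stdbool.h>
#include <stdio.h>

#define NUM_CHANNELS 4

struct wdt {
	bool running;
	unsigned int elapsed_ms;
};

/* Rewind every running watchdog so the ctxsw-timeout recovery is the only
 * recovery in flight, rather than one per almost-expired channel. */
static void wdt_restart_all(struct wdt wdts[], int n)
{
	for (int i = 0; i < n; i++) {
		if (wdts[i].running)
			wdts[i].elapsed_ms = 0;
	}
}

int main(void)
{
	struct wdt wdts[NUM_CHANNELS] = {
		{ true, 4900 }, { true, 4800 }, { false, 0 }, { true, 100 },
	};

	wdt_restart_all(wdts, NUM_CHANNELS);

	for (int i = 0; i < NUM_CHANNELS; i++)
		printf("channel %d: running=%d elapsed=%u ms\n",
		       i, (int)wdts[i].running, wdts[i].elapsed_ms);
	return 0;
}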

View File

@@ -211,6 +211,8 @@ struct nvgpu_channel_joblist {
struct nvgpu_mutex cleanup_lock;
};
#ifdef NVGPU_CHANNEL_WDT
struct nvgpu_channel_wdt {
/* lock protects the running timer state */
struct nvgpu_spinlock lock;
@@ -225,6 +227,8 @@ struct nvgpu_channel_wdt {
bool debug_dump;
};
#endif
/*
* Track refcount actions, saving their stack traces. This number specifies how
* many most recent actions are stored in a buffer. Set to 0 to disable. 128
@@ -318,8 +322,10 @@ struct nvgpu_channel {
struct nvgpu_cond notifier_wq;
struct nvgpu_cond semaphore_wq;
#ifdef NVGPU_CHANNEL_WDT
/* kernel watchdog to kill stuck jobs */
struct nvgpu_channel_wdt wdt;
#endif
/* for job cleanup handling in the background worker */
struct nvgpu_list_node worker_item;

View File

@@ -2075,9 +2075,12 @@ struct gk20a {
struct nvgpu_ltc *ltc;
struct nvgpu_channel_worker {
u32 watchdog_interval;
struct nvgpu_worker worker;
#ifdef NVGPU_CHANNEL_WDT
u32 watchdog_interval;
struct nvgpu_timeout timeout;
#endif
} channel_worker;
struct nvgpu_clk_arb_worker {

View File

@@ -1338,7 +1338,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
goto err_get_gk20a_channel;
}
#ifdef NVGPU_CHANNEL_WDT
ch->wdt.enabled = false;
#endif
/* bind the channel to the vm */
err = g->ops.mm.vm_bind_channel(g->mm.cde.vm, ch);

View File

@@ -285,6 +285,7 @@ int gk20a_channel_free_cycle_stats_snapshot(struct nvgpu_channel *ch)
static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
struct nvgpu_channel_wdt_args *args)
{
#ifdef NVGPU_CHANNEL_WDT
u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
@@ -302,6 +303,9 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
return 0;
#else
return -EINVAL;
#endif
}
static void gk20a_channel_free_error_notifiers(struct nvgpu_channel *ch)
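
With the watchdog compiled out, the ioctl handler above keeps its entry point but rejects the request with -EINVAL. A standalone sketch of that stub pattern (invented helper name, not the actual ioctl plumbing):

#include <errno.h>
#include <stdio.h>

static int set_wdt_status(unsigned int wdt_status)
{
#ifdef NVGPU_CHANNEL_WDT
	/* parse wdt_status and update the per-channel watchdog here */
	(void)wdt_status;
	return 0;
#else
	(void)wdt_status;
	return -EINVAL;	/* watchdog control is not built into this image */
#endif
}

int main(void)
{
	int err = set_wdt_status(1U);

	if (err != 0)
		printf("watchdog control unavailable (%d)\n", err);
	else
		printf("watchdog status updated\n");
	return 0;
}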