gpu: nvgpu: Add NVGPU_CHANNEL_WDT flag

The channel watchdog (wdt) feature is now guarded by the NVGPU_CHANNEL_WDT
compile-time flag so that it can be compiled out for safety builds.
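
As a minimal sketch of the pattern (NVGPU_CHANNEL_WDT is the real flag; the
struct and function here are illustrative stand-ins, not nvgpu code):

    /* Watchdog state exists only when the build defines the flag,
     * e.g. via ccflags-y += -DNVGPU_CHANNEL_WDT. */
    struct channel_sketch {
    #ifdef NVGPU_CHANNEL_WDT
        int wdt_enabled;        /* watchdog state compiled in */
    #endif
        unsigned int chid;
    };

    static void channel_init_sketch(struct channel_sketch *ch)
    {
    #ifdef NVGPU_CHANNEL_WDT
        ch->wdt_enabled = 1;    /* default on, mirroring gk20a_open_new_channel() */
    #endif
        ch->chid = 0U;
    }

Every access to the guarded state sits under the same #ifdef, so a safety
build that omits the flag carries no watchdog code or data.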

Jira NVGPU-3012

Change-Id: I0ca54af9d7b1b8e01f4090442341eaaadca8e339
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2114480
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:     Debarshi Dutta <ddutta@nvidia.com>
Date:       2019-05-08 14:52:51 +05:30
Committed:  mobile promotions
Commit:     1dea88c6c7 (parent bf561f38f7)

10 changed files with 76 additions and 19 deletions


@@ -27,6 +27,7 @@ endif
 ccflags-y += -DNVGPU_ENGINE
 ccflags-y += -DNVGPU_USERD
+ccflags-y += -DNVGPU_CHANNEL_WDT
 obj-$(CONFIG_GK20A) := nvgpu.o


@@ -31,6 +31,9 @@ NV_COMPONENT_CFLAGS += -DNVGPU_DEBUGGER
 # Enable USERD for safety build until we switch to user mode submits only
 NV_COMPONENT_CFLAGS += -DNVGPU_USERD
+# Enable Channel WDT for safety build until we switch to user mode submits only
+NV_COMPONENT_CFLAGS += -DNVGPU_CHANNEL_WDT
 # Enable iGPU LS PMU for safety build until devctl whitelisting is done
 NVGPU_LS_PMU := 1
 NV_COMPONENT_CFLAGS += -DNVGPU_LS_PMU


@@ -545,7 +545,10 @@ u32 nvgpu_ce_create_context(struct gk20a *g,
 		nvgpu_err(g, "ce: gk20a channel not available");
 		goto end;
 	}
+#ifdef NVGPU_CHANNEL_WDT
 	ce_ctx->ch->wdt.enabled = false;
+#endif
 	/* bind the channel to the vm */
 	err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);


@@ -724,10 +724,12 @@ struct nvgpu_channel *gk20a_open_new_channel(struct gk20a *g,
 	ch->ctxsw_timeout_debug_dump = true;
 	ch->unserviceable = false;
+#ifdef NVGPU_CHANNEL_WDT
 	/* init kernel watchdog timeout */
 	ch->wdt.enabled = true;
 	ch->wdt.limit_ms = g->ch_wdt_init_limit_ms;
 	ch->wdt.debug_dump = true;
+#endif
 	ch->obj_class = 0;
 	ch->subctx_id = 0;
@@ -1220,7 +1222,7 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
 	u32 gpfifo_size, gpfifo_entry_size;
 	u64 gpfifo_gpu_va;
 	int err = 0;
-	u64 pbdma_acquire_timeout;
+	u64 pbdma_acquire_timeout = 0ULL;
 	gpfifo_size = args->num_gpfifo_entries;
 	gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
@@ -1332,11 +1334,14 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
 		}
 	}
-	if (!nvgpu_is_timeouts_enabled(c->g) || !c->wdt.enabled) {
-		pbdma_acquire_timeout = 0;
-	} else {
+#ifdef NVGPU_CHANNEL_WDT
+	if (c->wdt.enabled && nvgpu_is_timeouts_enabled(c->g)) {
 		pbdma_acquire_timeout = c->wdt.limit_ms;
 	}
+#else
+	if (nvgpu_is_timeouts_enabled(c->g)) {
+		pbdma_acquire_timeout = g->ch_wdt_init_limit_ms;
+	}
+#endif
 	err = g->ops.ramfc.setup(c, gpfifo_gpu_va,
 			c->gpfifo.entry_num, pbdma_acquire_timeout,
@@ -1510,6 +1515,8 @@ u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch)
 	return nvgpu_channel_get_gpfifo_free_count(ch);
 }
+#ifdef NVGPU_CHANNEL_WDT
 static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch)
 {
 	struct gk20a *g = ch->g;
@@ -1755,6 +1762,8 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)
 	}
 }
+#endif
 static inline struct nvgpu_channel_worker *
 nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 {
@@ -1762,6 +1771,7 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 	((uintptr_t)worker - offsetof(struct nvgpu_channel_worker, worker));
 };
+#ifdef NVGPU_CHANNEL_WDT
 static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
 {
@@ -1797,6 +1807,18 @@ static void nvgpu_channel_worker_poll_wakeup_post_process_item(
 		}
 	}
 }
+static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
+		struct nvgpu_worker *worker)
+{
+	struct nvgpu_channel_worker *ch_worker =
+		nvgpu_channel_worker_from_worker(worker);
+	return ch_worker->watchdog_interval;
+}
+#endif
 static void nvgpu_channel_worker_poll_wakeup_process_item(
 		struct nvgpu_list_node *work_item)
 {
@@ -1812,25 +1834,18 @@ static void nvgpu_channel_worker_poll_wakeup_process_item(
 	nvgpu_channel_put(ch);
 }
-static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
-		struct nvgpu_worker *worker)
-{
-	struct nvgpu_channel_worker *ch_worker =
-		nvgpu_channel_worker_from_worker(worker);
-	return ch_worker->watchdog_interval;
-}
 static const struct nvgpu_worker_ops channel_worker_ops = {
+#ifdef NVGPU_CHANNEL_WDT
 	.pre_process = nvgpu_channel_worker_poll_init,
-	.wakeup_early_exit = NULL,
 	.wakeup_post_process =
 		nvgpu_channel_worker_poll_wakeup_post_process_item,
+	.wakeup_timeout =
+		nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
+#endif
+	.wakeup_early_exit = NULL,
 	.wakeup_process_item =
 		nvgpu_channel_worker_poll_wakeup_process_item,
 	.wakeup_condition = NULL,
-	.wakeup_timeout =
-		nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
 };
 /**
@@ -1938,7 +1953,9 @@ int gk20a_channel_add_job(struct nvgpu_channel *c,
 	job->num_mapped_buffers = num_mapped_buffers;
 	job->mapped_buffers = mapped_buffers;
+#ifdef NVGPU_CHANNEL_WDT
 	nvgpu_channel_wdt_start(c);
+#endif
 	if (!pre_alloc_enabled) {
 		channel_gk20a_joblist_lock(c);
@@ -1985,7 +2002,9 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 	struct nvgpu_channel_job *job;
 	struct gk20a *g;
 	bool job_finished = false;
+#ifdef NVGPU_CHANNEL_WDT
 	bool watchdog_on = false;
+#endif
 	c = nvgpu_channel_get(c);
 	if (c == NULL) {
@@ -2000,6 +2019,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 	vm = c->vm;
 	g = c->g;
+#ifdef NVGPU_CHANNEL_WDT
 	/*
 	 * If !clean_all, we're in a condition where watchdog isn't supported
 	 * anyway (this would be a no-op).
@@ -2007,6 +2027,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 	if (clean_all) {
 		watchdog_on = nvgpu_channel_wdt_stop(c);
 	}
+#endif
 	/* Synchronize with abort cleanup that needs the jobs. */
 	nvgpu_mutex_acquire(&c->joblist.cleanup_lock);
@@ -2035,6 +2056,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 		completed = nvgpu_fence_is_expired(job->post_fence);
 		if (!completed) {
+#ifdef NVGPU_CHANNEL_WDT
 			/*
 			 * The watchdog eventually sees an updated gp_get if
 			 * something happened in this loop. A new job can have
@@ -2045,6 +2067,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 			if (clean_all && watchdog_on) {
 				nvgpu_channel_wdt_continue(c);
 			}
+#endif
 			break;
 		}
@@ -2298,7 +2321,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	nvgpu_spinlock_init(&c->ref_actions_lock);
 #endif
 	nvgpu_spinlock_init(&c->joblist.dynamic.lock);
+#ifdef NVGPU_CHANNEL_WDT
 	nvgpu_spinlock_init(&c->wdt.lock);
+#endif
 	nvgpu_init_list_node(&c->joblist.dynamic.jobs);
 	nvgpu_init_list_node(&c->dbg_s_list);
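
A note on the nvgpu_channel_setup_bind() hunk above: pbdma_acquire_timeout now
starts at 0ULL, meaning "acquire timeout disabled", so each branch of the
#ifdef only has to handle the enable case. A standalone sketch of that
selection, with the nvgpu state passed in as plain parameters (illustrative
signature, not the real one):

    static unsigned long long pick_acquire_timeout(int timeouts_enabled,
            int wdt_enabled, unsigned int wdt_limit_ms,
            unsigned int wdt_init_limit_ms)
    {
        unsigned long long pbdma_acquire_timeout = 0ULL; /* default: disabled */

    #ifdef NVGPU_CHANNEL_WDT
        (void)wdt_init_limit_ms;
        /* per-channel limit when the wdt is compiled in and enabled */
        if (wdt_enabled && timeouts_enabled)
            pbdma_acquire_timeout = wdt_limit_ms;
    #else
        (void)wdt_enabled;
        (void)wdt_limit_ms;
        /* without the wdt, fall back to the global init limit */
        if (timeouts_enabled)
            pbdma_acquire_timeout = wdt_init_limit_ms;
    #endif
        return pbdma_acquire_timeout;
    }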


@@ -402,11 +402,14 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 	 */
 	need_job_tracking = (flag_fence_wait ||
 			flag_fence_get ||
-			c->wdt.enabled ||
 			(nvgpu_is_enabled(g, NVGPU_CAN_RAILGATE)
 			 && !c->deterministic) ||
 			!skip_buffer_refcounting);
+#ifdef NVGPU_CHANNEL_WDT
+	need_job_tracking = need_job_tracking || c->wdt.enabled;
+#endif
 	if (need_job_tracking) {
 		bool need_sync_framework = false;
@@ -439,9 +442,12 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 	 */
 	need_deferred_cleanup = !c->deterministic ||
 			need_sync_framework ||
-			c->wdt.enabled ||
 			!skip_buffer_refcounting;
+#ifdef NVGPU_CHANNEL_WDT
+	need_deferred_cleanup = need_deferred_cleanup || c->wdt.enabled;
+#endif
 	/*
 	 * For deterministic channels, we don't allow deferred clean_up
 	 * processing to occur. In cases we hit this, we fail the submit
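
Both submit-path hunks use the same shape: compute the flag-independent part
of the condition first, then OR the wdt term in under the guard, which keeps
the #ifdef out of the middle of a multi-line boolean expression. A minimal
sketch (illustrative names):

    static int job_tracking_needed(int fence_wait, int refcounting,
            int wdt_enabled)
    {
        int need = fence_wait || refcounting;

    #ifdef NVGPU_CHANNEL_WDT
        /* the wdt term exists only when the feature is compiled in */
        need = need || wdt_enabled;
    #else
        (void)wdt_enabled;
    #endif
        return need;
    }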


@@ -62,11 +62,15 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 {
 	nvgpu_tsg_set_error_notifier(g, tsg,
 		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
+#ifdef NVGPU_CHANNEL_WDT
 	/*
 	 * Cancel all channels' wdt since ctxsw timeout might
 	 * trigger multiple watchdogs at a time
 	 */
 	nvgpu_channel_wdt_restart_all_channels(g);
+#endif
 	nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump,
 		RC_TYPE_CTXSW_TIMEOUT);
 }


@@ -211,6 +211,8 @@ struct nvgpu_channel_joblist {
 	struct nvgpu_mutex cleanup_lock;
 };
+#ifdef NVGPU_CHANNEL_WDT
 struct nvgpu_channel_wdt {
 	/* lock protects the running timer state */
 	struct nvgpu_spinlock lock;
@@ -225,6 +227,8 @@ struct nvgpu_channel_wdt {
 	bool debug_dump;
 };
+#endif
 /*
  * Track refcount actions, saving their stack traces. This number specifies how
  * many most recent actions are stored in a buffer. Set to 0 to disable. 128
@@ -318,8 +322,10 @@ struct nvgpu_channel {
 	struct nvgpu_cond notifier_wq;
 	struct nvgpu_cond semaphore_wq;
+#ifdef NVGPU_CHANNEL_WDT
 	/* kernel watchdog to kill stuck jobs */
 	struct nvgpu_channel_wdt wdt;
+#endif
 	/* for job cleanup handling in the background worker */
 	struct nvgpu_list_node worker_item;


@@ -2075,9 +2075,12 @@ struct gk20a {
 	struct nvgpu_ltc *ltc;
 	struct nvgpu_channel_worker {
-		u32 watchdog_interval;
 		struct nvgpu_worker worker;
+#ifdef NVGPU_CHANNEL_WDT
+		u32 watchdog_interval;
 		struct nvgpu_timeout timeout;
+#endif
 	} channel_worker;
 	struct nvgpu_clk_arb_worker {
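
Because the guarded fields change the layout of struct nvgpu_channel_worker,
every translation unit must see the same definition of NVGPU_CHANNEL_WDT,
which is why the flag is set once in the build files rather than per source
file. A toy illustration of the layout difference (stand-in types, not the
real nvgpu ones):

    struct worker_sketch {
        int worker_state;                  /* always present */
    #ifdef NVGPU_CHANNEL_WDT
        unsigned int watchdog_interval;    /* wdt-only fields */
        long timeout_state;
    #endif
    };
    /* sizeof(struct worker_sketch) differs between the two builds, so
     * objects compiled with mismatched flags must never be linked. */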


@@ -1338,7 +1338,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 		goto err_get_gk20a_channel;
 	}
+#ifdef NVGPU_CHANNEL_WDT
 	ch->wdt.enabled = false;
+#endif
 	/* bind the channel to the vm */
 	err = g->ops.mm.vm_bind_channel(g->mm.cde.vm, ch);


@@ -285,6 +285,7 @@ int gk20a_channel_free_cycle_stats_snapshot(struct nvgpu_channel *ch)
 static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
 		struct nvgpu_channel_wdt_args *args)
 {
+#ifdef NVGPU_CHANNEL_WDT
 	u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
 			NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
@@ -302,6 +303,9 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
 		NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
 	return 0;
+#else
+	return -EINVAL;
+#endif
 }
 static void gk20a_channel_free_error_notifiers(struct nvgpu_channel *ch)
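
With the flag compiled out, the wdt ioctl handler above fails with -EINVAL
instead of silently accepting the request, so userspace can detect that
watchdog control is unavailable. The stub pattern in isolation (a sketch;
only the -EINVAL contract comes from this change):

    #include <errno.h>

    static int set_wdt_status_sketch(unsigned int wdt_status)
    {
    #ifdef NVGPU_CHANNEL_WDT
        /* parse and apply wdt_status here, as the real handler does */
        (void)wdt_status;
        return 0;
    #else
        (void)wdt_status;
        return -EINVAL;    /* feature compiled out: reject, don't ignore */
    #endif
    }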