diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 0790a7c9e..85a5e8f98 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -27,6 +27,7 @@ endif ccflags-y += -DNVGPU_ENGINE ccflags-y += -DNVGPU_USERD +ccflags-y += -DNVGPU_CHANNEL_WDT obj-$(CONFIG_GK20A) := nvgpu.o diff --git a/drivers/gpu/nvgpu/Makefile.shared.configs b/drivers/gpu/nvgpu/Makefile.shared.configs index e5186b468..ca743435a 100644 --- a/drivers/gpu/nvgpu/Makefile.shared.configs +++ b/drivers/gpu/nvgpu/Makefile.shared.configs @@ -31,6 +31,9 @@ NV_COMPONENT_CFLAGS += -DNVGPU_DEBUGGER # Enable USERD for safety build until we switch to user mode submits only NV_COMPONENT_CFLAGS += -DNVGPU_USERD +# Enable Channel WDT for safety build until we switch to user mode submits only +NV_COMPONENT_CFLAGS += -DNVGPU_CHANNEL_WDT + # Enable iGPU LS PMU for safety build until devctl whitelisting is done NVGPU_LS_PMU := 1 NV_COMPONENT_CFLAGS += -DNVGPU_LS_PMU diff --git a/drivers/gpu/nvgpu/common/ce/ce.c b/drivers/gpu/nvgpu/common/ce/ce.c index 3d4f2109f..8675be834 100644 --- a/drivers/gpu/nvgpu/common/ce/ce.c +++ b/drivers/gpu/nvgpu/common/ce/ce.c @@ -545,7 +545,10 @@ u32 nvgpu_ce_create_context(struct gk20a *g, nvgpu_err(g, "ce: gk20a channel not available"); goto end; } + +#ifdef NVGPU_CHANNEL_WDT ce_ctx->ch->wdt.enabled = false; +#endif /* bind the channel to the vm */ err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch); diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c index 198a19e29..5f9f30fc2 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c @@ -724,10 +724,12 @@ struct nvgpu_channel *gk20a_open_new_channel(struct gk20a *g, ch->ctxsw_timeout_debug_dump = true; ch->unserviceable = false; +#ifdef NVGPU_CHANNEL_WDT /* init kernel watchdog timeout */ ch->wdt.enabled = true; ch->wdt.limit_ms = g->ch_wdt_init_limit_ms; ch->wdt.debug_dump = true; +#endif ch->obj_class = 0; 
ch->subctx_id = 0; @@ -1220,7 +1222,7 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c, u32 gpfifo_size, gpfifo_entry_size; u64 gpfifo_gpu_va; int err = 0; - u64 pbdma_acquire_timeout; + u64 pbdma_acquire_timeout = 0ULL; gpfifo_size = args->num_gpfifo_entries; gpfifo_entry_size = nvgpu_get_gpfifo_entry_size(); @@ -1332,11 +1334,15 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c, } } - if (!nvgpu_is_timeouts_enabled(c->g) || !c->wdt.enabled) { - pbdma_acquire_timeout = 0; - } else { +#ifdef NVGPU_CHANNEL_WDT + if (c->wdt.enabled && nvgpu_is_timeouts_enabled(c->g)) { pbdma_acquire_timeout = c->wdt.limit_ms; } +#else + if (nvgpu_is_timeouts_enabled(c->g)) { + pbdma_acquire_timeout = g->ch_wdt_init_limit_ms; + } +#endif err = g->ops.ramfc.setup(c, gpfifo_gpu_va, c->gpfifo.entry_num, pbdma_acquire_timeout, @@ -1510,6 +1515,8 @@ u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch) return nvgpu_channel_get_gpfifo_free_count(ch); } +#ifdef NVGPU_CHANNEL_WDT + static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch) { struct gk20a *g = ch->g; @@ -1755,6 +1762,8 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g) } } +#endif + static inline struct nvgpu_channel_worker * nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker) { @@ -1762,6 +1771,7 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker) ((uintptr_t)worker - offsetof(struct nvgpu_channel_worker, worker)); }; +#ifdef NVGPU_CHANNEL_WDT static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker) { @@ -1797,6 +1807,18 @@ static void nvgpu_channel_worker_poll_wakeup_post_process_item( } } } + +static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout( + struct nvgpu_worker *worker) +{ + struct nvgpu_channel_worker *ch_worker = + nvgpu_channel_worker_from_worker(worker); + + return ch_worker->watchdog_interval; +} + +#endif + static void nvgpu_channel_worker_poll_wakeup_process_item( struct nvgpu_list_node *work_item) { @@ -1812,25 
+1834,18 @@ static void nvgpu_channel_worker_poll_wakeup_process_item( nvgpu_channel_put(ch); } -static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout( - struct nvgpu_worker *worker) -{ - struct nvgpu_channel_worker *ch_worker = - nvgpu_channel_worker_from_worker(worker); - - return ch_worker->watchdog_interval; -} - static const struct nvgpu_worker_ops channel_worker_ops = { +#ifdef NVGPU_CHANNEL_WDT .pre_process = nvgpu_channel_worker_poll_init, - .wakeup_early_exit = NULL, .wakeup_post_process = nvgpu_channel_worker_poll_wakeup_post_process_item, + .wakeup_timeout = + nvgpu_channel_worker_poll_wakeup_condition_get_timeout, +#endif + .wakeup_early_exit = NULL, .wakeup_process_item = nvgpu_channel_worker_poll_wakeup_process_item, .wakeup_condition = NULL, - .wakeup_timeout = - nvgpu_channel_worker_poll_wakeup_condition_get_timeout, }; /** @@ -1938,7 +1953,9 @@ int gk20a_channel_add_job(struct nvgpu_channel *c, job->num_mapped_buffers = num_mapped_buffers; job->mapped_buffers = mapped_buffers; +#ifdef NVGPU_CHANNEL_WDT nvgpu_channel_wdt_start(c); +#endif if (!pre_alloc_enabled) { channel_gk20a_joblist_lock(c); @@ -1985,7 +2002,9 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c, struct nvgpu_channel_job *job; struct gk20a *g; bool job_finished = false; +#ifdef NVGPU_CHANNEL_WDT bool watchdog_on = false; +#endif c = nvgpu_channel_get(c); if (c == NULL) { @@ -2000,6 +2019,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c, vm = c->vm; g = c->g; +#ifdef NVGPU_CHANNEL_WDT /* * If !clean_all, we're in a condition where watchdog isn't supported * anyway (this would be a no-op). @@ -2007,6 +2027,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c, if (clean_all) { watchdog_on = nvgpu_channel_wdt_stop(c); } +#endif /* Synchronize with abort cleanup that needs the jobs. 
*/ nvgpu_mutex_acquire(&c->joblist.cleanup_lock); @@ -2035,6 +2056,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c, completed = nvgpu_fence_is_expired(job->post_fence); if (!completed) { +#ifdef NVGPU_CHANNEL_WDT /* * The watchdog eventually sees an updated gp_get if * something happened in this loop. A new job can have @@ -2045,6 +2067,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c, if (clean_all && watchdog_on) { nvgpu_channel_wdt_continue(c); } +#endif break; } @@ -2298,7 +2321,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) nvgpu_spinlock_init(&c->ref_actions_lock); #endif nvgpu_spinlock_init(&c->joblist.dynamic.lock); +#ifdef NVGPU_CHANNEL_WDT nvgpu_spinlock_init(&c->wdt.lock); +#endif nvgpu_init_list_node(&c->joblist.dynamic.jobs); nvgpu_init_list_node(&c->dbg_s_list); diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c index f7a3deb20..fe33ece16 100644 --- a/drivers/gpu/nvgpu/common/fifo/submit.c +++ b/drivers/gpu/nvgpu/common/fifo/submit.c @@ -402,11 +402,14 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c, */ need_job_tracking = (flag_fence_wait || flag_fence_get || - c->wdt.enabled || (nvgpu_is_enabled(g, NVGPU_CAN_RAILGATE) && !c->deterministic) || !skip_buffer_refcounting); +#ifdef NVGPU_CHANNEL_WDT + need_job_tracking = need_job_tracking || c->wdt.enabled; +#endif + if (need_job_tracking) { bool need_sync_framework = false; @@ -439,9 +442,12 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c, */ need_deferred_cleanup = !c->deterministic || need_sync_framework || - c->wdt.enabled || !skip_buffer_refcounting; +#ifdef NVGPU_CHANNEL_WDT + need_deferred_cleanup = need_deferred_cleanup || c->wdt.enabled; +#endif + /* * For deterministic channels, we don't allow deferred clean_up * processing to occur. 
In cases we hit this, we fail the submit diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c index fc43a772e..d629cdae7 100644 --- a/drivers/gpu/nvgpu/common/rc/rc.c +++ b/drivers/gpu/nvgpu/common/rc/rc.c @@ -62,11 +62,15 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, { nvgpu_tsg_set_error_notifier(g, tsg, NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); + +#ifdef NVGPU_CHANNEL_WDT /* * Cancel all channels' wdt since ctxsw timeout might * trigger multiple watchdogs at a time */ nvgpu_channel_wdt_restart_all_channels(g); +#endif + nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump, RC_TYPE_CTXSW_TIMEOUT); } diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index 5457cd2b7..db5722988 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -211,6 +211,8 @@ struct nvgpu_channel_joblist { struct nvgpu_mutex cleanup_lock; }; +#ifdef NVGPU_CHANNEL_WDT + struct nvgpu_channel_wdt { /* lock protects the running timer state */ struct nvgpu_spinlock lock; @@ -225,6 +227,8 @@ struct nvgpu_channel_wdt { bool debug_dump; }; +#endif + /* * Track refcount actions, saving their stack traces. This number specifies how * many most recent actions are stored in a buffer. Set to 0 to disable. 
128 @@ -318,8 +322,10 @@ struct nvgpu_channel { struct nvgpu_cond notifier_wq; struct nvgpu_cond semaphore_wq; +#ifdef NVGPU_CHANNEL_WDT /* kernel watchdog to kill stuck jobs */ struct nvgpu_channel_wdt wdt; +#endif /* for job cleanup handling in the background worker */ struct nvgpu_list_node worker_item; diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 0ee788727..58639a1df 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -2075,9 +2075,12 @@ struct gk20a { struct nvgpu_ltc *ltc; struct nvgpu_channel_worker { - u32 watchdog_interval; struct nvgpu_worker worker; + +#ifdef NVGPU_CHANNEL_WDT + u32 watchdog_interval; struct nvgpu_timeout timeout; +#endif } channel_worker; struct nvgpu_clk_arb_worker { diff --git a/drivers/gpu/nvgpu/os/linux/cde.c b/drivers/gpu/nvgpu/os/linux/cde.c index a58881a4b..8588e7544 100644 --- a/drivers/gpu/nvgpu/os/linux/cde.c +++ b/drivers/gpu/nvgpu/os/linux/cde.c @@ -1338,7 +1338,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) goto err_get_gk20a_channel; } +#ifdef NVGPU_CHANNEL_WDT ch->wdt.enabled = false; +#endif /* bind the channel to the vm */ err = g->ops.mm.vm_bind_channel(g->mm.cde.vm, ch); diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c index 417839a7b..17d72ba8e 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c @@ -285,6 +285,7 @@ int gk20a_channel_free_cycle_stats_snapshot(struct nvgpu_channel *ch) static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch, struct nvgpu_channel_wdt_args *args) { +#ifdef NVGPU_CHANNEL_WDT u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT | NVGPU_IOCTL_CHANNEL_ENABLE_WDT); @@ -302,6 +303,9 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch, NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0; return 0; +#else + return -EINVAL; +#endif } static 
void gk20a_channel_free_error_notifiers(struct nvgpu_channel *ch)