diff --git a/drivers/gpu/nvgpu/common/ce/ce_app.c b/drivers/gpu/nvgpu/common/ce/ce_app.c index 47badc0e2..5897e605b 100644 --- a/drivers/gpu/nvgpu/common/ce/ce_app.c +++ b/drivers/gpu/nvgpu/common/ce/ce_app.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -535,7 +536,7 @@ u32 nvgpu_ce_app_create_context(struct gk20a *g, } #ifdef CONFIG_NVGPU_CHANNEL_WDT - ce_ctx->ch->wdt.enabled = false; + nvgpu_channel_wdt_disable(ce_ctx->ch->wdt); #endif /* bind the channel to the vm */ diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c index 9e3a6a9c7..eae1f6b4b 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c @@ -490,7 +490,7 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g) if (ch != NULL) { if (!nvgpu_channel_check_unserviceable(ch)) { - nvgpu_channel_wdt_check(ch); + nvgpu_channel_wdt_check(ch->wdt, ch); } nvgpu_channel_put(ch); } @@ -641,7 +641,7 @@ int nvgpu_channel_add_job(struct nvgpu_channel *c, job->mapped_buffers = mapped_buffers; #ifdef CONFIG_NVGPU_CHANNEL_WDT - nvgpu_channel_wdt_start(c); + nvgpu_channel_wdt_start(c->wdt, c); #endif if (!pre_alloc_enabled) { @@ -706,7 +706,7 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c, * anyway (this would be a no-op). */ if (clean_all) { - watchdog_on = nvgpu_channel_wdt_stop(c); + watchdog_on = nvgpu_channel_wdt_stop(c->wdt); } #endif @@ -746,7 +746,7 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c, * later timeout is still used. */ if (clean_all && watchdog_on) { - nvgpu_channel_wdt_continue(c); + nvgpu_channel_wdt_continue(c->wdt); } #endif break; @@ -1202,6 +1202,11 @@ unbind: g->ops.channel.unbind(ch); g->ops.channel.free_inst(g, ch); +#ifdef CONFIG_NVGPU_CHANNEL_WDT + nvgpu_channel_wdt_destroy(ch->wdt); + ch->wdt = NULL; +#endif + #ifdef CONFIG_NVGPU_DETERMINISTIC_CHANNELS channel_free_put_deterministic_ref_from_init(ch); #endif @@ -1459,10 +1464,7 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6)) ch->unserviceable = true; #ifdef CONFIG_NVGPU_CHANNEL_WDT - /* init kernel watchdog timeout */ - ch->wdt.enabled = true; - ch->wdt.limit_ms = g->ch_wdt_init_limit_ms; - ch->wdt.debug_dump = true; + ch->wdt = nvgpu_channel_wdt_alloc(ch); #endif ch->obj_class = 0; @@ -1513,8 +1515,9 @@ static int channel_setup_ramfc(struct nvgpu_channel *c, struct gk20a *g = c->g; #ifdef CONFIG_NVGPU_CHANNEL_WDT - if (c->wdt.enabled && nvgpu_is_timeouts_enabled(c->g)) { - pbdma_acquire_timeout = c->wdt.limit_ms; + if (nvgpu_channel_wdt_enabled(c->wdt) && + nvgpu_is_timeouts_enabled(c->g)) { + pbdma_acquire_timeout = nvgpu_channel_wdt_limit(c->wdt); } #endif @@ -1930,9 +1933,6 @@ int nvgpu_channel_init_support(struct gk20a *g, u32 chid) nvgpu_spinlock_init(&c->ref_actions_lock); #endif #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT -#ifdef CONFIG_NVGPU_CHANNEL_WDT - nvgpu_spinlock_init(&c->wdt.lock); -#endif nvgpu_spinlock_init(&c->joblist.dynamic.lock); nvgpu_init_list_node(&c->joblist.dynamic.jobs); nvgpu_init_list_node(&c->worker_item); diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c index e153982c3..3000acdbb 100644 --- a/drivers/gpu/nvgpu/common/fifo/submit.c +++ b/drivers/gpu/nvgpu/common/fifo/submit.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -551,7 +552,7 @@ static int nvgpu_submit_deterministic(struct nvgpu_channel *c, #ifdef CONFIG_NVGPU_CHANNEL_WDT /* the watchdog needs periodic job cleanup */ - if (c->wdt.enabled) { + if (nvgpu_channel_wdt_enabled(c->wdt)) { return -EINVAL; } #endif @@ -666,7 +667,8 @@ static int nvgpu_submit_nondeterministic(struct nvgpu_channel *c, !skip_buffer_refcounting); #ifdef CONFIG_NVGPU_CHANNEL_WDT - need_job_tracking = need_job_tracking || c->wdt.enabled; + need_job_tracking = need_job_tracking || + nvgpu_channel_wdt_enabled(c->wdt); #endif if (need_job_tracking) { diff --git a/drivers/gpu/nvgpu/common/fifo/watchdog.c b/drivers/gpu/nvgpu/common/fifo/watchdog.c index bb59218dc..3fe0328a2 100644 --- a/drivers/gpu/nvgpu/common/fifo/watchdog.c +++ b/drivers/gpu/nvgpu/common/fifo/watchdog.c @@ -22,30 +22,101 @@ #include #include +#include #include #include -static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch) +struct nvgpu_channel_wdt { + struct gk20a *g; + + /* lock protects the running timer state */ + struct nvgpu_spinlock lock; + struct nvgpu_timeout timer; + bool running; + u32 gp_get; + u64 pb_get; + + /* lock not needed */ + u32 limit_ms; + bool enabled; + bool debug_dump; +}; + +struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch) { struct gk20a *g = ch->g; + struct nvgpu_channel_wdt *wdt = nvgpu_kzalloc(g, sizeof(*wdt)); + + if (wdt == NULL) { + return NULL; + } + + wdt->g = g; + nvgpu_spinlock_init(&wdt->lock); + wdt->enabled = true; + wdt->limit_ms = g->ch_wdt_init_limit_ms; + wdt->debug_dump = true; + + return wdt; +} + +void nvgpu_channel_wdt_destroy(struct nvgpu_channel_wdt *wdt) +{ + nvgpu_kfree(wdt->g, wdt); +} + +void nvgpu_channel_wdt_enable(struct nvgpu_channel_wdt *wdt) +{ + wdt->enabled = true; +} + +void nvgpu_channel_wdt_disable(struct nvgpu_channel_wdt *wdt) +{ + wdt->enabled = false; +} + +bool nvgpu_channel_wdt_enabled(struct nvgpu_channel_wdt *wdt) +{ + return wdt->enabled; +} + +void nvgpu_channel_wdt_set_limit(struct nvgpu_channel_wdt *wdt, u32 limit_ms) +{ + wdt->limit_ms = limit_ms; +} + +u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt) +{ + return wdt->limit_ms; +} + +void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump) +{ + wdt->debug_dump = dump; +} + +static void nvgpu_channel_wdt_init(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch) +{ + struct gk20a *g = wdt->g; int ret; if (nvgpu_channel_check_unserviceable(ch)) { - ch->wdt.running = false; + wdt->running = false; return; } - ret = nvgpu_timeout_init(g, &ch->wdt.timer, - ch->wdt.limit_ms, + ret = nvgpu_timeout_init(g, &wdt->timer, + wdt->limit_ms, NVGPU_TIMER_CPU_TIMER); if (ret != 0) { nvgpu_err(g, "timeout_init failed: %d", ret); return; } - ch->wdt.gp_get = g->ops.userd.gp_get(g, ch); - ch->wdt.pb_get = g->ops.userd.pb_get(g, ch); - ch->wdt.running = true; + wdt->gp_get = g->ops.userd.gp_get(g, ch); + wdt->pb_get = g->ops.userd.pb_get(g, ch); + wdt->running = true; } /** @@ -63,24 +134,25 @@ static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch) * actually stuck at that time. After the timeout duration has expired, a * worker thread will consider the channel stuck and recover it if stuck. */ -void nvgpu_channel_wdt_start(struct nvgpu_channel *ch) +void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch) { - if (!nvgpu_is_timeouts_enabled(ch->g)) { + if (!nvgpu_is_timeouts_enabled(wdt->g)) { return; } - if (!ch->wdt.enabled) { + if (!wdt->enabled) { return; } - nvgpu_spinlock_acquire(&ch->wdt.lock); + nvgpu_spinlock_acquire(&wdt->lock); - if (ch->wdt.running) { - nvgpu_spinlock_release(&ch->wdt.lock); + if (wdt->running) { + nvgpu_spinlock_release(&wdt->lock); return; } - nvgpu_channel_wdt_init(ch); - nvgpu_spinlock_release(&ch->wdt.lock); + nvgpu_channel_wdt_init(wdt, ch); + nvgpu_spinlock_release(&wdt->lock); } /** @@ -94,14 +166,14 @@ void nvgpu_channel_wdt_start(struct nvgpu_channel *ch) * (This should be called from an update handler running in the same thread * with the watchdog.) */ -bool nvgpu_channel_wdt_stop(struct nvgpu_channel *ch) +bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt) { bool was_running; - nvgpu_spinlock_acquire(&ch->wdt.lock); - was_running = ch->wdt.running; - ch->wdt.running = false; - nvgpu_spinlock_release(&ch->wdt.lock); + nvgpu_spinlock_acquire(&wdt->lock); + was_running = wdt->running; + wdt->running = false; + nvgpu_spinlock_release(&wdt->lock); return was_running; } @@ -114,11 +186,11 @@ bool nvgpu_channel_wdt_stop(struct nvgpu_channel *ch) * (This should be called from an update handler running in the same thread * with the watchdog.) */ -void nvgpu_channel_wdt_continue(struct nvgpu_channel *ch) +void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) { - nvgpu_spinlock_acquire(&ch->wdt.lock); - ch->wdt.running = true; - nvgpu_spinlock_release(&ch->wdt.lock); + nvgpu_spinlock_acquire(&wdt->lock); + wdt->running = true; + nvgpu_spinlock_release(&wdt->lock); } /** @@ -131,13 +203,14 @@ void nvgpu_channel_wdt_continue(struct nvgpu_channel *ch) * timeouts. Stopped timeouts can only be started (which is technically a * rewind too) or continued (where the stop is actually pause). */ -static void nvgpu_channel_wdt_rewind(struct nvgpu_channel *ch) +static void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch) { - nvgpu_spinlock_acquire(&ch->wdt.lock); - if (ch->wdt.running) { - nvgpu_channel_wdt_init(ch); + nvgpu_spinlock_acquire(&wdt->lock); + if (wdt->running) { + nvgpu_channel_wdt_init(wdt, ch); } - nvgpu_spinlock_release(&ch->wdt.lock); + nvgpu_spinlock_release(&wdt->lock); } /** @@ -158,7 +231,7 @@ void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g) if (ch != NULL) { if (!nvgpu_channel_check_unserviceable(ch)) { - nvgpu_channel_wdt_rewind(ch); + nvgpu_channel_wdt_rewind(ch->wdt, ch); } nvgpu_channel_put(ch); } @@ -175,9 +248,10 @@ void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g) * The gpu is implicitly on at this point, because the watchdog can only run on * channels that have submitted jobs pending for cleanup. */ -static void nvgpu_channel_wdt_handler(struct nvgpu_channel *ch) +static void nvgpu_channel_wdt_handler(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch) { - struct gk20a *g = ch->g; + struct gk20a *g = wdt->g; u32 gp_get; u32 new_gp_get; u64 pb_get; @@ -187,7 +261,7 @@ static void nvgpu_channel_wdt_handler(struct nvgpu_channel *ch) if (nvgpu_channel_check_unserviceable(ch)) { /* channel is already recovered */ - if (nvgpu_channel_wdt_stop(ch) == true) { + if (nvgpu_channel_wdt_stop(wdt) == true) { nvgpu_info(g, "chid: %d unserviceable but wdt was ON", ch->chid); } @@ -195,32 +269,31 @@ static void nvgpu_channel_wdt_handler(struct nvgpu_channel *ch) } /* Get status but keep timer running */ - nvgpu_spinlock_acquire(&ch->wdt.lock); - gp_get = ch->wdt.gp_get; - pb_get = ch->wdt.pb_get; - nvgpu_spinlock_release(&ch->wdt.lock); + nvgpu_spinlock_acquire(&wdt->lock); + gp_get = wdt->gp_get; + pb_get = wdt->pb_get; + nvgpu_spinlock_release(&wdt->lock); new_gp_get = g->ops.userd.gp_get(g, ch); new_pb_get = g->ops.userd.pb_get(g, ch); if (new_gp_get != gp_get || new_pb_get != pb_get) { /* Channel has advanced, timer keeps going but resets */ - nvgpu_channel_wdt_rewind(ch); - } else if (!nvgpu_timeout_peek_expired(&ch->wdt.timer)) { + nvgpu_channel_wdt_rewind(wdt, ch); + } else if (!nvgpu_timeout_peek_expired(&wdt->timer)) { /* Seems stuck but waiting to time out */ } else { - nvgpu_err(g, "Job on channel %d timed out", - ch->chid); + nvgpu_err(g, "Job on channel %d timed out", ch->chid); /* force reset calls gk20a_debug_dump but not this */ - if (ch->wdt.debug_dump) { + if (wdt->debug_dump) { gk20a_gr_debug_dump(g); } #ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL if (g->ops.tsg.force_reset(ch, - NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, - ch->wdt.debug_dump) != 0) { + NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, + wdt->debug_dump) != 0) { nvgpu_err(g, "failed tsg force reset for chid: %d", ch->chid); } @@ -239,15 +312,16 @@ static void nvgpu_channel_wdt_handler(struct nvgpu_channel *ch) * The timeout is stopped (disabled) after the last job in a row finishes * and marks the channel idle. */ -void nvgpu_channel_wdt_check(struct nvgpu_channel *ch) +void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch) { bool running; - nvgpu_spinlock_acquire(&ch->wdt.lock); - running = ch->wdt.running; - nvgpu_spinlock_release(&ch->wdt.lock); + nvgpu_spinlock_acquire(&wdt->lock); + running = wdt->running; + nvgpu_spinlock_release(&wdt->lock); if (running) { - nvgpu_channel_wdt_handler(ch); + nvgpu_channel_wdt_handler(wdt, ch); } } diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index 998d83e9b..7c7f65388 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -275,24 +275,6 @@ struct nvgpu_channel_joblist { struct nvgpu_mutex cleanup_lock; }; -#ifdef CONFIG_NVGPU_CHANNEL_WDT - -struct nvgpu_channel_wdt { - /* lock protects the running timer state */ - struct nvgpu_spinlock lock; - struct nvgpu_timeout timer; - bool running; - u32 gp_get; - u64 pb_get; - - /* lock not needed */ - u32 limit_ms; - bool enabled; - bool debug_dump; -}; - -#endif - /** * Track refcount actions, saving their stack traces. This number specifies how * many most recent actions are stored in a buffer. Set to 0 to disable. 128 @@ -398,7 +380,7 @@ struct nvgpu_channel { #ifdef CONFIG_NVGPU_CHANNEL_WDT /* kernel watchdog to kill stuck jobs */ - struct nvgpu_channel_wdt wdt; + struct nvgpu_channel_wdt *wdt; #endif /* CONFIG_NVGPU_CHANNEL_WDT */ #endif /* CONFIG_NVGPU_KERNEL_MODE_SUBMIT */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/watchdog.h b/drivers/gpu/nvgpu/include/nvgpu/watchdog.h index 4b8c52c99..ff002d5d3 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/watchdog.h +++ b/drivers/gpu/nvgpu/include/nvgpu/watchdog.h @@ -23,15 +23,32 @@ #ifndef NVGPU_WATCHDOG_H #define NVGPU_WATCHDOG_H +#ifdef CONFIG_NVGPU_CHANNEL_WDT + struct gk20a; struct nvgpu_channel; struct nvgpu_worker; -void nvgpu_channel_wdt_start(struct nvgpu_channel *ch); -void nvgpu_channel_wdt_continue(struct nvgpu_channel *ch); -bool nvgpu_channel_wdt_stop(struct nvgpu_channel *ch); -void nvgpu_channel_wdt_check(struct nvgpu_channel *ch); +struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch); +void nvgpu_channel_wdt_destroy(struct nvgpu_channel_wdt *wdt); + +void nvgpu_channel_wdt_enable(struct nvgpu_channel_wdt *wdt); +void nvgpu_channel_wdt_disable(struct nvgpu_channel_wdt *wdt); +bool nvgpu_channel_wdt_enabled(struct nvgpu_channel_wdt *wdt); + +void nvgpu_channel_wdt_set_limit(struct nvgpu_channel_wdt *wdt, u32 limit_ms); +u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt); +void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump); + +void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch); +void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt); +bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt); +void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt, + struct nvgpu_channel *ch); void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g); #endif + +#endif diff --git a/drivers/gpu/nvgpu/os/linux/cde.c b/drivers/gpu/nvgpu/os/linux/cde.c index f89cd30a4..320fe41f6 100644 --- a/drivers/gpu/nvgpu/os/linux/cde.c +++ b/drivers/gpu/nvgpu/os/linux/cde.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1343,7 +1344,7 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) } #ifdef CONFIG_NVGPU_CHANNEL_WDT - ch->wdt.enabled = false; + nvgpu_channel_wdt_disable(ch->wdt); #endif /* bind the channel to the vm */ diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c index a805ff428..5d66cc08e 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -294,19 +295,22 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch, #ifdef CONFIG_NVGPU_CHANNEL_WDT u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT | NVGPU_IOCTL_CHANNEL_ENABLE_WDT); + bool set_timeout = (args->wdt_status & + NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT) != 0U; + bool disable_dump = (args->wdt_status & + NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) != 0U; if (status == NVGPU_IOCTL_CHANNEL_DISABLE_WDT) - ch->wdt.enabled = false; + nvgpu_channel_wdt_disable(ch->wdt); else if (status == NVGPU_IOCTL_CHANNEL_ENABLE_WDT) - ch->wdt.enabled = true; + nvgpu_channel_wdt_enable(ch->wdt); else return -EINVAL; - if (args->wdt_status & NVGPU_IOCTL_CHANNEL_WDT_FLAG_SET_TIMEOUT) - ch->wdt.limit_ms = args->timeout_ms; + if (set_timeout) + nvgpu_channel_wdt_set_limit(ch->wdt, args->timeout_ms); - ch->wdt.debug_dump = (args->wdt_status & - NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0; + nvgpu_channel_wdt_set_debug_dump(ch->wdt, !disable_dump); return 0; #else