diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c index ecc5606a8..709fc5fdc 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c @@ -2296,6 +2296,25 @@ void nvgpu_channel_set_error_notifier(struct gk20a *g, struct nvgpu_channel *ch, g->ops.channel.set_error_notifier(ch, error_notifier); } +#ifndef CONFIG_NVGPU_RECOVERY +void nvgpu_channel_sw_quiesce(struct gk20a *g) +{ + struct nvgpu_fifo *f = &g->fifo; + struct nvgpu_channel *ch; + u32 chid; + + for (chid = 0; chid < f->num_channels; chid++) { + ch = nvgpu_channel_get(&f->channel[chid]); + if (ch != NULL) { + nvgpu_channel_set_error_notifier(g, ch, + NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); + nvgpu_channel_set_has_timedout_and_wakeup_wqs(g, ch); + nvgpu_channel_put(ch); + } + } +} +#endif + /* * Stop deterministic channel activity for do_idle() when power needs to go off * momentarily but deterministic channels keep power refs for potentially a diff --git a/drivers/gpu/nvgpu/common/fifo/fifo.c b/drivers/gpu/nvgpu/common/fifo/fifo.c index 5a4c19441..dc9582dee 100644 --- a/drivers/gpu/nvgpu/common/fifo/fifo.c +++ b/drivers/gpu/nvgpu/common/fifo/fifo.c @@ -263,3 +263,24 @@ int nvgpu_fifo_suspend(struct gk20a *g) nvgpu_log_fn(g, "done"); return 0; } + +#ifndef CONFIG_NVGPU_RECOVERY +void nvgpu_fifo_sw_quiesce(struct gk20a *g) +{ + u32 runlist_mask; + + nvgpu_runlist_lock_active_runlists(g); + + /* Disable all runlists */ + runlist_mask = nvgpu_runlist_get_runlists_mask(g, + 0U, ID_TYPE_UNKNOWN, 0U, 0U); + g->ops.runlist.write_state(g, runlist_mask, RUNLIST_DISABLED); + + /* Preempt all runlists (runlist->reset_eng_bitmask will be ignored)*/ + g->ops.fifo.preempt_runlists_for_rc(g, runlist_mask); + + nvgpu_channel_sw_quiesce(g); + + nvgpu_runlist_unlock_active_runlists(g); +} +#endif diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c index c5e4b71b3..587a3f311 100644 --- 
a/drivers/gpu/nvgpu/common/init/nvgpu_init.c +++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c @@ -73,6 +73,91 @@ static void gk20a_mask_interrupts(struct gk20a *g) } } +#ifndef CONFIG_NVGPU_RECOVERY +static int nvgpu_sw_quiesce_thread(void *data) +{ + struct gk20a *g = data; + int err = 0; + + /* wait until SW quiesce is requested */ + NVGPU_COND_WAIT(&g->sw_quiesce_cond, + g->sw_quiesce_pending || + nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U); + + if (nvgpu_thread_should_stop(&g->sw_quiesce_thread)) { + goto done; + } + + nvgpu_err(g, "sw quiesce in progress"); + + nvgpu_mutex_acquire(&g->power_lock); + + if (!g->power_on || g->is_virtual) { + err = -EINVAL; + goto idle; + } + + nvgpu_start_gpu_idle(g); + nvgpu_disable_irqs(g); + gk20a_mask_interrupts(g); + nvgpu_fifo_sw_quiesce(g); + +idle: + nvgpu_mutex_release(&g->power_lock); + nvgpu_err(g, "sw quiesce done, err=%d", err); + +done: + nvgpu_log_info(g, "done"); + return err; +} + +static int nvgpu_sw_quiesce_init_support(struct gk20a *g) +{ + int err; + + nvgpu_cond_init(&g->sw_quiesce_cond); + g->sw_quiesce_pending = false; + + err = nvgpu_thread_create(&g->sw_quiesce_thread, g, + nvgpu_sw_quiesce_thread, "sw-quiesce"); + if (err != 0) { + return err; + } + + return 0; +} + +static void nvgpu_sw_quiesce_remove_support(struct gk20a *g) +{ + nvgpu_thread_stop(&g->sw_quiesce_thread); + nvgpu_cond_destroy(&g->sw_quiesce_cond); +} +#endif + +void nvgpu_sw_quiesce(struct gk20a *g) +{ +#ifndef CONFIG_NVGPU_RECOVERY + if (g->is_virtual) { + goto fail; + } + + nvgpu_err(g, "SW quiesce requested"); + + /* + * When this flag is set, interrupt handlers should + * exit after masking interrupts. This should mitigate + * interrupt storm cases.
+ */ + g->sw_quiesce_pending = true; + + nvgpu_cond_signal(&g->sw_quiesce_cond); + return; + +fail: +#endif + nvgpu_err(g, "sw quiesce not supported"); +} + int nvgpu_prepare_poweroff(struct gk20a *g) { int tmp_ret, ret = 0; @@ -164,6 +249,14 @@ int nvgpu_finalize_poweron(struct gk20a *g) g->power_on = true; +#ifndef CONFIG_NVGPU_RECOVERY + err = nvgpu_sw_quiesce_init_support(g); + if (err != 0) { + nvgpu_err(g, "failed to init sw-quiesce support"); + goto done; + } +#endif + #ifdef CONFIG_NVGPU_DGPU /* * Before probing the GPU make sure the GPU's state is cleared. This is @@ -668,6 +761,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount) g->ops.ltc.ltc_remove_support(g); } +#ifndef CONFIG_NVGPU_RECOVERY + nvgpu_sw_quiesce_remove_support(g); +#endif + if (g->gfree != NULL) { g->gfree(g); } diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index f2dc1e13f..b53e5d1e9 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -754,6 +754,21 @@ int nvgpu_channel_setup_sw(struct gk20a *g); */ void nvgpu_channel_cleanup_sw(struct gk20a *g); +#ifndef CONFIG_NVGPU_RECOVERY +/** + * @brief Emergency quiescing of channels + * + * @param g[in] Pointer to GPU driver struct. + * + * Driver has encountered uncorrectable error, and is entering + * SW Quiesce state. 
For each channel: + * - set error notifier + * - mark channel as unserviceable + * - signal on wait queues (notify_wq and semaphore_wq) + */ +void nvgpu_channel_sw_quiesce(struct gk20a *g); +#endif + /** * @brief Close channel * diff --git a/drivers/gpu/nvgpu/include/nvgpu/fifo.h b/drivers/gpu/nvgpu/include/nvgpu/fifo.h index 9e3073301..b17ba9f6a 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/fifo.h +++ b/drivers/gpu/nvgpu/include/nvgpu/fifo.h @@ -325,5 +325,8 @@ void nvgpu_fifo_cleanup_sw_common(struct gk20a *g); const char *nvgpu_fifo_decode_pbdma_ch_eng_status(u32 index); int nvgpu_fifo_suspend(struct gk20a *g); +#ifndef CONFIG_NVGPU_RECOVERY +void nvgpu_fifo_sw_quiesce(struct gk20a *g); +#endif #endif /* NVGPU_FIFO_COMMON_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index d822c13a5..f24b2649e 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -2000,6 +2000,12 @@ struct gk20a { bool suspended; bool sw_ready; +#ifndef CONFIG_NVGPU_RECOVERY + bool sw_quiesce_pending; + struct nvgpu_cond sw_quiesce_cond; + struct nvgpu_thread sw_quiesce_thread; +#endif + u64 log_mask; u32 log_trace; diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h index 46502bf2e..ca20af9fa 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h @@ -23,6 +23,9 @@ #ifndef NVGPU_INIT_H #define NVGPU_INIT_H +struct gk20a; +struct nvgpu_ref; + /** * @file * @page unit-init Unit Init @@ -99,6 +102,49 @@ int nvgpu_finalize_poweron(struct gk20a *g); */ int nvgpu_prepare_poweroff(struct gk20a *g); +/** + * @brief Enter SW Quiesce state + * + * @param g [in] The GPU + * + * Enters SW quiesce state: + * - set sw_quiesce_pending: When this flag is set, interrupt + * handlers exit after masking interrupts. This should help mitigate + * an interrupt storm. + * - wake up thread to complete quiescing. 
+ * + * The thread performs the following: + * - set NVGPU_DRIVER_IS_DYING to prevent allocation of new resources + * - disable interrupts + * - disable fifo scheduling + * - preempt all runlists + * - set error notifier for all active channels + * + * @note: For channels with usermode submit enabled, userspace can + * still ring doorbell, but this will not trigger any work on + * engines since fifo scheduling is disabled. + */ +void nvgpu_sw_quiesce(struct gk20a *g); + +/** + * @brief Start GPU idle + * + * @param g [in] The GPU + * + * Set #NVGPU_DRIVER_IS_DYING to prevent allocation of new resources. + * User API call will fail once this flag is set, as gk20a_busy will fail. + */ +void nvgpu_start_gpu_idle(struct gk20a *g); + +/** + * @brief Disable interrupt handlers + * + * @param g [in] The GPU + * + * Disable interrupt handlers. + */ +void nvgpu_disable_irqs(struct gk20a *g); + /** * @brief Check if the device can go busy * diff --git a/drivers/gpu/nvgpu/libnvgpu-drv_safe.export b/drivers/gpu/nvgpu/libnvgpu-drv_safe.export index 25603204e..8c46709df 100644 --- a/drivers/gpu/nvgpu/libnvgpu-drv_safe.export +++ b/drivers/gpu/nvgpu/libnvgpu-drv_safe.export @@ -254,6 +254,7 @@ nvgpu_sgt_ipa_to_pa nvgpu_spinlock_acquire nvgpu_spinlock_init nvgpu_spinlock_release +nvgpu_sw_quiesce nvgpu_userd_init_slabs nvgpu_usermode_writel nvgpu_vfree_impl diff --git a/drivers/gpu/nvgpu/os/linux/intr.c b/drivers/gpu/nvgpu/os/linux/intr.c index 9ca5d4307..609aec057 100644 --- a/drivers/gpu/nvgpu/os/linux/intr.c +++ b/drivers/gpu/nvgpu/os/linux/intr.c @@ -20,6 +20,9 @@ #include #include +#ifndef CONFIG_NVGPU_RECOVERY +#include +#endif #include "os_linux.h" irqreturn_t nvgpu_intr_stall(struct gk20a *g) @@ -39,6 +42,11 @@ irqreturn_t nvgpu_intr_stall(struct gk20a *g) return IRQ_NONE; g->ops.mc.intr_stall_pause(g); +#ifndef CONFIG_NVGPU_RECOVERY + if (g->sw_quiesce_pending) { + return IRQ_NONE; + } +#endif nvgpu_atomic_inc(&g->hw_irq_stall_count); @@ -90,6 +98,11 @@ irqreturn_t 
nvgpu_intr_nonstall(struct gk20a *g) return IRQ_NONE; g->ops.mc.intr_nonstall_pause(g); +#ifndef CONFIG_NVGPU_RECOVERY + if (g->sw_quiesce_pending) { + return IRQ_NONE; + } +#endif ops = g->ops.mc.isr_nonstall(g); if (ops) { diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index 22593f817..061c7e3d7 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -534,6 +534,16 @@ static int gk20a_lockout_registers(struct gk20a *g) return 0; } +void nvgpu_disable_irqs(struct gk20a *g) +{ + if (g->irqs_enabled) { + disable_irq(g->irq_stall); + if (g->irq_stall != g->irq_nonstall) + disable_irq(g->irq_nonstall); + g->irqs_enabled = 0; + } +} + static int gk20a_pm_prepare_poweroff(struct device *dev) { struct gk20a *g = get_gk20a(dev); @@ -553,12 +563,7 @@ static int gk20a_pm_prepare_poweroff(struct device *dev) /* disable IRQs and wait for completion */ irqs_enabled = g->irqs_enabled; - if (irqs_enabled) { - disable_irq(g->irq_stall); - if (g->irq_stall != g->irq_nonstall) - disable_irq(g->irq_nonstall); - g->irqs_enabled = 0; - } + nvgpu_disable_irqs(g); gk20a_scale_suspend(dev); @@ -1319,21 +1324,19 @@ static int gk20a_pm_deinit(struct device *dev) return 0; } -int nvgpu_start_gpu_idle(struct gk20a *g) +void nvgpu_start_gpu_idle(struct gk20a *g) { struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); down_write(&l->busy_lock); - - /* - * Set NVGPU_DRIVER_IS_DYING to avoid gpu being marked - * busy to submit new work to gpu. 
- */ nvgpu_set_enabled(g, NVGPU_DRIVER_IS_DYING, true); - + /* + * GR SW ready needs to be invalidated at this time with the busy lock + * held to prevent a race condition on the gr/mm code + */ + nvgpu_gr_sw_ready(g, false); + g->sw_ready = false; up_write(&l->busy_lock); - - return 0; } int nvgpu_wait_for_gpu_idle(struct gk20a *g) @@ -1360,13 +1363,7 @@ void gk20a_driver_start_unload(struct gk20a *g) nvgpu_log(g, gpu_dbg_shutdown, "Driver is now going down!\n"); - down_write(&l->busy_lock); - nvgpu_set_enabled(g, NVGPU_DRIVER_IS_DYING, true); - /* GR SW ready needs to be invalidated at this time with the busy lock - * held to prevent a racing condition on the gr/mm code */ - nvgpu_gr_sw_ready(g, false); - g->sw_ready = false; - up_write(&l->busy_lock); + nvgpu_start_gpu_idle(g); if (g->is_virtual) return; diff --git a/drivers/gpu/nvgpu/os/linux/module.h b/drivers/gpu/nvgpu/os/linux/module.h index 218d33044..83c5bbf52 100644 --- a/drivers/gpu/nvgpu/os/linux/module.h +++ b/drivers/gpu/nvgpu/os/linux/module.h @@ -24,7 +24,6 @@ void gk20a_remove_support(struct gk20a *g); void gk20a_driver_start_unload(struct gk20a *g); int nvgpu_quiesce(struct gk20a *g); int nvgpu_remove(struct device *dev, struct class *class); -int nvgpu_start_gpu_idle(struct gk20a *g); int nvgpu_wait_for_gpu_idle(struct gk20a *g); void nvgpu_free_irq(struct gk20a *g); struct device_node *nvgpu_get_node(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/os/linux/pci_power.c b/drivers/gpu/nvgpu/os/linux/pci_power.c index a8ebc55d0..ba48cfade 100644 --- a/drivers/gpu/nvgpu/os/linux/pci_power.c +++ b/drivers/gpu/nvgpu/os/linux/pci_power.c @@ -24,6 +24,7 @@ #include #include +#include #include "module.h" #include "platform_gk20a.h" @@ -527,11 +528,7 @@ static int nvgpu_pci_gpu_power_off(char *dev_name) g = get_gk20a(dev); pgpios = &pp->gpios; - ret = nvgpu_start_gpu_idle(g); - if (ret) { - pr_err("nvgpu: start gpu idle failed\n"); - goto out; - } + nvgpu_start_gpu_idle(g); ret =
nvgpu_wait_for_gpu_idle(g); if (ret) { diff --git a/drivers/gpu/nvgpu/os/posix/nvgpu.c b/drivers/gpu/nvgpu/os/posix/nvgpu.c index 94ba24a42..b67924f24 100644 --- a/drivers/gpu/nvgpu/os/posix/nvgpu.c +++ b/drivers/gpu/nvgpu/os/posix/nvgpu.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,15 @@ void nvgpu_kernel_restart(void *cmd) BUG(); } +void nvgpu_start_gpu_idle(struct gk20a *g) +{ + nvgpu_set_enabled(g, NVGPU_DRIVER_IS_DYING, true); +} + +void nvgpu_disable_irqs(struct gk20a *g) +{ +} + /* * We have no runtime PM stuff in userspace so these are really just noops. */