From 957b19092fccfefca0ed4939f650f66441f08f1e Mon Sep 17 00:00:00 2001 From: tkudav Date: Fri, 29 May 2020 10:57:34 +0530 Subject: [PATCH] gpu: nvgpu: Enable Quiesce on all builds Make Recovery and quiesce co-exist to support quiesce state on unrecoverable errors. Currently, the quiesce code is wrapped under ifndef CONFIG_NVGPU_RECOVERY. Isolate the quiesce code from recovery config, thereby enabling it on all builds. On Linux, the hung_task checker (check_hung_uninterruptible_tasks() in kernel/hung_task.c) complains that the quiesce thread is stuck for more than 120 seconds. INFO: task sw-quiesce:1068 blocked for more than 120 seconds. The wait time of more than 120 seconds is expected as the quiesce thread will wait until a quiesce call is triggered on fatal unrecoverable errors. However, the INFO print upsets the kernel_warning_test (KWT) on Linux builds. To fix the failing KWT, change the quiesce task to interruptible instead of uninterruptible, as the checker only looks at uninterruptible tasks. Bug 2919899 JIRA NVGPU-5479 Change-Id: Ibd1023506859d8371998b785e881ace52cb5f030 Signed-off-by: tkudav Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2342774 Reviewed-by: automaticguardword Reviewed-by: Automatic_Commit_Validation_User Reviewed-by: svc-mobile-coverity Reviewed-by: Deepak Nibade Reviewed-by: Alex Waterman Reviewed-by: mobile promotions GVS: Gerrit_Virtual_Submit Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/fifo/channel.c | 2 - drivers/gpu/nvgpu/common/fifo/fifo.c | 2 - drivers/gpu/nvgpu/common/init/nvgpu_init.c | 74 ++++++++------------ drivers/gpu/nvgpu/include/nvgpu/channel.h | 2 - drivers/gpu/nvgpu/include/nvgpu/fifo.h | 2 - drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 3 +- drivers/gpu/nvgpu/os/linux/intr.c | 4 -- userspace/units/fifo/channel/nvgpu-channel.c | 2 - userspace/units/fifo/fifo/nvgpu-fifo.c | 3 +- 9 files changed, 33 insertions(+), 61 deletions(-) diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c 
b/drivers/gpu/nvgpu/common/fifo/channel.c index eae1f6b4b..d17174ee1 100644 --- a/drivers/gpu/nvgpu/common/fifo/channel.c +++ b/drivers/gpu/nvgpu/common/fifo/channel.c @@ -1762,7 +1762,6 @@ void nvgpu_channel_set_error_notifier(struct gk20a *g, struct nvgpu_channel *ch, g->ops.channel.set_error_notifier(ch, error_notifier); } -#ifndef CONFIG_NVGPU_RECOVERY void nvgpu_channel_sw_quiesce(struct gk20a *g) { struct nvgpu_fifo *f = &g->fifo; @@ -1779,7 +1778,6 @@ void nvgpu_channel_sw_quiesce(struct gk20a *g) } } } -#endif #ifdef CONFIG_NVGPU_DETERMINISTIC_CHANNELS /* diff --git a/drivers/gpu/nvgpu/common/fifo/fifo.c b/drivers/gpu/nvgpu/common/fifo/fifo.c index 2b31eed32..ed83b9db8 100644 --- a/drivers/gpu/nvgpu/common/fifo/fifo.c +++ b/drivers/gpu/nvgpu/common/fifo/fifo.c @@ -281,7 +281,6 @@ int nvgpu_fifo_suspend(struct gk20a *g) return 0; } -#ifndef CONFIG_NVGPU_RECOVERY void nvgpu_fifo_sw_quiesce(struct gk20a *g) { u32 runlist_mask = U32_MAX; @@ -291,4 +290,3 @@ void nvgpu_fifo_sw_quiesce(struct gk20a *g) /* Preempt all runlists */ nvgpu_fifo_preempt_runlists_for_rc(g, runlist_mask); } -#endif diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c index bc810c361..c85e59fe3 100644 --- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c +++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c @@ -70,8 +70,6 @@ static void gk20a_mask_interrupts(struct gk20a *g) #endif } -#ifndef CONFIG_NVGPU_RECOVERY - #define NVGPU_SW_QUIESCE_TIMEOUT_MS 50 static int nvgpu_sw_quiesce_thread(void *data) @@ -79,7 +77,7 @@ static int nvgpu_sw_quiesce_thread(void *data) struct gk20a *g = data; /* wait until SW quiesce is requested */ - NVGPU_COND_WAIT(&g->sw_quiesce_cond, + NVGPU_COND_WAIT_INTERRUPTIBLE(&g->sw_quiesce_cond, g->sw_quiesce_pending || nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U); @@ -105,20 +103,39 @@ static void nvgpu_sw_quiesce_bug_cb(void *arg) nvgpu_sw_quiesce(g); } -#endif + +static void nvgpu_sw_quiesce_thread_stop_fn(void *data) 
+{ + struct gk20a *g = data; + + /* + * If the thread is still waiting on the cond, + * nvgpu_thread_should_stop() will return true, and the thread will + * exit. + */ + nvgpu_cond_signal_interruptible(&g->sw_quiesce_cond); +} + +void nvgpu_sw_quiesce_remove_support(struct gk20a *g) +{ + if (g->sw_quiesce_init_done) { + nvgpu_bug_unregister_cb(&g->sw_quiesce_bug_cb); + nvgpu_thread_stop_graceful(&g->sw_quiesce_thread, + nvgpu_sw_quiesce_thread_stop_fn, + g); + nvgpu_cond_destroy(&g->sw_quiesce_cond); + g->sw_quiesce_init_done = false; + } +} static int nvgpu_sw_quiesce_init_support(struct gk20a *g) { -#ifdef CONFIG_NVGPU_RECOVERY - nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, true); -#else int err; if (g->sw_quiesce_init_done) { return 0; } - nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, false); err = nvgpu_cond_init(&g->sw_quiesce_cond); if (err != 0) { @@ -141,44 +158,18 @@ static int nvgpu_sw_quiesce_init_support(struct gk20a *g) g->sw_quiesce_bug_cb.cb = nvgpu_sw_quiesce_bug_cb; g->sw_quiesce_bug_cb.arg = g; nvgpu_bug_register_cb(&g->sw_quiesce_bug_cb); + +#ifdef CONFIG_NVGPU_RECOVERY + nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, true); +#else + nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, false); #endif return 0; } -#ifndef CONFIG_NVGPU_RECOVERY -static void nvgpu_sw_quiesce_thread_stop_fn(void *data) -{ - struct gk20a *g = data; - - /* - * If the thread is still waiting on the cond, - * nvgpu_thread_should_stop() will return true, and the thread will - * exit. 
- */ - nvgpu_cond_signal(&g->sw_quiesce_cond); -} -#endif - -void nvgpu_sw_quiesce_remove_support(struct gk20a *g) -{ -#ifndef CONFIG_NVGPU_RECOVERY - if (g->sw_quiesce_init_done) { - nvgpu_bug_unregister_cb(&g->sw_quiesce_bug_cb); - nvgpu_thread_stop_graceful(&g->sw_quiesce_thread, - nvgpu_sw_quiesce_thread_stop_fn, - g); - nvgpu_cond_destroy(&g->sw_quiesce_cond); - g->sw_quiesce_init_done = false; - } -#endif -} - void nvgpu_sw_quiesce(struct gk20a *g) { -#ifdef CONFIG_NVGPU_RECOVERY - nvgpu_err(g, "SW quiesce not supported"); -#else if (g->is_virtual || (g->enabled_flags == NULL) || nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) { nvgpu_err(g, "SW quiesce not supported"); @@ -204,10 +195,9 @@ void nvgpu_sw_quiesce(struct gk20a *g) */ g->sw_quiesce_pending = true; - nvgpu_cond_signal(&g->sw_quiesce_cond); + nvgpu_cond_signal_interruptible(&g->sw_quiesce_cond); gk20a_mask_interrupts(g); nvgpu_fifo_sw_quiesce(g); -#endif } /* init interface layer support for all falcons */ @@ -748,11 +738,9 @@ int nvgpu_can_busy(struct gk20a *g) * or the driver is restarting */ -#ifndef CONFIG_NVGPU_RECOVERY if (g->sw_quiesce_pending) { return 0; } -#endif if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) || nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) { diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h index 7c7f65388..a160a4035 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/channel.h +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -696,7 +696,6 @@ int nvgpu_channel_setup_sw(struct gk20a *g); */ void nvgpu_channel_cleanup_sw(struct gk20a *g); -#ifndef CONFIG_NVGPU_RECOVERY /** * @brief Emergency quiescing of channels * @@ -709,7 +708,6 @@ void nvgpu_channel_cleanup_sw(struct gk20a *g); * - signal on wait queues (notify_wq and semaphore_wq) */ void nvgpu_channel_sw_quiesce(struct gk20a *g); -#endif /** * @brief Close channel diff --git a/drivers/gpu/nvgpu/include/nvgpu/fifo.h b/drivers/gpu/nvgpu/include/nvgpu/fifo.h index 
5cdfc2255..a74dc8a26 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/fifo.h +++ b/drivers/gpu/nvgpu/include/nvgpu/fifo.h @@ -509,7 +509,6 @@ const char *nvgpu_fifo_decode_pbdma_ch_eng_status(u32 index); */ int nvgpu_fifo_suspend(struct gk20a *g); -#ifndef CONFIG_NVGPU_RECOVERY /** * @brief Emergency quiescing of FIFO. * @@ -523,6 +522,5 @@ int nvgpu_fifo_suspend(struct gk20a *g); * - Preempt all runlists */ void nvgpu_fifo_sw_quiesce(struct gk20a *g); -#endif #endif /* NVGPU_FIFO_COMMON_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 9e8e7ab9a..20599eba6 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -693,13 +693,12 @@ struct gk20a { #endif bool sw_ready; -#ifndef CONFIG_NVGPU_RECOVERY bool sw_quiesce_init_done; bool sw_quiesce_pending; struct nvgpu_cond sw_quiesce_cond; struct nvgpu_thread sw_quiesce_thread; struct nvgpu_bug_cb sw_quiesce_bug_cb; -#endif + struct nvgpu_list_node bug_node; /** Controls which messages are logged */ diff --git a/drivers/gpu/nvgpu/os/linux/intr.c b/drivers/gpu/nvgpu/os/linux/intr.c index 825a09f75..baecb1621 100644 --- a/drivers/gpu/nvgpu/os/linux/intr.c +++ b/drivers/gpu/nvgpu/os/linux/intr.c @@ -39,11 +39,9 @@ irqreturn_t nvgpu_intr_stall(struct gk20a *g) return IRQ_NONE; nvgpu_mc_intr_stall_pause(g); -#ifndef CONFIG_NVGPU_RECOVERY if (g->sw_quiesce_pending) { return IRQ_NONE; } -#endif nvgpu_atomic_set(&g->mc.sw_irq_stall_pending, 1); @@ -91,11 +89,9 @@ irqreturn_t nvgpu_intr_nonstall(struct gk20a *g) return IRQ_NONE; nvgpu_mc_intr_nonstall_pause(g); -#ifndef CONFIG_NVGPU_RECOVERY if (g->sw_quiesce_pending) { return IRQ_NONE; } -#endif nvgpu_atomic_set(&g->mc.sw_irq_nonstall_pending, 1); ops = g->ops.mc.isr_nonstall(g); diff --git a/userspace/units/fifo/channel/nvgpu-channel.c b/userspace/units/fifo/channel/nvgpu-channel.c index 4d3f1afd6..6f3c3c7cd 100644 --- a/userspace/units/fifo/channel/nvgpu-channel.c +++ 
b/userspace/units/fifo/channel/nvgpu-channel.c @@ -1270,10 +1270,8 @@ int test_channel_sw_quiesce(struct unit_module *m, struct gk20a *g, void *vargs) unit_assert(ch != NULL, goto done); unit_assert(f->num_channels > 0U, goto done); -#ifndef CONFIG_NVGPU_RECOVERY nvgpu_channel_sw_quiesce(g); unit_assert(ch->unserviceable == true, goto done); -#endif ret = UNIT_SUCCESS; diff --git a/userspace/units/fifo/fifo/nvgpu-fifo.c b/userspace/units/fifo/fifo/nvgpu-fifo.c index 46f51ac1c..0eef12b69 100644 --- a/userspace/units/fifo/fifo/nvgpu-fifo.c +++ b/userspace/units/fifo/fifo/nvgpu-fifo.c @@ -177,14 +177,13 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args) err = nvgpu_fifo_init_support(g); unit_assert(err == 0, goto done); -#ifndef CONFIG_NVGPU_RECOVERY runlist_mask = nvgpu_runlist_get_runlists_mask(g, 0U, ID_TYPE_UNKNOWN, 0U, 0U); unit_assert(runlist_mask != 0U, goto done); nvgpu_fifo_sw_quiesce(g); reg_val = nvgpu_readl(g, fifo_sched_disable_r()); unit_assert((reg_val & runlist_mask) == runlist_mask, goto done); -#endif + ret = UNIT_SUCCESS; done: