gpu: nvgpu: Enable Quiesce on all builds

Make recovery and quiesce co-exist so that the quiesce state can be
entered on unrecoverable errors. Currently, the quiesce code is
wrapped in #ifndef CONFIG_NVGPU_RECOVERY. Decouple the quiesce code
from the recovery config, thereby enabling it on all builds.
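
In effect, the change drops the compile-time guard around the quiesce
code paths. A minimal before/after sketch (illustrative only; the
declaration is taken from the channel header diff below):

    /* Before: quiesce was compiled out whenever recovery was configured in */
    #ifndef CONFIG_NVGPU_RECOVERY
    void nvgpu_channel_sw_quiesce(struct gk20a *g);
    #endif

    /* After: quiesce is built unconditionally and co-exists with recovery */
    void nvgpu_channel_sw_quiesce(struct gk20a *g);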

On Linux, the hung-task checker (check_hung_uninterruptible_tasks()
in kernel/hung_task.c) complains that the quiesce thread has been
stuck for more than 120 seconds:

INFO: task sw-quiesce:1068 blocked for more than 120 seconds.

A wait time of more than 120 seconds is expected, as the quiesce
thread waits until quiesce is triggered on a fatal unrecoverable
error. However, the INFO print upsets the kernel_warning_test (KWT)
on Linux builds. To fix the failing KWT, make the quiesce thread wait
interruptibly instead of uninterruptibly, since the checker only
looks at uninterruptible tasks.
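
For illustration, a minimal sketch of why the switch silences the
checker (generic Linux wait API, not the nvgpu wrappers; the helper
name is hypothetical):

    #include <linux/wait.h>

    /*
     * check_hung_uninterruptible_tasks() only scans tasks sleeping in
     * TASK_UNINTERRUPTIBLE (D) state. A thread parked indefinitely in
     * wait_event() is reported after 120 seconds, whereas
     * wait_event_interruptible() sleeps in TASK_INTERRUPTIBLE (S) state,
     * which the checker never examines.
     */
    static int quiesce_wait_sketch(wait_queue_head_t *wq, bool *pending)
    {
        /* wait_event(*wq, *pending);  D state: triggers the INFO splat */
        return wait_event_interruptible(*wq, *pending); /* S state: skipped */
    }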

Bug 2919899
JIRA NVGPU-5479

Change-Id: Ibd1023506859d8371998b785e881ace52cb5f030
Signed-off-by: tkudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2342774
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:     tkudav
Date:       2020-05-29 10:57:34 +05:30
Committer:  Alex Waterman
Commit:     957b19092f (parent 1f28443889)

9 changed files with 33 additions and 61 deletions

View File

@@ -1762,7 +1762,6 @@ void nvgpu_channel_set_error_notifier(struct gk20a *g, struct nvgpu_channel *ch,
 	g->ops.channel.set_error_notifier(ch, error_notifier);
 }
 
-#ifndef CONFIG_NVGPU_RECOVERY
 void nvgpu_channel_sw_quiesce(struct gk20a *g)
 {
 	struct nvgpu_fifo *f = &g->fifo;
@@ -1779,7 +1778,6 @@ void nvgpu_channel_sw_quiesce(struct gk20a *g)
 		}
 	}
 }
-#endif
 
 #ifdef CONFIG_NVGPU_DETERMINISTIC_CHANNELS
 /*

View File

@@ -281,7 +281,6 @@ int nvgpu_fifo_suspend(struct gk20a *g)
 	return 0;
 }
 
-#ifndef CONFIG_NVGPU_RECOVERY
 void nvgpu_fifo_sw_quiesce(struct gk20a *g)
 {
 	u32 runlist_mask = U32_MAX;
@@ -291,4 +290,3 @@ void nvgpu_fifo_sw_quiesce(struct gk20a *g)
 	/* Preempt all runlists */
 	nvgpu_fifo_preempt_runlists_for_rc(g, runlist_mask);
 }
-#endif

View File

@@ -70,8 +70,6 @@ static void gk20a_mask_interrupts(struct gk20a *g)
 #endif
 }
 
-#ifndef CONFIG_NVGPU_RECOVERY
-
 #define NVGPU_SW_QUIESCE_TIMEOUT_MS 50
 
 static int nvgpu_sw_quiesce_thread(void *data)
@@ -79,7 +77,7 @@ static int nvgpu_sw_quiesce_thread(void *data)
 	struct gk20a *g = data;
 
 	/* wait until SW quiesce is requested */
-	NVGPU_COND_WAIT(&g->sw_quiesce_cond,
+	NVGPU_COND_WAIT_INTERRUPTIBLE(&g->sw_quiesce_cond,
 		g->sw_quiesce_pending ||
 		nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U);
 
@@ -105,20 +103,39 @@ static void nvgpu_sw_quiesce_bug_cb(void *arg)
 	nvgpu_sw_quiesce(g);
 }
 
-#endif
+static void nvgpu_sw_quiesce_thread_stop_fn(void *data)
+{
+	struct gk20a *g = data;
+
+	/*
+	 * If the thread is still waiting on the cond,
+	 * nvgpu_thread_should_stop() will return true, and the thread will
+	 * exit.
+	 */
+	nvgpu_cond_signal_interruptible(&g->sw_quiesce_cond);
+}
+
+void nvgpu_sw_quiesce_remove_support(struct gk20a *g)
+{
+	if (g->sw_quiesce_init_done) {
+		nvgpu_bug_unregister_cb(&g->sw_quiesce_bug_cb);
+		nvgpu_thread_stop_graceful(&g->sw_quiesce_thread,
+				nvgpu_sw_quiesce_thread_stop_fn,
+				g);
+		nvgpu_cond_destroy(&g->sw_quiesce_cond);
+		g->sw_quiesce_init_done = false;
+	}
+}
 
 static int nvgpu_sw_quiesce_init_support(struct gk20a *g)
 {
-#ifdef CONFIG_NVGPU_RECOVERY
-	nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, true);
-#else
 	int err;
 
 	if (g->sw_quiesce_init_done) {
 		return 0;
 	}
 
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, false);
 
 	err = nvgpu_cond_init(&g->sw_quiesce_cond);
 	if (err != 0) {
@@ -141,44 +158,18 @@ static int nvgpu_sw_quiesce_init_support(struct gk20a *g)
 	g->sw_quiesce_bug_cb.cb = nvgpu_sw_quiesce_bug_cb;
 	g->sw_quiesce_bug_cb.arg = g;
 	nvgpu_bug_register_cb(&g->sw_quiesce_bug_cb);
+
+#ifdef CONFIG_NVGPU_RECOVERY
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, true);
+#else
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, false);
 #endif
 
 	return 0;
 }
 
-#ifndef CONFIG_NVGPU_RECOVERY
-static void nvgpu_sw_quiesce_thread_stop_fn(void *data)
-{
-	struct gk20a *g = data;
-
-	/*
-	 * If the thread is still waiting on the cond,
-	 * nvgpu_thread_should_stop() will return true, and the thread will
-	 * exit.
-	 */
-	nvgpu_cond_signal(&g->sw_quiesce_cond);
-}
-#endif
-
-void nvgpu_sw_quiesce_remove_support(struct gk20a *g)
-{
-#ifndef CONFIG_NVGPU_RECOVERY
-	if (g->sw_quiesce_init_done) {
-		nvgpu_bug_unregister_cb(&g->sw_quiesce_bug_cb);
-		nvgpu_thread_stop_graceful(&g->sw_quiesce_thread,
-				nvgpu_sw_quiesce_thread_stop_fn,
-				g);
-		nvgpu_cond_destroy(&g->sw_quiesce_cond);
-		g->sw_quiesce_init_done = false;
-	}
-#endif
-}
-
 void nvgpu_sw_quiesce(struct gk20a *g)
 {
-#ifdef CONFIG_NVGPU_RECOVERY
-	nvgpu_err(g, "SW quiesce not supported");
-#else
 	if (g->is_virtual || (g->enabled_flags == NULL) ||
 	    nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) {
 		nvgpu_err(g, "SW quiesce not supported");
@@ -204,10 +195,9 @@ void nvgpu_sw_quiesce(struct gk20a *g)
 	 */
 	g->sw_quiesce_pending = true;
-	nvgpu_cond_signal(&g->sw_quiesce_cond);
+	nvgpu_cond_signal_interruptible(&g->sw_quiesce_cond);
 
 	gk20a_mask_interrupts(g);
 
 	nvgpu_fifo_sw_quiesce(g);
-#endif
 }
 
 /* init interface layer support for all falcons */
@@ -748,11 +738,9 @@ int nvgpu_can_busy(struct gk20a *g)
 	 * or the driver is restarting
 	 */
-#ifndef CONFIG_NVGPU_RECOVERY
 	if (g->sw_quiesce_pending) {
 		return 0;
 	}
-#endif
 
 	if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) ||
 	    nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {

View File

@@ -696,7 +696,6 @@ int nvgpu_channel_setup_sw(struct gk20a *g);
  */
 void nvgpu_channel_cleanup_sw(struct gk20a *g);
 
-#ifndef CONFIG_NVGPU_RECOVERY
 /**
  * @brief Emergency quiescing of channels
  *
@@ -709,7 +708,6 @@ void nvgpu_channel_cleanup_sw(struct gk20a *g);
  * - signal on wait queues (notify_wq and semaphore_wq)
  */
 void nvgpu_channel_sw_quiesce(struct gk20a *g);
-#endif
 
 /**
  * @brief Close channel

View File

@@ -509,7 +509,6 @@ const char *nvgpu_fifo_decode_pbdma_ch_eng_status(u32 index);
  */
 int nvgpu_fifo_suspend(struct gk20a *g);
 
-#ifndef CONFIG_NVGPU_RECOVERY
 /**
  * @brief Emergency quiescing of FIFO.
  *
@@ -523,6 +522,5 @@ int nvgpu_fifo_suspend(struct gk20a *g);
  * - Preempt all runlists
  */
 void nvgpu_fifo_sw_quiesce(struct gk20a *g);
-#endif
 
 #endif /* NVGPU_FIFO_COMMON_H */

View File

@@ -693,13 +693,12 @@ struct gk20a {
 #endif
 	bool sw_ready;
 
-#ifndef CONFIG_NVGPU_RECOVERY
 	bool sw_quiesce_init_done;
 	bool sw_quiesce_pending;
 	struct nvgpu_cond sw_quiesce_cond;
 	struct nvgpu_thread sw_quiesce_thread;
 	struct nvgpu_bug_cb sw_quiesce_bug_cb;
-#endif
+
 	struct nvgpu_list_node bug_node;
 
 	/** Controls which messages are logged */

View File

@@ -39,11 +39,9 @@ irqreturn_t nvgpu_intr_stall(struct gk20a *g)
 		return IRQ_NONE;
 
 	nvgpu_mc_intr_stall_pause(g);
 
-#ifndef CONFIG_NVGPU_RECOVERY
 	if (g->sw_quiesce_pending) {
 		return IRQ_NONE;
 	}
-#endif
 
 	nvgpu_atomic_set(&g->mc.sw_irq_stall_pending, 1);
@@ -91,11 +89,9 @@ irqreturn_t nvgpu_intr_nonstall(struct gk20a *g)
 		return IRQ_NONE;
 
 	nvgpu_mc_intr_nonstall_pause(g);
 
-#ifndef CONFIG_NVGPU_RECOVERY
 	if (g->sw_quiesce_pending) {
 		return IRQ_NONE;
 	}
-#endif
 
 	nvgpu_atomic_set(&g->mc.sw_irq_nonstall_pending, 1);
 
 	ops = g->ops.mc.isr_nonstall(g);

View File

@@ -1270,10 +1270,8 @@ int test_channel_sw_quiesce(struct unit_module *m, struct gk20a *g, void *vargs)
 	unit_assert(ch != NULL, goto done);
 	unit_assert(f->num_channels > 0U, goto done);
 
-#ifndef CONFIG_NVGPU_RECOVERY
 	nvgpu_channel_sw_quiesce(g);
 	unit_assert(ch->unserviceable == true, goto done);
-#endif
 
 	ret = UNIT_SUCCESS;

View File

@@ -177,14 +177,13 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args)
 	err = nvgpu_fifo_init_support(g);
 	unit_assert(err == 0, goto done);
 
-#ifndef CONFIG_NVGPU_RECOVERY
 	runlist_mask = nvgpu_runlist_get_runlists_mask(g, 0U,
 			ID_TYPE_UNKNOWN, 0U, 0U);
 	unit_assert(runlist_mask != 0U, goto done);
+
 	nvgpu_fifo_sw_quiesce(g);
 	reg_val = nvgpu_readl(g, fifo_sched_disable_r());
 	unit_assert((reg_val & runlist_mask) == runlist_mask, goto done);
-#endif
 
 	ret = UNIT_SUCCESS;
 done: