mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 10:34:43 +03:00
gpu: nvgpu: Enable Quiesce on all builds
Make Recovery and quiesce co-exist to support quiesce state on unrecoverable errors. Currently, the quiesce code is wrapped under ifndef CONFIG_NVGPU_RECOVERY. Isolate the quiesce code from recovery config, thereby enabling it on all builds. On Linux, the hung_task checker (check_hung_uninterruptible_tasks() in kernel/hung_task.c) complains that the quiesce thread is stuck for more than 120 seconds. INFO: task sw-quiesce:1068 blocked for more than 120 seconds. The wait time of more than 120 seconds is expected as the quiesce thread will wait until a quiesce call is triggered on fatal unrecoverable errors. However, the INFO print upsets the kernel_warning_test (KWT) on Linux builds. To fix the failing KWT, change the quiesce task to interruptible instead of uninterruptible as the checker only looks at uninterruptible tasks. Bug 2919899 JIRA NVGPU-5479 Change-Id: Ibd1023506859d8371998b785e881ace52cb5f030 Signed-off-by: tkudav <tkudav@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2342774 Reviewed-by: automaticguardword <automaticguardword@nvidia.com> Reviewed-by: Automatic_Commit_Validation_User Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-by: Alex Waterman <alexw@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
@@ -1762,7 +1762,6 @@ void nvgpu_channel_set_error_notifier(struct gk20a *g, struct nvgpu_channel *ch,
|
||||
g->ops.channel.set_error_notifier(ch, error_notifier);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
void nvgpu_channel_sw_quiesce(struct gk20a *g)
|
||||
{
|
||||
struct nvgpu_fifo *f = &g->fifo;
|
||||
@@ -1779,7 +1778,6 @@ void nvgpu_channel_sw_quiesce(struct gk20a *g)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NVGPU_DETERMINISTIC_CHANNELS
|
||||
/*
|
||||
|
||||
@@ -281,7 +281,6 @@ int nvgpu_fifo_suspend(struct gk20a *g)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
void nvgpu_fifo_sw_quiesce(struct gk20a *g)
|
||||
{
|
||||
u32 runlist_mask = U32_MAX;
|
||||
@@ -291,4 +290,3 @@ void nvgpu_fifo_sw_quiesce(struct gk20a *g)
|
||||
/* Preempt all runlists */
|
||||
nvgpu_fifo_preempt_runlists_for_rc(g, runlist_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -70,8 +70,6 @@ static void gk20a_mask_interrupts(struct gk20a *g)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
|
||||
#define NVGPU_SW_QUIESCE_TIMEOUT_MS 50
|
||||
|
||||
static int nvgpu_sw_quiesce_thread(void *data)
|
||||
@@ -79,7 +77,7 @@ static int nvgpu_sw_quiesce_thread(void *data)
|
||||
struct gk20a *g = data;
|
||||
|
||||
/* wait until SW quiesce is requested */
|
||||
NVGPU_COND_WAIT(&g->sw_quiesce_cond,
|
||||
NVGPU_COND_WAIT_INTERRUPTIBLE(&g->sw_quiesce_cond,
|
||||
g->sw_quiesce_pending ||
|
||||
nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U);
|
||||
|
||||
@@ -105,20 +103,39 @@ static void nvgpu_sw_quiesce_bug_cb(void *arg)
|
||||
|
||||
nvgpu_sw_quiesce(g);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void nvgpu_sw_quiesce_thread_stop_fn(void *data)
|
||||
{
|
||||
struct gk20a *g = data;
|
||||
|
||||
/*
|
||||
* If the thread is still waiting on the cond,
|
||||
* nvgpu_thread_should_stop() will return true, and the thread will
|
||||
* exit.
|
||||
*/
|
||||
nvgpu_cond_signal_interruptible(&g->sw_quiesce_cond);
|
||||
}
|
||||
|
||||
void nvgpu_sw_quiesce_remove_support(struct gk20a *g)
|
||||
{
|
||||
if (g->sw_quiesce_init_done) {
|
||||
nvgpu_bug_unregister_cb(&g->sw_quiesce_bug_cb);
|
||||
nvgpu_thread_stop_graceful(&g->sw_quiesce_thread,
|
||||
nvgpu_sw_quiesce_thread_stop_fn,
|
||||
g);
|
||||
nvgpu_cond_destroy(&g->sw_quiesce_cond);
|
||||
g->sw_quiesce_init_done = false;
|
||||
}
|
||||
}
|
||||
|
||||
static int nvgpu_sw_quiesce_init_support(struct gk20a *g)
|
||||
{
|
||||
#ifdef CONFIG_NVGPU_RECOVERY
|
||||
nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, true);
|
||||
#else
|
||||
int err;
|
||||
|
||||
if (g->sw_quiesce_init_done) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, false);
|
||||
|
||||
err = nvgpu_cond_init(&g->sw_quiesce_cond);
|
||||
if (err != 0) {
|
||||
@@ -141,44 +158,18 @@ static int nvgpu_sw_quiesce_init_support(struct gk20a *g)
|
||||
g->sw_quiesce_bug_cb.cb = nvgpu_sw_quiesce_bug_cb;
|
||||
g->sw_quiesce_bug_cb.arg = g;
|
||||
nvgpu_bug_register_cb(&g->sw_quiesce_bug_cb);
|
||||
|
||||
#ifdef CONFIG_NVGPU_RECOVERY
|
||||
nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, true);
|
||||
#else
|
||||
nvgpu_set_enabled(g, NVGPU_SUPPORT_FAULT_RECOVERY, false);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
static void nvgpu_sw_quiesce_thread_stop_fn(void *data)
|
||||
{
|
||||
struct gk20a *g = data;
|
||||
|
||||
/*
|
||||
* If the thread is still waiting on the cond,
|
||||
* nvgpu_thread_should_stop() will return true, and the thread will
|
||||
* exit.
|
||||
*/
|
||||
nvgpu_cond_signal(&g->sw_quiesce_cond);
|
||||
}
|
||||
#endif
|
||||
|
||||
void nvgpu_sw_quiesce_remove_support(struct gk20a *g)
|
||||
{
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
if (g->sw_quiesce_init_done) {
|
||||
nvgpu_bug_unregister_cb(&g->sw_quiesce_bug_cb);
|
||||
nvgpu_thread_stop_graceful(&g->sw_quiesce_thread,
|
||||
nvgpu_sw_quiesce_thread_stop_fn,
|
||||
g);
|
||||
nvgpu_cond_destroy(&g->sw_quiesce_cond);
|
||||
g->sw_quiesce_init_done = false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void nvgpu_sw_quiesce(struct gk20a *g)
|
||||
{
|
||||
#ifdef CONFIG_NVGPU_RECOVERY
|
||||
nvgpu_err(g, "SW quiesce not supported");
|
||||
#else
|
||||
if (g->is_virtual || (g->enabled_flags == NULL) ||
|
||||
nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) {
|
||||
nvgpu_err(g, "SW quiesce not supported");
|
||||
@@ -204,10 +195,9 @@ void nvgpu_sw_quiesce(struct gk20a *g)
|
||||
*/
|
||||
g->sw_quiesce_pending = true;
|
||||
|
||||
nvgpu_cond_signal(&g->sw_quiesce_cond);
|
||||
nvgpu_cond_signal_interruptible(&g->sw_quiesce_cond);
|
||||
gk20a_mask_interrupts(g);
|
||||
nvgpu_fifo_sw_quiesce(g);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* init interface layer support for all falcons */
|
||||
@@ -748,11 +738,9 @@ int nvgpu_can_busy(struct gk20a *g)
|
||||
* or the driver is restarting
|
||||
*/
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
if (g->sw_quiesce_pending) {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) ||
|
||||
nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
|
||||
|
||||
@@ -696,7 +696,6 @@ int nvgpu_channel_setup_sw(struct gk20a *g);
|
||||
*/
|
||||
void nvgpu_channel_cleanup_sw(struct gk20a *g);
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
/**
|
||||
* @brief Emergency quiescing of channels
|
||||
*
|
||||
@@ -709,7 +708,6 @@ void nvgpu_channel_cleanup_sw(struct gk20a *g);
|
||||
* - signal on wait queues (notify_wq and semaphore_wq)
|
||||
*/
|
||||
void nvgpu_channel_sw_quiesce(struct gk20a *g);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Close channel
|
||||
|
||||
@@ -509,7 +509,6 @@ const char *nvgpu_fifo_decode_pbdma_ch_eng_status(u32 index);
|
||||
*/
|
||||
int nvgpu_fifo_suspend(struct gk20a *g);
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
/**
|
||||
* @brief Emergency quiescing of FIFO.
|
||||
*
|
||||
@@ -523,6 +522,5 @@ int nvgpu_fifo_suspend(struct gk20a *g);
|
||||
* - Preempt all runlists
|
||||
*/
|
||||
void nvgpu_fifo_sw_quiesce(struct gk20a *g);
|
||||
#endif
|
||||
|
||||
#endif /* NVGPU_FIFO_COMMON_H */
|
||||
|
||||
@@ -693,13 +693,12 @@ struct gk20a {
|
||||
#endif
|
||||
bool sw_ready;
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
bool sw_quiesce_init_done;
|
||||
bool sw_quiesce_pending;
|
||||
struct nvgpu_cond sw_quiesce_cond;
|
||||
struct nvgpu_thread sw_quiesce_thread;
|
||||
struct nvgpu_bug_cb sw_quiesce_bug_cb;
|
||||
#endif
|
||||
|
||||
struct nvgpu_list_node bug_node;
|
||||
|
||||
/** Controls which messages are logged */
|
||||
|
||||
@@ -39,11 +39,9 @@ irqreturn_t nvgpu_intr_stall(struct gk20a *g)
|
||||
return IRQ_NONE;
|
||||
|
||||
nvgpu_mc_intr_stall_pause(g);
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
if (g->sw_quiesce_pending) {
|
||||
return IRQ_NONE;
|
||||
}
|
||||
#endif
|
||||
|
||||
nvgpu_atomic_set(&g->mc.sw_irq_stall_pending, 1);
|
||||
|
||||
@@ -91,11 +89,9 @@ irqreturn_t nvgpu_intr_nonstall(struct gk20a *g)
|
||||
return IRQ_NONE;
|
||||
|
||||
nvgpu_mc_intr_nonstall_pause(g);
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
if (g->sw_quiesce_pending) {
|
||||
return IRQ_NONE;
|
||||
}
|
||||
#endif
|
||||
|
||||
nvgpu_atomic_set(&g->mc.sw_irq_nonstall_pending, 1);
|
||||
ops = g->ops.mc.isr_nonstall(g);
|
||||
|
||||
@@ -1270,10 +1270,8 @@ int test_channel_sw_quiesce(struct unit_module *m, struct gk20a *g, void *vargs)
|
||||
unit_assert(ch != NULL, goto done);
|
||||
unit_assert(f->num_channels > 0U, goto done);
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
nvgpu_channel_sw_quiesce(g);
|
||||
unit_assert(ch->unserviceable == true, goto done);
|
||||
#endif
|
||||
|
||||
ret = UNIT_SUCCESS;
|
||||
|
||||
|
||||
@@ -177,14 +177,13 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args)
|
||||
err = nvgpu_fifo_init_support(g);
|
||||
unit_assert(err == 0, goto done);
|
||||
|
||||
#ifndef CONFIG_NVGPU_RECOVERY
|
||||
runlist_mask = nvgpu_runlist_get_runlists_mask(g, 0U,
|
||||
ID_TYPE_UNKNOWN, 0U, 0U);
|
||||
unit_assert(runlist_mask != 0U, goto done);
|
||||
nvgpu_fifo_sw_quiesce(g);
|
||||
reg_val = nvgpu_readl(g, fifo_sched_disable_r());
|
||||
unit_assert((reg_val & runlist_mask) == runlist_mask, goto done);
|
||||
#endif
|
||||
|
||||
ret = UNIT_SUCCESS;
|
||||
|
||||
done:
|
||||
|
||||
Reference in New Issue
Block a user