gpu: nvgpu: update SW quiesce

Update SW quiesce as follows:
- After waking up sw_quiesce_thread, nvgpu_sw_quiesce
  masks interrupts, then disables and preempts runlists
  without lock. There could still be a concurrent thread
  that would re-enable the runlist by accident. This is
  very unlikely and would mean we are not in mission mode
  anyway.
- In sw_quiesce_thread, wait NVGPU_SW_QUIESCE_TIMEOUT_MS,
  to leave some time for interrupt handler to set error
  notifier (in case of HW error interrupt). Then disable
  and preempt runlists, and set error notifier for remaining
  channels before exiting the process.

Also modified nvgpu_can_busy to return false in case
SW quiesce is pending. This will make subsequent
devctl calls fail.

Jira NVGPU-4512

Change-Id: I36dd554485f3b9b08f740f352f737ac4baa28746
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2266389
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Thomas Fleury
2019-12-19 11:11:56 -05:00
committed by Alex Waterman
parent fbafc9e05c
commit e0a6000456
8 changed files with 67 additions and 57 deletions

View File

@@ -1,7 +1,7 @@
/* /*
* FIFO * FIFO
* *
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -282,20 +282,11 @@ int nvgpu_fifo_suspend(struct gk20a *g)
#ifndef CONFIG_NVGPU_RECOVERY #ifndef CONFIG_NVGPU_RECOVERY
void nvgpu_fifo_sw_quiesce(struct gk20a *g) void nvgpu_fifo_sw_quiesce(struct gk20a *g)
{ {
u32 runlist_mask; u32 runlist_mask = U32_MAX;
nvgpu_runlist_lock_active_runlists(g);
/* Disable all runlists */
runlist_mask = nvgpu_runlist_get_runlists_mask(g,
0U, ID_TYPE_UNKNOWN, 0U, 0U);
g->ops.runlist.write_state(g, runlist_mask, RUNLIST_DISABLED); g->ops.runlist.write_state(g, runlist_mask, RUNLIST_DISABLED);
/* Preempt all runlists */ /* Preempt all runlists */
g->ops.fifo.preempt_runlists_for_rc(g, runlist_mask); g->ops.fifo.preempt_runlists_for_rc(g, runlist_mask);
nvgpu_channel_sw_quiesce(g);
nvgpu_runlist_unlock_active_runlists(g);
} }
#endif #endif

View File

@@ -72,15 +72,14 @@ static void gk20a_mask_interrupts(struct gk20a *g)
} }
#ifndef CONFIG_NVGPU_RECOVERY #ifndef CONFIG_NVGPU_RECOVERY
#define NVGPU_SW_QUIESCE_TIMEOUT_MS 50
static int nvgpu_sw_quiesce_thread(void *data) static int nvgpu_sw_quiesce_thread(void *data)
{ {
struct gk20a *g = data; struct gk20a *g = data;
int err = 0;
g->sw_quiesce_init_done = true; /* wait until SW quiesce is requested */
nvgpu_cond_signal(&g->sw_quiesce_cond);
/* wait until all SW quiesce is requested */
NVGPU_COND_WAIT(&g->sw_quiesce_cond, NVGPU_COND_WAIT(&g->sw_quiesce_cond,
g->sw_quiesce_pending || g->sw_quiesce_pending ||
nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U); nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U);
@@ -88,29 +87,17 @@ static int nvgpu_sw_quiesce_thread(void *data)
if (nvgpu_thread_should_stop(&g->sw_quiesce_thread)) { if (nvgpu_thread_should_stop(&g->sw_quiesce_thread)) {
goto done; goto done;
} }
nvgpu_wait_for_deferred_interrupts(g);
nvgpu_err(g, "sw quiesce in progress"); nvgpu_err(g, "SW quiesce thread running");
nvgpu_msleep(NVGPU_SW_QUIESCE_TIMEOUT_MS);
nvgpu_mutex_acquire(&g->power_lock);
if (nvgpu_is_powered_off(g) || g->is_virtual) {
err = -EINVAL;
goto idle;
}
nvgpu_start_gpu_idle(g);
nvgpu_disable_irqs(g);
gk20a_mask_interrupts(g);
nvgpu_fifo_sw_quiesce(g); nvgpu_fifo_sw_quiesce(g);
nvgpu_channel_sw_quiesce(g);
idle: nvgpu_bug_exit(1);
nvgpu_mutex_release(&g->power_lock);
nvgpu_err(g, "sw quiesce done, err=%d", err);
done: done:
nvgpu_log_info(g, "done"); nvgpu_log_info(g, "done");
return err; return 0;
} }
#endif #endif
@@ -142,8 +129,7 @@ static int nvgpu_sw_quiesce_init_support(struct gk20a *g)
return err; return err;
} }
/* wait until thread actually starts */ g->sw_quiesce_init_done = true;
NVGPU_COND_WAIT(&g->sw_quiesce_cond, g->sw_quiesce_init_done, 0U);
#endif #endif
return 0; return 0;
@@ -162,10 +148,23 @@ void nvgpu_sw_quiesce_remove_support(struct gk20a *g)
void nvgpu_sw_quiesce(struct gk20a *g) void nvgpu_sw_quiesce(struct gk20a *g)
{ {
#ifndef CONFIG_NVGPU_RECOVERY #ifdef CONFIG_NVGPU_RECOVERY
nvgpu_err(g, "SW quiesce not supported");
#else
if (g->is_virtual || (g->enabled_flags == NULL) || if (g->is_virtual || (g->enabled_flags == NULL) ||
nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) { nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) {
goto fail; nvgpu_err(g, "SW quiesce not supported");
return;
}
if (!g->sw_quiesce_init_done) {
nvgpu_err(g, "SW quiesce not initialized");
return;
}
if (g->sw_quiesce_pending) {
nvgpu_err(g, "SW quiesce already pending");
return;
} }
nvgpu_err(g, "SW quiesce requested"); nvgpu_err(g, "SW quiesce requested");
@@ -177,13 +176,10 @@ void nvgpu_sw_quiesce(struct gk20a *g)
*/ */
g->sw_quiesce_pending = true; g->sw_quiesce_pending = true;
nvgpu_cond_signal(&g->sw_quiesce_cond); nvgpu_cond_broadcast(&g->sw_quiesce_cond);
gk20a_mask_interrupts(g);
return; nvgpu_fifo_sw_quiesce(g);
fail:
#endif #endif
nvgpu_err(g, "sw quiesce not supported");
} }
/* init interface layer support for all falcons */ /* init interface layer support for all falcons */
@@ -709,6 +705,13 @@ int nvgpu_can_busy(struct gk20a *g)
/* Can't do anything if the system is rebooting/shutting down /* Can't do anything if the system is rebooting/shutting down
* or the driver is restarting * or the driver is restarting
*/ */
#ifndef CONFIG_NVGPU_RECOVERY
if (g->sw_quiesce_pending) {
return 0;
}
#endif
if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) || if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) ||
nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) { nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
return 0; return 0;

View File

@@ -109,7 +109,7 @@ nvgpu_bug_cb_from_node(struct nvgpu_list_node *node)
}; };
#ifdef __KERNEL__ #ifdef __KERNEL__
static inline void nvgpu_bug_exit(void) { } static inline void nvgpu_bug_exit(int status) { }
static inline void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) { } static inline void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) { }
static inline void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb) { } static inline void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb) { }
#endif #endif

View File

@@ -1,7 +1,7 @@
/* /*
* FIFO common definitions. * FIFO common definitions.
* *
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -511,15 +511,12 @@ int nvgpu_fifo_suspend(struct gk20a *g);
* *
* @param g [in] The GPU driver struct. * @param g [in] The GPU driver struct.
* *
* Gracefully put FIFO into a non-functioning state to ensure that no corrupted * Put FIFO into a non-functioning state to ensure that no corrupted
* work is completed because of the fault. This is because the freedom * work is completed because of the fault. This is because the freedom
* from interference may not always be shown between the faulted and * from interference may not always be shown between the faulted and
* the non-faulted TSG contexts. * the non-faulted TSG contexts.
* - Disable all runlists * - Disable all runlists
* - Preempt all runlists * - Preempt all runlists
* - Quiesce all channels
*
* @see nvgpu_channel_sw_quiesce
*/ */
void nvgpu_fifo_sw_quiesce(struct gk20a *g); void nvgpu_fifo_sw_quiesce(struct gk20a *g);
#endif #endif

View File

@@ -124,6 +124,16 @@ void nvgpu_bug_cb_longjmp(void *arg);
struct nvgpu_bug_cb; struct nvgpu_bug_cb;
/**
* @brief Exit current process
*
* @param status [in] Status to return
*
* This function is used during BUG() handling to exit
* current process.
*/
void nvgpu_bug_exit(int status);
/** /**
* @brief Register callback to be invoked on BUG() * @brief Register callback to be invoked on BUG()
* *

View File

@@ -89,6 +89,13 @@ static void nvgpu_bug_init(void)
bug.in_use = true; bug.in_use = true;
} }
void nvgpu_bug_exit(int status)
{
#ifndef __NVGPU_UNIT_TEST__
exit(status);
#endif
}
void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb)
{ {
(void) pthread_once(&bug.once, nvgpu_bug_init); (void) pthread_once(&bug.once, nvgpu_bug_init);

View File

@@ -28,6 +28,7 @@
#include <nvgpu/posix/posix-fault-injection.h> #include <nvgpu/posix/posix-fault-injection.h>
#include <nvgpu/posix/dma.h> #include <nvgpu/posix/dma.h>
#include <nvgpu/io.h> #include <nvgpu/io.h>
#include <nvgpu/runlist.h>
#include "hal/init/hal_gv11b.h" #include "hal/init/hal_gv11b.h"
#include "nvgpu/hw/gk20a/hw_fifo_gk20a.h" #include "nvgpu/hw/gk20a/hw_fifo_gk20a.h"
@@ -166,6 +167,7 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args)
u32 reg_val; u32 reg_val;
int ret = UNIT_FAIL; int ret = UNIT_FAIL;
int err; int err;
u32 runlist_mask;
err = test_fifo_setup_gv11b_reg_space(m, g); err = test_fifo_setup_gv11b_reg_space(m, g);
unit_assert(err == 0, goto done); unit_assert(err == 0, goto done);
@@ -176,9 +178,12 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args)
unit_assert(err == 0, goto done); unit_assert(err == 0, goto done);
#ifndef CONFIG_NVGPU_RECOVERY #ifndef CONFIG_NVGPU_RECOVERY
runlist_mask = nvgpu_runlist_get_runlists_mask(g, 0U,
ID_TYPE_UNKNOWN, 0U, 0U);
unit_assert(runlist_mask != 0U, goto done);
nvgpu_fifo_sw_quiesce(g); nvgpu_fifo_sw_quiesce(g);
reg_val = nvgpu_readl(g, fifo_sched_disable_r()); reg_val = nvgpu_readl(g, fifo_sched_disable_r());
unit_assert(reg_val == 3U, goto done); unit_assert((reg_val & runlist_mask) == runlist_mask, goto done);
#endif #endif
ret = UNIT_SUCCESS; ret = UNIT_SUCCESS;

View File

@@ -719,7 +719,6 @@ int test_quiesce(struct unit_module *m, struct gk20a *g, void *args)
nvgpu_set_power_state(g, NVGPU_STATE_POWERED_ON); nvgpu_set_power_state(g, NVGPU_STATE_POWERED_ON);
/* make sure we simulate interrupts enabled */ /* make sure we simulate interrupts enabled */
g->mc.irqs_enabled = true;
intr_masked = false; intr_masked = false;
/* setup HAL for masking interrupts */ /* setup HAL for masking interrupts */
@@ -740,10 +739,8 @@ int test_quiesce(struct unit_module *m, struct gk20a *g, void *args)
/* wait for quiesce thread to complete */ /* wait for quiesce thread to complete */
nvgpu_thread_join(&g->sw_quiesce_thread); nvgpu_thread_join(&g->sw_quiesce_thread);
if (!intr_masked) {
unit_err(m, "quiesce failed to mask interrupts\n");
if (g->mc.irqs_enabled || !intr_masked) {
unit_err(m, "quiesce failed to disable interrupts\n");
ret = UNIT_FAIL; ret = UNIT_FAIL;
} }