From e0a6000456d2b45270704c994ef823421a349c60 Mon Sep 17 00:00:00 2001 From: Thomas Fleury Date: Thu, 19 Dec 2019 11:11:56 -0500 Subject: [PATCH] gpu: nvgpu: update SW quiesce Update SW quiesce as follows: - After waking up sw_quiesce_thread, nvgpu_sw_quiesce masks interrupts, then disables and preempts runlists without lock. There could still be a concurrent thread that would re-enable the runlist by accident. This is very unlikely and would mean we are not in mission mode anyway. - In sw_quiesce_thread, wait NVGPU_SW_QUIESCE_TIMEOUT_MS, to leave some time for the interrupt handler to set the error notifier (in case of HW error interrupt). Then disable and preempt runlists, and set the error notifier for the remaining channels before exiting the process. Also modified nvgpu_can_busy to return false in case SW quiesce is pending. This will cause subsequent devctl calls to fail. Jira NVGPU-4512 Change-Id: I36dd554485f3b9b08f740f352f737ac4baa28746 Signed-off-by: Thomas Fleury Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2266389 Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-cert Reviewed-by: Automatic_Commit_Validation_User Reviewed-by: Alex Waterman Reviewed-by: mobile promotions GVS: Gerrit_Virtual_Submit Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/fifo/fifo.c | 13 +--- drivers/gpu/nvgpu/common/init/nvgpu_init.c | 67 +++++++++++---------- drivers/gpu/nvgpu/include/nvgpu/bug.h | 2 +- drivers/gpu/nvgpu/include/nvgpu/fifo.h | 7 +-- drivers/gpu/nvgpu/include/nvgpu/posix/bug.h | 10 +++ drivers/gpu/nvgpu/os/posix/bug.c | 7 +++ userspace/units/fifo/fifo/nvgpu-fifo.c | 11 +++- userspace/units/init/nvgpu-init.c | 7 +-- 8 files changed, 67 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/nvgpu/common/fifo/fifo.c b/drivers/gpu/nvgpu/common/fifo/fifo.c index 471e3e530..f3dd63c5f 100644 --- a/drivers/gpu/nvgpu/common/fifo/fifo.c +++ b/drivers/gpu/nvgpu/common/fifo/fifo.c @@ -1,7 +1,7 @@ /* * FIFO * - * Copyright (c) 2011-2019, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -282,20 +282,11 @@ int nvgpu_fifo_suspend(struct gk20a *g) #ifndef CONFIG_NVGPU_RECOVERY void nvgpu_fifo_sw_quiesce(struct gk20a *g) { - u32 runlist_mask; + u32 runlist_mask = U32_MAX; - nvgpu_runlist_lock_active_runlists(g); - - /* Disable all runlists */ - runlist_mask = nvgpu_runlist_get_runlists_mask(g, - 0U, ID_TYPE_UNKNOWN, 0U, 0U); g->ops.runlist.write_state(g, runlist_mask, RUNLIST_DISABLED); /* Preempt all runlists */ g->ops.fifo.preempt_runlists_for_rc(g, runlist_mask); - - nvgpu_channel_sw_quiesce(g); - - nvgpu_runlist_unlock_active_runlists(g); } #endif diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c index bb9659b1a..d98000a91 100644 --- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c +++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c @@ -72,15 +72,14 @@ static void gk20a_mask_interrupts(struct gk20a *g) } #ifndef CONFIG_NVGPU_RECOVERY + +#define NVGPU_SW_QUIESCE_TIMEOUT_MS 50 + static int nvgpu_sw_quiesce_thread(void *data) { struct gk20a *g = data; - int err = 0; - g->sw_quiesce_init_done = true; - nvgpu_cond_signal(&g->sw_quiesce_cond); - - /* wait until all SW quiesce is requested */ + /* wait until SW quiesce is requested */ NVGPU_COND_WAIT(&g->sw_quiesce_cond, g->sw_quiesce_pending || nvgpu_thread_should_stop(&g->sw_quiesce_thread), 0U); @@ -88,29 +87,17 @@ static int nvgpu_sw_quiesce_thread(void *data) if (nvgpu_thread_should_stop(&g->sw_quiesce_thread)) { goto done; } - nvgpu_wait_for_deferred_interrupts(g); - nvgpu_err(g, "sw quiesce in progress"); + nvgpu_err(g, "SW quiesce thread running"); + nvgpu_msleep(NVGPU_SW_QUIESCE_TIMEOUT_MS); - nvgpu_mutex_acquire(&g->power_lock); - - if (nvgpu_is_powered_off(g) || g->is_virtual) { - err = -EINVAL; - goto idle; 
- } - - nvgpu_start_gpu_idle(g); - nvgpu_disable_irqs(g); - gk20a_mask_interrupts(g); nvgpu_fifo_sw_quiesce(g); - -idle: - nvgpu_mutex_release(&g->power_lock); - nvgpu_err(g, "sw quiesce done, err=%d", err); + nvgpu_channel_sw_quiesce(g); + nvgpu_bug_exit(1); done: nvgpu_log_info(g, "done"); - return err; + return 0; } #endif @@ -142,8 +129,7 @@ static int nvgpu_sw_quiesce_init_support(struct gk20a *g) return err; } - /* wait until thread actually starts */ - NVGPU_COND_WAIT(&g->sw_quiesce_cond, g->sw_quiesce_init_done, 0U); + g->sw_quiesce_init_done = true; #endif return 0; @@ -162,10 +148,23 @@ void nvgpu_sw_quiesce_remove_support(struct gk20a *g) void nvgpu_sw_quiesce(struct gk20a *g) { -#ifndef CONFIG_NVGPU_RECOVERY +#ifdef CONFIG_NVGPU_RECOVERY + nvgpu_err(g, "SW quiesce not supported"); +#else if (g->is_virtual || (g->enabled_flags == NULL) || nvgpu_is_enabled(g, NVGPU_DISABLE_SW_QUIESCE)) { - goto fail; + nvgpu_err(g, "SW quiesce not supported"); + return; + } + + if (!g->sw_quiesce_init_done) { + nvgpu_err(g, "SW quiesce not initialized"); + return; + } + + if (g->sw_quiesce_pending) { + nvgpu_err(g, "SW quiesce already pending"); + return; } nvgpu_err(g, "SW quiesce requested"); @@ -177,13 +176,10 @@ void nvgpu_sw_quiesce(struct gk20a *g) */ g->sw_quiesce_pending = true; - nvgpu_cond_signal(&g->sw_quiesce_cond); - - return; - -fail: + nvgpu_cond_broadcast(&g->sw_quiesce_cond); + gk20a_mask_interrupts(g); + nvgpu_fifo_sw_quiesce(g); #endif - nvgpu_err(g, "sw quiesce not supported"); } /* init interface layer support for all falcons */ @@ -709,6 +705,13 @@ int nvgpu_can_busy(struct gk20a *g) /* Can't do anything if the system is rebooting/shutting down * or the driver is restarting */ + +#ifndef CONFIG_NVGPU_RECOVERY + if (g->sw_quiesce_pending) { + return 0; + } +#endif + if (nvgpu_is_enabled(g, NVGPU_KERNEL_IS_DYING) || nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) { return 0; diff --git a/drivers/gpu/nvgpu/include/nvgpu/bug.h 
b/drivers/gpu/nvgpu/include/nvgpu/bug.h index 477bfaeac..61893f144 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/bug.h +++ b/drivers/gpu/nvgpu/include/nvgpu/bug.h @@ -109,7 +109,7 @@ nvgpu_bug_cb_from_node(struct nvgpu_list_node *node) }; #ifdef __KERNEL__ -static inline void nvgpu_bug_exit(void) { } +static inline void nvgpu_bug_exit(int status) { } static inline void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) { } static inline void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb) { } #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/fifo.h b/drivers/gpu/nvgpu/include/nvgpu/fifo.h index 8be9c7096..fc55cd80c 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/fifo.h +++ b/drivers/gpu/nvgpu/include/nvgpu/fifo.h @@ -1,7 +1,7 @@ /* * FIFO common definitions. * - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -511,15 +511,12 @@ int nvgpu_fifo_suspend(struct gk20a *g); * * @param g [in] The GPU driver struct. * - * Gracefully put FIFO into a non-functioning state to ensure that no corrupted + * Put FIFO into a non-functioning state to ensure that no corrupted * work is completed because of the fault. This is because the freedom * from interference may not always be shown between the faulted and * the non-faulted TSG contexts. 
* - Disable all runlists * - Preempt all runlists - * - Quiesce all channels - * - * @see nvgpu_channel_sw_quiesce */ void nvgpu_fifo_sw_quiesce(struct gk20a *g); #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/posix/bug.h b/drivers/gpu/nvgpu/include/nvgpu/posix/bug.h index e81297b2d..4cfd7ef7b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/posix/bug.h +++ b/drivers/gpu/nvgpu/include/nvgpu/posix/bug.h @@ -124,6 +124,16 @@ void nvgpu_bug_cb_longjmp(void *arg); struct nvgpu_bug_cb; +/** + * @brief Exit current process + * + * @param status [in] Status to return + * + * This function is used during BUG() handling to exit + * current process. + */ +void nvgpu_bug_exit(int status); + /** * @brief Register callback to be invoked on BUG() * diff --git a/drivers/gpu/nvgpu/os/posix/bug.c b/drivers/gpu/nvgpu/os/posix/bug.c index 4c08debdd..601997a85 100644 --- a/drivers/gpu/nvgpu/os/posix/bug.c +++ b/drivers/gpu/nvgpu/os/posix/bug.c @@ -89,6 +89,13 @@ static void nvgpu_bug_init(void) bug.in_use = true; } +void nvgpu_bug_exit(int status) +{ +#ifndef __NVGPU_UNIT_TEST__ + exit(status); +#endif +} + void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb) { (void) pthread_once(&bug.once, nvgpu_bug_init); diff --git a/userspace/units/fifo/fifo/nvgpu-fifo.c b/userspace/units/fifo/fifo/nvgpu-fifo.c index 123ac3cde..300061c20 100644 --- a/userspace/units/fifo/fifo/nvgpu-fifo.c +++ b/userspace/units/fifo/fifo/nvgpu-fifo.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "hal/init/hal_gv11b.h" #include "nvgpu/hw/gk20a/hw_fifo_gk20a.h" @@ -166,6 +167,7 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args) u32 reg_val; int ret = UNIT_FAIL; int err; + u32 runlist_mask; err = test_fifo_setup_gv11b_reg_space(m, g); unit_assert(err == 0, goto done); @@ -176,9 +178,12 @@ int test_fifo_sw_quiesce(struct unit_module *m, struct gk20a *g, void *args) unit_assert(err == 0, goto done); #ifndef CONFIG_NVGPU_RECOVERY - nvgpu_fifo_sw_quiesce(g); - reg_val = 
nvgpu_readl(g, fifo_sched_disable_r()); - unit_assert(reg_val == 3U, goto done); + runlist_mask = nvgpu_runlist_get_runlists_mask(g, 0U, + ID_TYPE_UNKNOWN, 0U, 0U); + unit_assert(runlist_mask != 0U, goto done); + nvgpu_fifo_sw_quiesce(g); + reg_val = nvgpu_readl(g, fifo_sched_disable_r()); + unit_assert((reg_val & runlist_mask) == runlist_mask, goto done); #endif ret = UNIT_SUCCESS; diff --git a/userspace/units/init/nvgpu-init.c b/userspace/units/init/nvgpu-init.c index 39671b606..a86709815 100644 --- a/userspace/units/init/nvgpu-init.c +++ b/userspace/units/init/nvgpu-init.c @@ -719,7 +719,6 @@ int test_quiesce(struct unit_module *m, struct gk20a *g, void *args) nvgpu_set_power_state(g, NVGPU_STATE_POWERED_ON); /* make sure we simulate interrupts enabled */ - g->mc.irqs_enabled = true; intr_masked = false; /* setup HAL for masking interrupts */ @@ -740,10 +739,8 @@ int test_quiesce(struct unit_module *m, struct gk20a *g, void *args) /* wait for quiesce thread to complete */ nvgpu_thread_join(&g->sw_quiesce_thread); - - - if (g->mc.irqs_enabled || !intr_masked) { - unit_err(m, "quiesce failed to disable interrupts\n"); + if (!intr_masked) { + unit_err(m, "quiesce failed to mask interrupts\n"); ret = UNIT_FAIL; }