gpu: nvgpu: Unify remove/shutdown codepaths

The following changes are part of the porting of the bind/unbind
functionality.

These changes reuse the shutdown codepaths in iGPU and dGPU and fix a locking
issue in gk20a_busy() where the usage count can lead to a deadlock during
driver shutdown. They also fix a race condition with the gr/mm code by
invalidating the sw ready flag while holding the busy lock.

JIRA: EVLR-1739

Change-Id: I62ce47378436b21f447f4cd93388759ed3f9bad1
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1554959
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
David Nieto
2017-09-07 16:12:44 -07:00
committed by mobile promotions
parent 980bf96bf2
commit ef6ea3475c
4 changed files with 88 additions and 81 deletions

View File

@@ -489,9 +489,8 @@ void nvgpu_clk_arb_cleanup_arbiter(struct gk20a *g)
nvgpu_kfree(g, arb->vf_table_pool[index].gpc2clk_points);
nvgpu_kfree(g, arb->vf_table_pool[index].mclk_points);
}
nvgpu_mutex_destroy(&g->clk_arb->pstate_lock);
}
nvgpu_mutex_destroy(&g->clk_arb->pstate_lock);
nvgpu_kfree(g, g->clk_arb);
g->clk_arb = NULL;
}

View File

@@ -640,6 +640,48 @@ static int gk20a_pm_unrailgate(struct device *dev)
return ret;
}
/*
 * Bring the GPU to an idle state ahead of shutdown or driver removal.
 *
 * gk20a_driver_start_unload() only flips the SW state so that no new work is
 * accepted by the driver; it does not idle the hardware. On driver removal
 * this function is therefore meant to be called after start_unload().
 *
 * The steps run strictly in this order, and the first one that fails aborts
 * the sequence:
 *   1. wait for the GPU to become idle
 *   2. disable all engine activity
 *   3. wait for the engines to go idle
 *   4. prepare for poweroff (vGPU or native variant, depending on the device)
 *
 * Returns 0 on success, otherwise the non-zero error code of the failing step
 * (which is also reported through nvgpu_err()).
 */
int nvgpu_quiesce(struct gk20a *g)
{
	struct device *dev = dev_from_gk20a(g);
	int ret = gk20a_wait_for_idle(g);

	if (ret != 0) {
		nvgpu_err(g, "failed to idle GPU, err=%d", ret);
		return ret;
	}

	ret = gk20a_fifo_disable_all_engine_activity(g, true);
	if (ret != 0) {
		nvgpu_err(g, "failed to disable engine activity, err=%d", ret);
		return ret;
	}

	ret = gk20a_fifo_wait_engine_idle(g);
	if (ret != 0) {
		nvgpu_err(g, "failed to idle engines, err=%d", ret);
		return ret;
	}

	/* Virtual and native GPUs use different poweroff preparation paths. */
	ret = gk20a_gpu_is_virtual(dev) ? vgpu_pm_prepare_poweroff(dev)
					: gk20a_pm_prepare_poweroff(dev);
	if (ret != 0)
		nvgpu_err(g, "failed to prepare for poweroff, err=%d", ret);

	return ret;
}
static void gk20a_pm_shutdown(struct platform_device *pdev)
{
struct gk20a_platform *platform = platform_get_drvdata(pdev);
@@ -668,35 +710,9 @@ static void gk20a_pm_shutdown(struct platform_device *pdev)
/* Prevent more requests by disabling Runtime PM */
__pm_runtime_disable(&pdev->dev, false);
err = gk20a_wait_for_idle(g);
if (err) {
nvgpu_err(g, "failed to idle GPU, err=%d", err);
err = nvgpu_quiesce(g);
if (err)
goto finish;
}
err = gk20a_fifo_disable_all_engine_activity(g, true);
if (err) {
nvgpu_err(g, "failed to disable engine activity, err=%d",
err);
goto finish;
}
err = gk20a_fifo_wait_engine_idle(g);
if (err) {
nvgpu_err(g, "failed to idle engines, err=%d",
err);
goto finish;
}
if (gk20a_gpu_is_virtual(&pdev->dev))
err = vgpu_pm_prepare_poweroff(&pdev->dev);
else
err = gk20a_pm_prepare_poweroff(&pdev->dev);
if (err) {
nvgpu_err(g, "failed to prepare for poweroff, err=%d",
err);
goto finish;
}
err = gk20a_pm_railgate(&pdev->dev);
if (err)
@@ -854,6 +870,9 @@ void gk20a_driver_start_unload(struct gk20a *g)
down_write(&g->busy_lock);
__nvgpu_set_enabled(g, NVGPU_DRIVER_IS_DYING, true);
/* GR SW ready needs to be invalidated at this time with the busy lock
* held to prevent a racing condition on the gr/mm code */
g->gr.sw_ready = false;
up_write(&g->busy_lock);
if (g->is_virtual)
@@ -979,18 +998,14 @@ static int gk20a_probe(struct platform_device *dev)
return 0;
}
static int __exit gk20a_remove(struct platform_device *pdev)
int nvgpu_remove(struct device *dev, struct class *class)
{
struct device *dev = &pdev->dev;
struct gk20a *g = get_gk20a(dev);
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
struct gk20a_platform *platform = gk20a_get_platform(dev);
gk20a_dbg_fn("");
if (gk20a_gpu_is_virtual(dev))
return vgpu_remove(pdev);
if (platform->has_cde)
gk20a_cde_destroy(l);
@@ -1001,16 +1016,11 @@ static int __exit gk20a_remove(struct platform_device *pdev)
if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
gk20a_scale_exit(dev);
if (g->remove_support)
g->remove_support(g);
gk20a_ce_destroy(g);
#ifdef CONFIG_ARCH_TEGRA_18x_SOC
nvgpu_clk_arb_cleanup_arbiter(g);
#endif
gk20a_user_deinit(dev, &nvgpu_class);
gk20a_user_deinit(dev, class);
gk20a_debug_deinit(g);
@@ -1026,14 +1036,28 @@ static int __exit gk20a_remove(struct platform_device *pdev)
if (platform->remove)
platform->remove(dev);
set_gk20a(pdev, NULL);
gk20a_put(g);
gk20a_dbg_fn("removed");
return 0;
}
/*
 * Platform-driver remove callback.
 *
 * Virtual GPUs are handed off to vgpu_remove() and its result is returned
 * directly. For all other devices the common nvgpu_remove() path is run with
 * nvgpu_class, after which the gk20a pointer stored for the platform device
 * is reset to NULL and gk20a_put() is called on the instance.
 *
 * Returns the result of vgpu_remove() or nvgpu_remove() respectively.
 */
static int __exit gk20a_remove(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	/* Fetch the instance up front: it is still needed after set_gk20a(). */
	struct gk20a *g = get_gk20a(dev);
	int ret;

	if (gk20a_gpu_is_virtual(dev))
		return vgpu_remove(pdev);

	ret = nvgpu_remove(dev, &nvgpu_class);

	/* This teardown runs regardless of what nvgpu_remove() returned. */
	set_gk20a(pdev, NULL);
	gk20a_put(g);

	return ret;
}
static struct platform_driver gk20a_driver = {
.probe = gk20a_probe,
.remove = __exit_p(gk20a_remove),

View File

@@ -19,6 +19,8 @@ struct device;
int gk20a_pm_finalize_poweron(struct device *dev);
void gk20a_remove_support(struct gk20a *g);
void gk20a_driver_start_unload(struct gk20a *g);
int nvgpu_quiesce(struct gk20a *g);
int nvgpu_remove(struct device *dev, struct class *class);
extern struct class nvgpu_class;

View File

@@ -513,52 +513,34 @@ static int nvgpu_pci_probe(struct pci_dev *pdev,
static void nvgpu_pci_remove(struct pci_dev *pdev)
{
struct gk20a_platform *platform = gk20a_get_platform(&pdev->dev);
struct gk20a *g = get_gk20a(&pdev->dev);
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
struct device *dev = dev_from_gk20a(g);
int err;
gk20a_dbg(gpu_dbg_shutdown, "Removing nvgpu driver!\n");
/* no support yet for unbind if DGPU is in VGPU mode */
if (gk20a_gpu_is_virtual(dev))
return;
if (g->irqs_enabled)
disable_irq(g->irq_stall);
/* only idle the GPU if the GPU is powered on */
if (g->power_on) {
gk20a_driver_start_unload(g);
err = nvgpu_quiesce(g);
/* TODO: handle failure to idle */
WARN(err, "gpu failed to idle during driver removal");
}
devm_free_irq(&pdev->dev, g->irq_stall, g);
nvgpu_remove(dev, &nvgpu_pci_class);
#if defined(CONFIG_PCI_MSI)
if (g->msi_enabled) {
if (g->msi_enabled)
pci_disable_msi(pdev);
g->msi_enabled = false;
else {
/* IRQ does not need to be enabled in MSI as the line is not
* shared
*/
enable_irq(g->irq_stall);
}
#endif
gk20a_dbg(gpu_dbg_shutdown, "IRQs disabled.\n");
/*
* Wait for the driver to finish up all the IOCTLs it's working on
* before cleaning up the driver's data structures.
*/
gk20a_driver_start_unload(g);
gk20a_dbg(gpu_dbg_shutdown, "Driver idle.\n");
#ifdef CONFIG_ARCH_TEGRA_18x_SOC
nvgpu_clk_arb_cleanup_arbiter(g);
#endif
gk20a_user_deinit(dev_from_gk20a(g), &nvgpu_pci_class);
gk20a_dbg(gpu_dbg_shutdown, "User de-init done.\b");
#ifdef CONFIG_DEBUG_FS
debugfs_remove_recursive(l->debugfs);
debugfs_remove_recursive(l->debugfs_alias);
#endif
nvgpu_remove_sysfs(dev_from_gk20a(g));
if (platform->remove)
platform->remove(dev_from_gk20a(g));
gk20a_dbg(gpu_dbg_shutdown, "Platform remove done.\b");
enable_irq(g->irq_stall);
gk20a_get_platform(&pdev->dev)->g = NULL;
gk20a_put(g);
}