From 6789a862e6a68e029a635ce75b2fe3a500a4ad75 Mon Sep 17 00:00:00 2001 From: Nitin Kumbhar Date: Tue, 9 Apr 2019 15:39:13 +0530 Subject: [PATCH] gpu: nvgpu: skip nvlink shutdown on invalid gpu state A dGPU can disappear from the PCI bus for various reasons. This is detected while accessing GPU registers, and the system is then rebooted. If the dGPU has disappeared from the system, the driver shutdown path cannot access dGPU registers. Skip any de-initialization that requires such access (nvlink) during shutdown. Bug 200505461 Change-Id: Ief2e84212421093e57e63ff5958b209bd6857db9 Signed-off-by: Nitin Kumbhar Reviewed-on: https://git-master.nvidia.com/r/2093302 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/init/nvgpu_init.c | 10 +++++++++- drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 2 ++ drivers/gpu/nvgpu/os/linux/pci.c | 7 +++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c index 792aaa5eb..f9deb6f98 100644 --- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c +++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c @@ -49,13 +49,21 @@ #include "gk20a/ce2_gk20a.h" -void __nvgpu_check_gpu_state(struct gk20a *g) +bool is_nvgpu_gpu_state_valid(struct gk20a *g) { u32 boot_0 = 0xffffffffU; boot_0 = nvgpu_mc_boot_0(g, NULL, NULL, NULL); if (boot_0 == 0xffffffffU) { nvgpu_err(g, "GPU has disappeared from bus!!"); + return false; + } + return true; +} + +void __nvgpu_check_gpu_state(struct gk20a *g) +{ + if (!is_nvgpu_gpu_state_valid(g)) { nvgpu_err(g, "Rebooting system!!"); nvgpu_kernel_restart(NULL); } diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 150897ae9..36ec4bcba 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -2284,6 +2284,8 @@ struct gk20a_cyclestate_buffer_elem { void __nvgpu_check_gpu_state(struct gk20a *g); void __gk20a_warn_on_no_regs(void); +bool
is_nvgpu_gpu_state_valid(struct gk20a *g); + /* classes that the device supports */ /* TBD: get these from an open-sourced SDK? */ enum { diff --git a/drivers/gpu/nvgpu/os/linux/pci.c b/drivers/gpu/nvgpu/os/linux/pci.c index d7b8c46c4..af9da76f0 100644 --- a/drivers/gpu/nvgpu/os/linux/pci.c +++ b/drivers/gpu/nvgpu/os/linux/pci.c @@ -747,8 +747,11 @@ void nvgpu_pci_shutdown(struct pci_dev *pdev) if (gk20a_gpu_is_virtual(dev)) return; - err = nvgpu_nvlink_deinit(g); - WARN(err, "gpu failed to remove nvlink"); + if (is_nvgpu_gpu_state_valid(g)) { + err = nvgpu_nvlink_deinit(g); + WARN(err, "gpu failed to remove nvlink"); + } else + nvgpu_err(g, "skipped nvlink deinit"); nvgpu_info(g, "shut down complete"); }