gpu: nvgpu: prevent crash during unbind
This change fixes crashes during unbind that were introduced during the OS unification refactoring, owing to lack of coverage of the remove() path. The fixes on the remove path are:
(1) Prevent a NULL dereference on GPUs with secure boot.
(2) Prevent NULL dereferences when fecs_trace is not enabled.
(3) Block PRAMIN accesses during driver removal if the HW is no longer accessible.
(4) Prevent a double free of debugfs nodes, since they are already handled by the debugfs_remove_recursive() call.
(5) Allow quiesce() to be called without checking whether the HW-accessible flag is set.
(6) Add a function to free the IRQs so that no IRQ association is left in the driver after it is removed.
(7) Prevent a NULL dereference in nvgpu_thread_stop() if the thread is already stopped.

JIRA: EVLR-1739
Change-Id: I787d38f202d5267a6b34815f23e1bc88110e8455
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1563005
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
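The common thread across these fixes is defensive teardown: every step of the remove path first checks whether the resource it is about to release was actually set up and whether the hardware is still reachable, rather than assuming a fully probed, powered-on GPU. Below is a minimal, self-contained C sketch of that guard pattern; the struct and function names are hypothetical stand-ins for illustration only, not the driver's API (the real code relies on nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING), g->power_on, g->irqs_enabled and the new trace->init flag shown in the diff below).

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical device state mirroring the flags used in the diff. */
struct dev_state {
	bool hw_accessible;  /* stands in for g->regs / !NVGPU_DRIVER_IS_DYING */
	bool power_on;       /* stands in for g->power_on */
	bool irqs_enabled;   /* stands in for g->irqs_enabled */
	void *trace;         /* NULL when the optional trace unit was never set up */
	bool trace_init;     /* stands in for trace->init */
};

/* Hardware may only be touched while it is still accessible. */
static void hw_disable_engines(struct dev_state *d)
{
	if (!d->hw_accessible)
		return;  /* removal path: HW already gone, skip silently */
	printf("engines disabled\n");
}

static void free_irqs(struct dev_state *d)
{
	if (!d->irqs_enabled)
		return;  /* only release what was actually requested */
	d->irqs_enabled = false;
	printf("irqs released\n");
}

static void trace_deinit(struct dev_state *d)
{
	if (!d->trace || !d->trace_init)
		return;  /* optional unit was never initialized */
	free(d->trace);
	d->trace = NULL;
	printf("trace torn down\n");
}

/* Remove path: every step is safe regardless of init/power state. */
static void device_remove(struct dev_state *d)
{
	if (d->power_on)
		hw_disable_engines(d);
	free_irqs(d);
	trace_deinit(d);
}

int main(void)
{
	/* Unbind of a GPU that never finished probing: nothing crashes. */
	struct dev_state d = { .hw_accessible = false, .power_on = false,
			       .irqs_enabled = false, .trace = NULL,
			       .trace_init = false };

	device_remove(&d);
	return 0;
}

With every helper tolerating a partially initialized state, the remove path becomes safe to call at any point after probe starts, which is exactly what the unbind scenario requires.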
committed by: mobile promotions
parent: f6fcecfc6f
commit: 7134e9e852
@@ -409,5 +409,5 @@ void gk20a_debug_deinit(struct gk20a *g)
 	gk20a_fifo_debugfs_deinit(g);
 
 	debugfs_remove_recursive(l->debugfs);
-	debugfs_remove_recursive(l->debugfs_alias);
+	debugfs_remove(l->debugfs_alias);
 }
@@ -55,8 +55,6 @@ void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a)
 
 void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a)
 {
-	if (!IS_ERR_OR_NULL(a->debugfs_entry))
-		debugfs_remove(a->debugfs_entry);
 }
 
 void nvgpu_alloc_debugfs_init(struct gk20a *g)
@@ -267,6 +267,5 @@ int gm20b_clk_init_debugfs(struct gk20a *g)
 
 err_out:
 	pr_err("%s: Failed to make debugfs node\n", __func__);
-	debugfs_remove_recursive(l->debugfs);
 	return -ENOMEM;
 }
@@ -477,6 +477,5 @@ int gk20a_pmu_debugfs_init(struct gk20a *g)
 	return 0;
 err_out:
 	pr_err("%s: Failed to make debugfs node\n", __func__);
-	debugfs_remove_recursive(l->debugfs);
 	return -ENOMEM;
 }
@@ -226,9 +226,12 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
 	 * After this point, gk20a interrupts should not get
 	 * serviced.
 	 */
+	if (g->irqs_enabled) {
 		disable_irq(g->irq_stall);
 		if (g->irq_stall != g->irq_nonstall)
 			disable_irq(g->irq_nonstall);
+		g->irqs_enabled = 0;
+	}
 
 	/* Decrement platform power refcount */
 	if (platform->idle)
@@ -640,6 +643,18 @@ static int gk20a_pm_unrailgate(struct device *dev)
 	return ret;
 }
 
+/*
+ * Remove association of the driver with OS interrupt handler
+ */
+void nvgpu_free_irq(struct gk20a *g)
+{
+	struct device *dev = dev_from_gk20a(g);
+
+	devm_free_irq(dev, g->irq_stall, g);
+	if (g->irq_stall != g->irq_nonstall)
+		devm_free_irq(dev, g->irq_nonstall, g);
+}
+
 /*
  * Idle the GPU in preparation of shutdown/remove.
  * gk20a_driver_start_unload() does not idle the GPU, but instead changes the SW
@@ -651,6 +666,7 @@ int nvgpu_quiesce(struct gk20a *g)
 	int err;
 	struct device *dev = dev_from_gk20a(g);
 
+	if (g->power_on) {
 		err = gk20a_wait_for_idle(g);
 		if (err) {
 			nvgpu_err(g, "failed to idle GPU, err=%d", err);
@@ -659,7 +675,8 @@ int nvgpu_quiesce(struct gk20a *g)
 
 		err = gk20a_fifo_disable_all_engine_activity(g, true);
 		if (err) {
-			nvgpu_err(g, "failed to disable engine activity, err=%d",
+			nvgpu_err(g,
+				"failed to disable engine activity, err=%d",
 				err);
 			return err;
 		}
@@ -670,6 +687,7 @@ int nvgpu_quiesce(struct gk20a *g)
 				err);
 			return err;
 		}
+	}
 
 	if (gk20a_gpu_is_virtual(dev))
 		err = vgpu_pm_prepare_poweroff(dev);
@@ -679,6 +697,7 @@ int nvgpu_quiesce(struct gk20a *g)
 	if (err)
 		nvgpu_err(g, "failed to prepare for poweroff, err=%d",
 			err);
+
 	return err;
 }
 
@@ -21,6 +21,7 @@ void gk20a_remove_support(struct gk20a *g);
 void gk20a_driver_start_unload(struct gk20a *g);
 int nvgpu_quiesce(struct gk20a *g);
 int nvgpu_remove(struct device *dev, struct class *class);
+void nvgpu_free_irq(struct gk20a *g);
 
 extern struct class nvgpu_class;
 
@@ -521,13 +521,12 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)
 	if (gk20a_gpu_is_virtual(dev))
 		return;
 
+	/* only idle the GPU if the GPU is powered on */
+	if (g->power_on) {
 		gk20a_driver_start_unload(g);
 		err = nvgpu_quiesce(g);
 		/* TODO: handle failure to idle */
 		WARN(err, "gpu failed to idle during driver removal");
+	}
+
+	nvgpu_free_irq(g);
 
 	nvgpu_remove(dev, &nvgpu_pci_class);
@@ -46,8 +46,10 @@ int nvgpu_thread_create(struct nvgpu_thread *thread,
 
 void nvgpu_thread_stop(struct nvgpu_thread *thread)
 {
+	if (thread->task) {
 		kthread_stop(thread->task);
 		thread->task = NULL;
+	}
 };
 
 bool nvgpu_thread_should_stop(struct nvgpu_thread *thread)
@@ -2223,6 +2223,7 @@ static void nvgpu_remove_pmu_support(struct nvgpu_pmu *pmu)
 	if (nvgpu_alloc_initialized(&pmu->dmem))
 		nvgpu_alloc_destroy(&pmu->dmem);
 
+	if (pmu->fw)
 		nvgpu_release_firmware(g, pmu->fw);
 
 	nvgpu_mutex_destroy(&pmu->elpg_mutex);
@@ -16,6 +16,7 @@
 
 #include <nvgpu/pramin.h>
 #include <nvgpu/page_allocator.h>
+#include <nvgpu/enabled.h>
 
 #include "gk20a/gk20a.h"
 
@@ -88,6 +89,14 @@ void nvgpu_pramin_access_batched(struct gk20a *g, struct nvgpu_mem *mem,
 	void *sgl;
 	u32 byteoff, start_reg, until_end, n;
 
+	/*
+	 * TODO: Vidmem is not accesible through pramin on shutdown path.
+	 * driver should be refactored to prevent this from happening, but for
+	 * now it is ok just to ignore the writes
+	 */
+	if (!g->regs && nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
+		return;
+
 	alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl);
 	sgt = &alloc->sgt;
 	for (sgl = sgt->sgl; sgl; sgl = nvgpu_sgt_get_next(sgt, sgl)) {
@@ -465,21 +465,30 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 
 	trace_gk20a_free_channel(ch->chid);
 
+	/*
+	 * Disable channel/TSG and unbind here. This should not be executed if
+	 * HW access is not available during shutdown/removal path as it will
+	 * trigger a timeout
+	 */
+	if (!nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
 		/* abort channel and remove from runlist */
 		if (gk20a_is_channel_marked_as_tsg(ch)) {
 			err = g->ops.fifo.tsg_unbind_channel(ch);
 			if (err)
-				nvgpu_err(g, "failed to unbind channel %d from TSG", ch->chid);
+				nvgpu_err(g,
+					"failed to unbind channel %d from TSG",
+					ch->chid);
 			/*
 			 * Channel is not a part of TSG this point onwards
 			 * So stash its status and use it whenever necessary
-			 * e.g. while releasing gr_ctx in g->ops.gr.free_channel_ctx()
+			 * e.g. while releasing gr_ctx in
+			 * g->ops.gr.free_channel_ctx()
 			 */
 			was_tsg = true;
 		} else {
 			gk20a_disable_channel(ch);
 		}
-
+	}
 	/* wait until there's only our ref to the channel */
 	if (!force)
 		gk20a_wait_until_counter_is_N(
@@ -67,6 +67,7 @@ struct gk20a_fecs_trace {
 	struct nvgpu_mutex hash_lock;
 	struct nvgpu_mutex poll_lock;
 	struct nvgpu_thread poll_task;
+	bool init;
 };
 
 #ifdef CONFIG_GK20A_CTXSW_TRACE
@@ -547,23 +548,12 @@ static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
 			&gk20a_fecs_trace_debugfs_ring_fops);
 }
 
-static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
-{
-	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
-
-	debugfs_remove_recursive(l->debugfs);
-}
-
 #else
 
 static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
 {
 }
 
-static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
-{
-}
-
 #endif /* CONFIG_DEBUG_FS */
 
 int gk20a_fecs_trace_init(struct gk20a *g)
@@ -598,6 +588,9 @@ int gk20a_fecs_trace_init(struct gk20a *g)
 		NVGPU_GPU_FLAGS_SUPPORT_FECS_CTXSW_TRACE;
 
 	gk20a_fecs_trace_debugfs_init(g);
+
+	trace->init = true;
+
 	return 0;
 
 clean_hash_lock:
@@ -682,6 +675,7 @@ int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
 {
 	u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);
 
+	if (g->fecs_trace) {
 		gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
 			"ch=%p context_ptr=%x", ch, context_ptr);
 
@@ -691,6 +685,7 @@ int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
 			gk20a_fecs_trace_poll(g);
 		}
 		gk20a_fecs_trace_hash_del(g, context_ptr);
+	}
 	return 0;
 }
 
@@ -709,7 +704,9 @@ int gk20a_fecs_trace_deinit(struct gk20a *g)
 {
 	struct gk20a_fecs_trace *trace = g->fecs_trace;
 
-	gk20a_fecs_trace_debugfs_cleanup(g);
+	if (!trace->init)
+		return 0;
+
 	nvgpu_thread_stop(&trace->poll_task);
 	gk20a_fecs_trace_free_ring(g);
 	gk20a_fecs_trace_free_hash_table(g);