gpu: nvgpu: fix arbiter teardown on PCI

The driver does not properly tear down the arbiter on PCI driver unload. This change makes sure that the workqueues are drained before the driver is torn down. Bug 200277762 JIRA: EVLR-1023 Change-Id: If98fd00e27949ba1569dd26e2af02b75897231a7 Signed-off-by: David Nieto <dmartineznie@nvidia.com> Reviewed-on: http://git-master/r/1320147 Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
2025-12-25 02:52:51 +03:00 · 2017-02-13 11:22:59 -08:00
parent 50f371f891
commit 469308beca
3 changed files with 63 additions and 26 deletions
--- a/drivers/gpu/nvgpu/clk/clk_arb.c
+++ b/drivers/gpu/nvgpu/clk/clk_arb.c
@@ -403,7 +403,8 @@ void nvgpu_clk_arb_schedule_alarm(struct gk20a *g, u32 alarm)
 	struct nvgpu_clk_arb *arb = g->clk_arb;

 	nvgpu_clk_arb_set_global_alarm(g, alarm);
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);
 }

 static void nvgpu_clk_arb_clear_global_alarm(struct gk20a *g, u32 alarm)
@@ -455,8 +456,30 @@ static void nvgpu_clk_arb_set_global_alarm(struct gk20a *g, u32 alarm)

 void nvgpu_clk_arb_cleanup_arbiter(struct gk20a *g)
 {
+	struct nvgpu_clk_arb *arb = g->clk_arb;
+	int index;
+
+	if (arb) {
+		cancel_work_sync(&arb->vf_table_fn_work);
+		destroy_workqueue(arb->vf_table_work_queue);
+		arb->vf_table_work_queue = NULL;
+
+		cancel_work_sync(&arb->update_fn_work);
+		destroy_workqueue(arb->update_work_queue);
+		arb->update_work_queue = NULL;
+
+		kfree(arb->gpc2clk_f_points);
+		kfree(arb->mclk_f_points);
+
+		for (index = 0; index < 2; index++) {
+			kfree(arb->vf_table_pool[index].gpc2clk_points);
+			kfree(arb->vf_table_pool[index].mclk_points);
+		}
+	}
+
 	nvgpu_mutex_destroy(&g->clk_arb->pstate_lock);
 	kfree(g->clk_arb);
+	g->clk_arb = NULL;
 }

 static int nvgpu_clk_arb_install_fd(struct gk20a *g,
@@ -575,9 +598,11 @@ static void nvgpu_clk_arb_free_session(struct kref *refcount)

 	gk20a_dbg_fn("");

-	nvgpu_spinlock_acquire(&arb->sessions_lock);
-	list_del_rcu(&session->link);
-	nvgpu_spinlock_release(&arb->sessions_lock);
+	if (arb) {
+		nvgpu_spinlock_acquire(&arb->sessions_lock);
+		list_del_rcu(&session->link);
+		nvgpu_spinlock_release(&arb->sessions_lock);
+	}

 	head = llist_del_all(&session->targets);
 	llist_for_each_entry_safe(dev, tmp, head, node) {
@@ -596,8 +621,8 @@ void nvgpu_clk_arb_release_session(struct gk20a *g,

 	session->zombie = true;
 	kref_put(&session->refcount, nvgpu_clk_arb_free_session);
-
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb && arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);
 }

 int nvgpu_clk_arb_install_event_fd(struct gk20a *g,
@@ -964,8 +989,8 @@ exit_vf_table:
 	if (status < 0)
 		nvgpu_clk_arb_set_global_alarm(g,
 			EVENT(ALARM_VF_TABLE_UPDATE_FAILED));
-
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);

 	return status;
 }
@@ -973,8 +998,8 @@ exit_vf_table:
 void nvgpu_clk_arb_schedule_vf_table_update(struct gk20a *g)
 {
 	struct nvgpu_clk_arb *arb = g->clk_arb;
-
-	queue_work(arb->vf_table_work_queue, &arb->vf_table_fn_work);
+	if (arb->vf_table_work_queue)
+		queue_work(arb->vf_table_work_queue, &arb->vf_table_fn_work);
 }

 static void nvgpu_clk_arb_run_vf_table_cb(struct work_struct *work)
@@ -991,8 +1016,9 @@ static void nvgpu_clk_arb_run_vf_table_cb(struct work_struct *work)
 			"failed to cache VF table");
 		nvgpu_clk_arb_set_global_alarm(g,
 			EVENT(ALARM_VF_TABLE_UPDATE_FAILED));
-
-		queue_work(arb->update_work_queue, &arb->update_fn_work);
+		if (arb->update_work_queue)
+			queue_work(arb->update_work_queue,
+				&arb->update_fn_work);

 		return;
 	}
@@ -1490,8 +1516,8 @@ int nvgpu_clk_arb_commit_request_fd(struct gk20a *g,
 	}
 	kref_get(&dev->refcount);
 	llist_add(&dev->node, &session->targets);
-
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);

 fdput_fd:
 	fdput(fd);
@@ -1568,15 +1594,12 @@ static int nvgpu_clk_arb_release_completion_dev(struct inode *inode,
 {
 	struct nvgpu_clk_dev *dev = filp->private_data;
 	struct nvgpu_clk_session *session = dev->session;
-	struct nvgpu_clk_arb *arb;

-	arb = session->g->clk_arb;

 	gk20a_dbg_fn("");

 	kref_put(&session->refcount, nvgpu_clk_arb_free_session);
 	kref_put(&dev->refcount, nvgpu_clk_arb_free_fd);
-
 	return 0;
 }

@@ -1591,15 +1614,17 @@ static int nvgpu_clk_arb_release_event_dev(struct inode *inode,

 	gk20a_dbg_fn("");

-	nvgpu_spinlock_acquire(&arb->users_lock);
-	list_del_rcu(&dev->link);
-	nvgpu_spinlock_release(&arb->users_lock);
+	if (arb) {
+		nvgpu_spinlock_acquire(&arb->users_lock);
+		list_del_rcu(&dev->link);
+		nvgpu_spinlock_release(&arb->users_lock);
+	}

 	synchronize_rcu();
 	kref_put(&session->refcount, nvgpu_clk_arb_free_session);

 	nvgpu_clk_notification_queue_free(&dev->queue);
-	kfree(dev);
+	kref_put(&dev->refcount, nvgpu_clk_arb_free_fd);

 	return 0;
 }
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -39,6 +39,7 @@

 struct gk20a_ctrl_priv {
 	struct device *dev;
+	struct gk20a *g;
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	struct nvgpu_clk_session *clk_session;
 #endif
@@ -58,35 +59,42 @@ int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 	priv = kzalloc(sizeof(struct gk20a_ctrl_priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
-
 	filp->private_data = priv;
 	priv->dev = g->dev;
+	/*
+	 * We don't close the arbiter fds after driver teardown, to support
+	 * GPU_LOST events, so we store g here instead of dereferencing the
+	 * dev structure on teardown.
+	 */
+	priv->g = g;

 	if (!g->gr.sw_ready) {
 		err = gk20a_busy(g->dev);
 		if (err)
 			return err;
-
 		gk20a_idle(g->dev);
 	}

 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	err = nvgpu_clk_arb_init_session(g, &priv->clk_session);
+	if (err)
+		return err;
 #endif
+
 	return err;
 }
-
 int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp)
 {
 	struct gk20a_ctrl_priv *priv = filp->private_data;
+	struct gk20a *g = priv->g;

 	gk20a_dbg_fn("");

 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	if (priv->clk_session)
-		nvgpu_clk_arb_release_session(gk20a_from_dev(priv->dev),
-				priv->clk_session);
+		nvgpu_clk_arb_release_session(g, priv->clk_session);
 #endif
+
 	kfree(priv);

 	return 0;
--- a/drivers/gpu/nvgpu/pci.c
+++ b/drivers/gpu/nvgpu/pci.c
@@ -460,6 +460,10 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)
 	gk20a_wait_for_idle(&pdev->dev);
 	gk20a_dbg(gpu_dbg_shutdown, "Driver idle.\n");

+#ifdef CONFIG_ARCH_TEGRA_18x_SOC
+	nvgpu_clk_arb_cleanup_arbiter(g);
+#endif
+
 	gk20a_user_deinit(g->dev, &nvgpu_pci_class);
 	gk20a_dbg(gpu_dbg_shutdown, "User de-init done.\b");