From 179e3cf84ae9b702c943c7a58c5e4f778267c6ca Mon Sep 17 00:00:00 2001
From: Abdul Salam <absalam@nvidia.com>
Date: Sat, 6 Apr 2019 15:32:49 +0530
Subject: [PATCH] gpu: nvgpu: Fix exit path of PMU and clk_arb

This patch fixes the following two issues.
1. Currently clk arb exit is called after the GPU registers are released.
   This causes a crash when the clk arb WQ accesses a GPU HW register for
   status. The ideal way is to exit clk_arb, which stops the WQ from
   running, before calling the register lockout.
2. Check whether the dGPU is dying while processing PMU commands.
   This prevents a race condition when the PMU is waiting for a response
   and the device is shut down.

Bug 200488054

Change-Id: I812b07af7db4494d5ea2ed6197742ceb23d30a4b
Signed-off-by: Abdul Salam <absalam@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2081916
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/clk_arb/clk_arb.c       | 11 +++++------
 drivers/gpu/nvgpu/common/clk_arb/clk_arb_gp10b.c |  1 +
 drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.c |  7 +++++++
 drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.h |  2 +-
 drivers/gpu/nvgpu/common/init/nvgpu_init.c       |  3 +++
 drivers/gpu/nvgpu/common/pmu/ipc/pmu_cmd.c       |  4 ++++
 drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c       | 16 ++++++++++++++++
 drivers/gpu/nvgpu/include/nvgpu/clk_arb.h        |  1 +
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h          |  1 +
 drivers/gpu/nvgpu/tu104/hal_tu104.c              |  1 +
 10 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/clk_arb/clk_arb.c b/drivers/gpu/nvgpu/common/clk_arb/clk_arb.c
index c37e33e52..098add86c 100644
--- a/drivers/gpu/nvgpu/common/clk_arb/clk_arb.c
+++ b/drivers/gpu/nvgpu/common/clk_arb/clk_arb.c
@@ -442,11 +442,11 @@ static void nvgpu_clk_arb_worker_process(struct gk20a *g, int *get)
 
 		if (!work_item) {
 			/*
-			 * Woke up for some other reason, but there are no
-			 * other reasons than a work item added in the items list
-			 * currently, so warn and ack the message.
+			 * Woke up but found no work item. This can happen
+			 * when the thread is stopped because the driver is
+			 * dying, so log it and ack the message.
 			 */
-			nvgpu_warn(g, "Spurious worker event!");
+			nvgpu_info(g, "Spurious worker event!");
 			++*get;
 			break;
 		}
@@ -621,7 +621,7 @@ void nvgpu_clk_arb_send_thermal_alarm(struct gk20a *g)
 		BIT32(NVGPU_EVENT_ALARM_THERMAL_ABOVE_THRESHOLD));
 }
 
-static void nvgpu_clk_arb_worker_deinit(struct gk20a *g)
+void nvgpu_clk_arb_worker_deinit(struct gk20a *g)
 {
 	nvgpu_mutex_acquire(&g->clk_arb_worker.start_lock);
 	nvgpu_thread_stop(&g->clk_arb_worker.poll_task);
@@ -635,7 +635,6 @@ void nvgpu_clk_arb_cleanup_arbiter(struct gk20a *g)
 	nvgpu_mutex_acquire(&g->clk_arb_enable_lock);
 
 	if (arb) {
-		nvgpu_clk_arb_worker_deinit(g);
 		g->ops.clk_arb.clk_arb_cleanup(g->clk_arb);
 	}
 
diff --git a/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gp10b.c b/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gp10b.c
index 6c477d86f..9728c13d1 100644
--- a/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gp10b.c
+++ b/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gp10b.c
@@ -396,6 +396,7 @@ void gp10b_clk_arb_cleanup(struct nvgpu_clk_arb *arb)
 	struct gk20a *g = arb->g;
 	int index;
 
+	nvgpu_clk_arb_worker_deinit(g);
 	nvgpu_kfree(g, arb->gpc2clk_f_points);
 
 	for (index = 0; index < 2; index++) {
diff --git a/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.c b/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.c
index 6c63303be..e5a586925 100644
--- a/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.c
+++ b/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.c
@@ -522,3 +522,10 @@ void gv100_clk_arb_cleanup(struct nvgpu_clk_arb *arb)
 
 	g->clk_arb = NULL;
 }
+
+void gv100_stop_clk_arb_threads(struct gk20a *g)
+{
+	nvgpu_mutex_acquire(&g->clk_arb_worker.start_lock);
+	nvgpu_thread_stop(&g->clk_arb_worker.poll_task);
+	nvgpu_mutex_release(&g->clk_arb_worker.start_lock);
+}
diff --git a/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.h b/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.h
index a18b0e1e2..f8e1112ff 100644
--- a/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.h
+++ b/drivers/gpu/nvgpu/common/clk_arb/clk_arb_gv100.h
@@ -36,5 +36,5 @@ int gv100_get_arbiter_clk_default(struct gk20a *g, u32 api_domain,
 int gv100_init_clk_arbiter(struct gk20a *g);
 void gv100_clk_arb_run_arbiter_cb(struct nvgpu_clk_arb *arb);
 void gv100_clk_arb_cleanup(struct nvgpu_clk_arb *arb);
-
+void gv100_stop_clk_arb_threads(struct gk20a *g);
 #endif /* CLK_ARB_GV100_H */
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index 757c0adca..0a81c1419 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -126,6 +126,9 @@ int gk20a_prepare_poweroff(struct gk20a *g)
 	if (g->ops.clk.suspend_clk_support != NULL) {
 		g->ops.clk.suspend_clk_support(g);
 	}
+	if (g->ops.clk_arb.stop_clk_arb_threads != NULL) {
+		g->ops.clk_arb.stop_clk_arb_threads(g);
+	}
 	gk20a_mask_interrupts(g);
 
 	g->power_on = false;
diff --git a/drivers/gpu/nvgpu/common/pmu/ipc/pmu_cmd.c b/drivers/gpu/nvgpu/common/pmu/ipc/pmu_cmd.c
index c30f8c130..9c6a0c364 100644
--- a/drivers/gpu/nvgpu/common/pmu/ipc/pmu_cmd.c
+++ b/drivers/gpu/nvgpu/common/pmu/ipc/pmu_cmd.c
@@ -615,6 +615,10 @@ int nvgpu_pmu_rpc_execute(struct nvgpu_pmu *pmu, struct nv_pmu_rpc_header *rpc,
 	void *rpc_buff = NULL;
 	int status = 0;
 
+	if (nvgpu_can_busy(g) == 0) {
+		return 0;
+	}
+
 	if (!pmu->pmu_ready) {
 		nvgpu_warn(g, "PMU is not ready to process RPC");
 		status = EINVAL;
diff --git a/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c b/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c
index 941173381..684dc342f 100644
--- a/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c
+++ b/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c
@@ -480,6 +480,10 @@ int nvgpu_pmu_process_message(struct nvgpu_pmu *pmu)
 	struct gk20a *g = gk20a_from_pmu(pmu);
 	int err;
 
+	if (nvgpu_can_busy(g) == 0) {
+		return 0;
+	}
+
 	if (unlikely(!pmu->pmu_ready)) {
 		err = pmu_process_init_msg(pmu, &msg);
 		if (err != 0) {
@@ -499,6 +503,10 @@ int nvgpu_pmu_process_message(struct nvgpu_pmu *pmu)
 
 	while (pmu_read_message(pmu, PMU_MESSAGE_QUEUE, &msg, &status)) {
 
+		if (nvgpu_can_busy(g) == 0) {
+			return 0;
+		}
+
 		nvgpu_pmu_dbg(g, "read msg hdr: ");
 		nvgpu_pmu_dbg(g, "unit_id = 0x%08x, size = 0x%08x",
 			msg.hdr.unit_id, msg.hdr.size);
@@ -525,6 +533,10 @@ void nvgpu_pmu_rpc_handler(struct gk20a *g, struct pmu_msg *msg,
 	struct rpc_handler_payload *rpc_payload =
 		(struct rpc_handler_payload *)param;
 
+	if (nvgpu_can_busy(g) == 0) {
+		return ;
+	}
+
 	(void) memset(&rpc, 0, sizeof(struct nv_pmu_rpc_header));
 	nvgpu_memcpy((u8 *)&rpc, (u8 *)rpc_payload->rpc_buff,
 		sizeof(struct nv_pmu_rpc_header));
@@ -604,6 +616,10 @@ int pmu_wait_message_cond_status(struct nvgpu_pmu *pmu, u32 timeout_ms,
 			return 0;
 		}
 
+		if (nvgpu_can_busy(g) == 0) {
+			return 0;
+		}
+
 		if (g->ops.pmu.pmu_is_interrupted(pmu)) {
 			g->ops.pmu.pmu_isr(g);
 		}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/clk_arb.h b/drivers/gpu/nvgpu/include/nvgpu/clk_arb.h
index d9f12bfc2..8cf60fb1c 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/clk_arb.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/clk_arb.h
@@ -366,6 +366,7 @@ void nvgpu_clk_notification_queue_free(struct gk20a *g,
 void nvgpu_clk_arb_event_post_event(struct nvgpu_clk_dev *dev);
 
 unsigned long nvgpu_clk_measure_freq(struct gk20a *g, u32 api_domain);
+void nvgpu_clk_arb_worker_deinit(struct gk20a *g);
 
 #ifdef CONFIG_DEBUG_FS
 int nvgpu_clk_arb_debugfs_init(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index b98055c2a..603ff9f71 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -1513,6 +1513,7 @@ struct gpu_ops {
 		 *  before calling this function */
 		u32 (*get_current_pstate)(struct gk20a *g);
 		void (*clk_arb_cleanup)(struct nvgpu_clk_arb *arb);
+		void (*stop_clk_arb_threads)(struct gk20a *g);
 	} clk_arb;
 	struct {
 		int (*handle_pmu_perf_event)(struct gk20a *g, void *pmu_msg);
diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c
index 55c5ada7a..4c6e4a697 100644
--- a/drivers/gpu/nvgpu/tu104/hal_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c
@@ -1193,6 +1193,7 @@ static const struct gpu_ops tu104_ops = {
 		.arbiter_clk_init = gv100_init_clk_arbiter,
 		.clk_arb_run_arbiter_cb = gv100_clk_arb_run_arbiter_cb,
 		.clk_arb_cleanup = gv100_clk_arb_cleanup,
+		.stop_clk_arb_threads = gv100_stop_clk_arb_threads,
 	},
 	.regops = {
 		.exec_regops = exec_regops_gk20a,