gpu: nvgpu: Fix exit path of PMU and clk_arb

This patch fixes the below two issues.
1. Currently clk arb exit is called after GPU registers are released.
   This causes a crash when the clk arb worker accesses a GPU HW
   register for status. The fix is to exit clk_arb, which stops the
   worker from running, before the register lockout (see the first
   sketch below).
2. Check if the dGPU is dying during processing of PMU commands. This
   prevents a race condition where the PMU is waiting for a response
   while the device is shut down (see the second sketch below).
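
The ordering problem in (1) reduces to a classic teardown race: a
poller thread keeps touching a resource that shutdown releases first.
Below is a minimal userspace sketch of the corrected order; it is not
nvgpu code, and poller/fake_reg are hypothetical stand-ins for the
clk arb worker and the GPU register mapping.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static atomic_int stop_flag;
static int *fake_reg;	/* stand-in for a GPU HW register mapping */

static void *poller(void *arg)
{
	(void)arg;
	while (!atomic_load(&stop_flag)) {
		int v = *fake_reg;	/* would crash if freed underneath us */
		(void)v;
		usleep(1000);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	fake_reg = calloc(1, sizeof(*fake_reg));
	pthread_create(&t, NULL, poller, NULL);
	usleep(10000);

	/* Correct order, mirroring this commit: stop the worker first... */
	atomic_store(&stop_flag, 1);
	pthread_join(t, NULL);

	/* ...then release the resource it was polling. */
	free(fake_reg);
	fake_reg = NULL;

	puts("clean shutdown");
	return 0;
}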
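
For (2), every PMU entry point now bails out early when
nvgpu_can_busy() reports that the device cannot be used. A userspace
sketch of the same guard pattern, with can_busy() and
process_message() as hypothetical stand-ins for the driver functions:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool device_dying;

/* stand-in for nvgpu_can_busy(): 0 means "do not touch the HW" */
static int can_busy(void)
{
	return atomic_load(&device_dying) ? 0 : 1;
}

static int process_message(int msg)
{
	if (can_busy() == 0) {
		return 0;	/* device is going away: drop the work */
	}
	printf("processed msg %d\n", msg);
	return 1;
}

int main(void)
{
	process_message(1);			/* normal path */
	atomic_store(&device_dying, true);
	process_message(2);			/* skipped, no HW access */
	return 0;
}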

Bug 200488054

Change-Id: I812b07af7db4494d5ea2ed6197742ceb23d30a4b
Signed-off-by: Abdul Salam <absalam@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2081916
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Abdul Salam
Date: 2019-04-06 15:32:49 +05:30
Committed by: mobile promotions
Commit: 179e3cf84a (parent: 1a843ba051)
10 changed files with 40 additions and 7 deletions

View File

@@ -442,11 +442,11 @@ static void nvgpu_clk_arb_worker_process(struct gk20a *g, int *get)
 	if (!work_item) {
 		/*
-		 * Woke up for some other reason, but there are no
-		 * other reasons than a work item added in the items list
-		 * currently, so warn and ack the message.
+		 * Woke up but found work_item empty ?
+		 * This can happen when thread is stopped as driver is
+		 * dying, so inform and ack the message.
 		 */
-		nvgpu_warn(g, "Spurious worker event!");
+		nvgpu_info(g, "Spurious worker event!");
 		++*get;
 		break;
 	}
@@ -621,7 +621,7 @@ void nvgpu_clk_arb_send_thermal_alarm(struct gk20a *g)
 		BIT32(NVGPU_EVENT_ALARM_THERMAL_ABOVE_THRESHOLD));
 }
 
-static void nvgpu_clk_arb_worker_deinit(struct gk20a *g)
+void nvgpu_clk_arb_worker_deinit(struct gk20a *g)
 {
 	nvgpu_mutex_acquire(&g->clk_arb_worker.start_lock);
 	nvgpu_thread_stop(&g->clk_arb_worker.poll_task);
@@ -635,7 +635,6 @@ void nvgpu_clk_arb_cleanup_arbiter(struct gk20a *g)
 	nvgpu_mutex_acquire(&g->clk_arb_enable_lock);
 
 	if (arb) {
-		nvgpu_clk_arb_worker_deinit(g);
 		g->ops.clk_arb.clk_arb_cleanup(g->clk_arb);
 	}

View File

@@ -396,6 +396,7 @@ void gp10b_clk_arb_cleanup(struct nvgpu_clk_arb *arb)
 	struct gk20a *g = arb->g;
 	int index;
 
+	nvgpu_clk_arb_worker_deinit(g);
 	nvgpu_kfree(g, arb->gpc2clk_f_points);
 
 	for (index = 0; index < 2; index++) {

View File

@@ -522,3 +522,10 @@ void gv100_clk_arb_cleanup(struct nvgpu_clk_arb *arb)
 	g->clk_arb = NULL;
 }
+
+void gv100_stop_clk_arb_threads(struct gk20a *g)
+{
+	nvgpu_mutex_acquire(&g->clk_arb_worker.start_lock);
+	nvgpu_thread_stop(&g->clk_arb_worker.poll_task);
+	nvgpu_mutex_release(&g->clk_arb_worker.start_lock);
+}

View File

@@ -36,5 +36,5 @@ int gv100_get_arbiter_clk_default(struct gk20a *g, u32 api_domain,
 int gv100_init_clk_arbiter(struct gk20a *g);
 void gv100_clk_arb_run_arbiter_cb(struct nvgpu_clk_arb *arb);
 void gv100_clk_arb_cleanup(struct nvgpu_clk_arb *arb);
+void gv100_stop_clk_arb_threads(struct gk20a *g);
 
 #endif /* CLK_ARB_GV100_H */

View File

@@ -126,6 +126,9 @@ int gk20a_prepare_poweroff(struct gk20a *g)
 	if (g->ops.clk.suspend_clk_support != NULL) {
 		g->ops.clk.suspend_clk_support(g);
 	}
 
+	if (g->ops.clk_arb.stop_clk_arb_threads != NULL) {
+		g->ops.clk_arb.stop_clk_arb_threads(g);
+	}
 	gk20a_mask_interrupts(g);
 
 	g->power_on = false;

View File

@@ -615,6 +615,10 @@ int nvgpu_pmu_rpc_execute(struct nvgpu_pmu *pmu, struct nv_pmu_rpc_header *rpc,
 	void *rpc_buff = NULL;
 	int status = 0;
 
+	if (nvgpu_can_busy(g) == 0) {
+		return 0;
+	}
+
 	if (!pmu->pmu_ready) {
 		nvgpu_warn(g, "PMU is not ready to process RPC");
 		status = EINVAL;

View File

@@ -480,6 +480,10 @@ int nvgpu_pmu_process_message(struct nvgpu_pmu *pmu)
 	struct gk20a *g = gk20a_from_pmu(pmu);
 	int err;
 
+	if (nvgpu_can_busy(g) == 0) {
+		return 0;
+	}
+
 	if (unlikely(!pmu->pmu_ready)) {
 		err = pmu_process_init_msg(pmu, &msg);
 		if (err != 0) {
@@ -499,6 +503,10 @@ int nvgpu_pmu_process_message(struct nvgpu_pmu *pmu)
 	while (pmu_read_message(pmu, PMU_MESSAGE_QUEUE, &msg, &status)) {
 
+		if (nvgpu_can_busy(g) == 0) {
+			return 0;
+		}
+
 		nvgpu_pmu_dbg(g, "read msg hdr: ");
 		nvgpu_pmu_dbg(g, "unit_id = 0x%08x, size = 0x%08x",
 			msg.hdr.unit_id, msg.hdr.size);
@@ -525,6 +533,10 @@ void nvgpu_pmu_rpc_handler(struct gk20a *g, struct pmu_msg *msg,
 	struct rpc_handler_payload *rpc_payload =
 		(struct rpc_handler_payload *)param;
 
+	if (nvgpu_can_busy(g) == 0) {
+		return;
+	}
+
 	(void) memset(&rpc, 0, sizeof(struct nv_pmu_rpc_header));
 	nvgpu_memcpy((u8 *)&rpc, (u8 *)rpc_payload->rpc_buff,
 		sizeof(struct nv_pmu_rpc_header));
@@ -604,6 +616,10 @@ int pmu_wait_message_cond_status(struct nvgpu_pmu *pmu, u32 timeout_ms,
 		return 0;
 	}
 
+	if (nvgpu_can_busy(g) == 0) {
+		return 0;
+	}
+
 	if (g->ops.pmu.pmu_is_interrupted(pmu)) {
 		g->ops.pmu.pmu_isr(g);
 	}

View File

@@ -366,6 +366,7 @@ void nvgpu_clk_notification_queue_free(struct gk20a *g,
 void nvgpu_clk_arb_event_post_event(struct nvgpu_clk_dev *dev);
 unsigned long nvgpu_clk_measure_freq(struct gk20a *g, u32 api_domain);
+void nvgpu_clk_arb_worker_deinit(struct gk20a *g);
 
 #ifdef CONFIG_DEBUG_FS
 int nvgpu_clk_arb_debugfs_init(struct gk20a *g);

View File

@@ -1513,6 +1513,7 @@ struct gpu_ops {
 		 * before calling this function */
 		u32 (*get_current_pstate)(struct gk20a *g);
 		void (*clk_arb_cleanup)(struct nvgpu_clk_arb *arb);
+		void (*stop_clk_arb_threads)(struct gk20a *g);
 	} clk_arb;
 	struct {
 		int (*handle_pmu_perf_event)(struct gk20a *g, void *pmu_msg);

View File

@@ -1193,6 +1193,7 @@ static const struct gpu_ops tu104_ops = {
 		.arbiter_clk_init = gv100_init_clk_arbiter,
 		.clk_arb_run_arbiter_cb = gv100_clk_arb_run_arbiter_cb,
 		.clk_arb_cleanup = gv100_clk_arb_cleanup,
+		.stop_clk_arb_threads = gv100_stop_clk_arb_threads,
 	},
 	.regops = {
 		.exec_regops = exec_regops_gk20a,