gpu: nvgpu: Set unserviceable flag early during RC

During recovery, we set ch->unserviceable at the end, after we have
preempted the TSG and reset the engines. That can be too late:
user-space may keep submitting work to the broken channel in the
meantime, which is not desirable. Move the setting of the
unserviceable flag to the very start of the recovery sequence.

Another thread doing a submit can still read the unserviceable flag
just before it is set here, leaving that submit stuck if recovery
completes before the submit thread has advanced far enough to set up
a post fence visible to other threads. This could be fixed with a big
lock, or with a double check at the end of the submit code after the
job data has been made visible; a minimal sketch of the double-check
approach follows below.

We still release the fences and the semaphore and error notifier wait
queues at the end, so user-space does not trigger a channel unbind
while the channel is being recovered.
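
The sketch below only illustrates the double-check idea; the types and
function names (channel, post_fence, recover_channel, submit_job) are
hypothetical and are not nvgpu APIs.

    /* Illustrative only: these types and helpers are not nvgpu APIs. */
    #include <errno.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct channel {
            atomic_bool unserviceable;   /* set at the very start of recovery */
    };

    struct post_fence {
            atomic_bool signalled;
    };

    /* Recovery path: mark the channel dead first, wake waiters later. */
    static void recover_channel(struct channel *ch)
    {
            atomic_store(&ch->unserviceable, true);
            /* ... preempt the TSG, reset engines, signal existing fences ... */
    }

    /* Submit path with the double check after the job becomes visible. */
    static int submit_job(struct channel *ch, struct post_fence *fence)
    {
            if (atomic_load(&ch->unserviceable))
                    return -ENODEV;   /* normal early reject */

            atomic_store(&fence->signalled, false);
            /* ... publish the job and its post fence to other threads ... */

            /*
             * Recovery may have started between the early check and the
             * point where the fence became visible; re-check and signal
             * the fence so no waiter is left stuck.
             */
            if (atomic_load(&ch->unserviceable))
                    atomic_store(&fence->signalled, true);

            return 0;
    }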

Also, change the handle_mmu_fault APIs to return void, since the
debug_dump return value is not used by any of the callers.

JIRA NVGPU-5843

Change-Id: Ib42c2816dd1dca542e4f630805411cab75fad90e
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2385256
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

View File

@@ -1694,12 +1694,9 @@ static bool nvgpu_channel_ctxsw_timeout_debug_dump_state(
return verbose;
}
-static void nvgpu_channel_set_has_timedout_and_wakeup_wqs(struct gk20a *g,
-struct nvgpu_channel *ch)
+void nvgpu_channel_wakeup_wqs(struct gk20a *g,
+struct nvgpu_channel *ch)
{
-/* mark channel as faulted */
-nvgpu_channel_set_unserviceable(ch);
/* unblock pending waits */
if (nvgpu_cond_broadcast_interruptible(&ch->semaphore_wq) != 0) {
nvgpu_warn(g, "failed to broadcast");
@@ -1714,7 +1711,11 @@ bool nvgpu_channel_mark_error(struct gk20a *g, struct nvgpu_channel *ch)
bool verbose;
verbose = nvgpu_channel_ctxsw_timeout_debug_dump_state(ch);
-nvgpu_channel_set_has_timedout_and_wakeup_wqs(g, ch);
+/* mark channel as faulted */
+nvgpu_channel_set_unserviceable(ch);
+nvgpu_channel_wakeup_wqs(g, ch);
return verbose;
}
@@ -1736,7 +1737,8 @@ void nvgpu_channel_sw_quiesce(struct gk20a *g)
if (ch != NULL) {
nvgpu_channel_set_error_notifier(g, ch,
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
-nvgpu_channel_set_has_timedout_and_wakeup_wqs(g, ch);
+nvgpu_channel_set_unserviceable(ch);
+nvgpu_channel_wakeup_wqs(g, ch);
nvgpu_channel_put(ch);
}
}

View File

@@ -423,6 +423,36 @@ clean_up_mutex:
return err;
}
+void nvgpu_tsg_set_unserviceable(struct gk20a *g,
+struct nvgpu_tsg *tsg)
+{
+struct nvgpu_channel *ch = NULL;
+
+nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+nvgpu_list_for_each_entry(ch, &tsg->ch_list, nvgpu_channel, ch_entry) {
+if (nvgpu_channel_get(ch) != NULL) {
+nvgpu_channel_set_unserviceable(ch);
+nvgpu_channel_put(ch);
+}
+}
+nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+}
+
+void nvgpu_tsg_wakeup_wqs(struct gk20a *g,
+struct nvgpu_tsg *tsg)
+{
+struct nvgpu_channel *ch = NULL;
+
+nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+nvgpu_list_for_each_entry(ch, &tsg->ch_list, nvgpu_channel, ch_entry) {
+if (nvgpu_channel_get(ch) != NULL) {
+nvgpu_channel_wakeup_wqs(g, ch);
+nvgpu_channel_put(ch);
+}
+}
+nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+}
bool nvgpu_tsg_mark_error(struct gk20a *g,
struct nvgpu_tsg *tsg)
{

View File

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -188,7 +188,7 @@ void gk20a_fifo_intr_0_isr(struct gk20a *g)
}
if ((fifo_intr & fifo_intr_0_mmu_fault_pending_f()) != 0U) {
-(void) gk20a_fifo_handle_mmu_fault(g, 0, INVAL_ID, false);
+gk20a_fifo_handle_mmu_fault(g, 0, INVAL_ID, false);
clear_intr |= fifo_intr_0_mmu_fault_pending_f();
}

View File

@@ -231,7 +231,7 @@ void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g)
nvgpu_err(g, "dropped mmu fault (0x%08x)", fault_id);
}
-bool gk20a_fifo_handle_mmu_fault_locked(
+void gk20a_fifo_handle_mmu_fault_locked(
struct gk20a *g,
u32 mmu_fault_engines, /* queried from HW if 0 */
u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/
@@ -240,7 +240,6 @@ bool gk20a_fifo_handle_mmu_fault_locked(
bool fake_fault;
unsigned long fault_id;
unsigned long engine_mmu_fault_id;
-bool debug_dump = true;
struct nvgpu_engine_status_info engine_status;
bool deferred_reset_pending = false;
@@ -340,6 +339,28 @@ bool gk20a_fifo_handle_mmu_fault_locked(
}
}
+/* Set the unserviceable flag right at the start of recovery to
+* reduce the window of the race between job submit and recovery
+* on the same TSG.
+* The unserviceable flag is checked during job submit and
+* prevents new jobs from being submitted to a TSG which is
+* headed for teardown.
+*/
+if (tsg != NULL) {
+/* Set the error notifier before userspace learns about the
+* faulty channel.
+* The unserviceable flag is set early to disallow submits on
+* the broken channel. If userspace checks the notifier code
+* when a submit fails, the notifier must already be set to
+* convey that the channel is no longer usable.
+*/
+if (!fake_fault) {
+nvgpu_tsg_set_ctx_mmu_error(g, tsg);
+}
+nvgpu_tsg_set_unserviceable(g, tsg);
+}
/* check if engine reset should be deferred */
if (engine_id != NVGPU_INVALID_ENG_ID) {
#ifdef CONFIG_NVGPU_DEBUGGER
@@ -381,10 +402,7 @@ bool gk20a_fifo_handle_mmu_fault_locked(
if (deferred_reset_pending) {
g->ops.tsg.disable(tsg);
} else {
-if (!fake_fault) {
-nvgpu_tsg_set_ctx_mmu_error(g, tsg);
-}
-debug_dump = nvgpu_tsg_mark_error(g, tsg);
+nvgpu_tsg_wakeup_wqs(g, tsg);
nvgpu_tsg_abort(g, tsg, false);
}
@@ -426,17 +444,14 @@ bool gk20a_fifo_handle_mmu_fault_locked(
if (nvgpu_cg_pg_enable(g) != 0) {
nvgpu_warn(g, "fail to enable power mgmt");
}
-return debug_dump;
}
-bool gk20a_fifo_handle_mmu_fault(
+void gk20a_fifo_handle_mmu_fault(
struct gk20a *g,
u32 mmu_fault_engines, /* queried from HW if 0 */
u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/
bool id_is_tsg)
{
-bool debug_dump;
nvgpu_log_fn(g, " ");
nvgpu_log_info(g, "acquire engines_reset_mutex");
@@ -444,13 +459,11 @@ bool gk20a_fifo_handle_mmu_fault(
nvgpu_runlist_lock_active_runlists(g);
-debug_dump = gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
+gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
hw_id, id_is_tsg);
nvgpu_runlist_unlock_active_runlists(g);
nvgpu_log_info(g, "release engines_reset_mutex");
nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
-return debug_dump;
}

View File

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -39,10 +39,10 @@ void gk20a_fifo_mmu_fault_info_dump(struct gk20a *g, u32 engine_id,
void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g);
-bool gk20a_fifo_handle_mmu_fault(struct gk20a *g, u32 mmu_fault_engines,
+void gk20a_fifo_handle_mmu_fault(struct gk20a *g, u32 mmu_fault_engines,
u32 hw_id, bool id_is_tsg);
-bool gk20a_fifo_handle_mmu_fault_locked(struct gk20a *g, u32 mmu_fault_engines,
+void gk20a_fifo_handle_mmu_fault_locked(struct gk20a *g, u32 mmu_fault_engines,
u32 hw_id, bool id_is_tsg);
#endif /* NVGPU_FIFO_MMU_FAULT_GK20A_H */

View File

@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -116,12 +116,8 @@ void gk20a_fifo_recover(struct gk20a *g, u32 eng_bitmask,
g->ops.fifo.intr_set_recover_mask(g);
g->ops.fifo.trigger_mmu_fault(g, engine_ids);
-/*
-* Ignore the "Verbose" flag from
-* gk20a_fifo_handle_mmu_fault_locked since it is not needed
-* here
-*/
-(void) gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
+gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
ref_id, ref_id_is_tsg);
g->ops.fifo.intr_unset_recover_mask(g);

View File

@@ -175,6 +175,28 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
"act_eng_bitmask = 0x%x, mmufault ptr = 0x%p",
id, id_type, rc_type, act_eng_bitmask, mmufault);
+/* Set the unserviceable flag right at the start of recovery to
+* reduce the window of the race between job submit and recovery
+* on the same TSG.
+* The unserviceable flag is checked during job submit and
+* prevents new jobs from being submitted to a TSG which is
+* headed for teardown.
+*/
+if (tsg != NULL) {
+/* Set the error notifier before userspace learns about the
+* faulty channel.
+* The unserviceable flag is set early to disallow submits on
+* the broken channel. If userspace checks the notifier code
+* when a submit fails, the notifier must already be set to
+* convey that the channel is no longer usable.
+*/
+if (rc_type == RC_TYPE_MMU_FAULT) {
+nvgpu_tsg_set_ctx_mmu_error(g, tsg);
+}
+nvgpu_tsg_set_unserviceable(g, tsg);
+}
if (rc_type == RC_TYPE_MMU_FAULT && mmufault != NULL) {
if(mmufault->faulted_pbdma != INVAL_ID) {
pbdma_bitmask = BIT32(mmufault->faulted_pbdma);
@@ -290,10 +312,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
g->ops.tsg.disable(tsg);
} else {
#endif
-if (rc_type == RC_TYPE_MMU_FAULT) {
-nvgpu_tsg_set_ctx_mmu_error(g, tsg);
-}
-(void)nvgpu_tsg_mark_error(g, tsg);
+nvgpu_tsg_wakeup_wqs(g, tsg);
nvgpu_tsg_abort(g, tsg, false);
#ifdef CONFIG_NVGPU_DEBUGGER
}

View File

@@ -1050,6 +1050,18 @@ void nvgpu_channel_set_unserviceable(struct nvgpu_channel *ch);
*/
bool nvgpu_channel_check_unserviceable(struct nvgpu_channel *ch);
+/**
+* @brief Signal the wait queues (notify_wq and semaphore_wq).
+*
+* @param g [in] Pointer to GPU driver struct.
+* @param ch [in] Channel pointer.
+*
+* Unblock pending waits on this channel (semaphore and error
+* notifier wait queues).
+*
+*/
+void nvgpu_channel_wakeup_wqs(struct gk20a *g, struct nvgpu_channel *ch);
#ifdef CONFIG_NVGPU_USERD
/**
* @brief Channel userd physical address.

View File

@@ -541,6 +541,49 @@ void nvgpu_tsg_disable_sched(struct gk20a *g, struct nvgpu_tsg *tsg);
int nvgpu_tsg_alloc_sm_error_states_mem(struct gk20a *g,
struct nvgpu_tsg *tsg,
u32 num_sm);
+/**
+* @brief Mark all the referenceable channels in the tsg's channel
+* list as unserviceable.
+*
+* @param g [in] The GPU driver struct.
+* @param tsg [in] Pointer to TSG struct.
+*
+* The channels are set unserviceable to convey that an uncorrectable
+* error has occurred on the channel. It is not possible to take more
+* references on such a channel, and the only available option for
+* userspace is to close the channel fd.
+* - Acquire #nvgpu_tsg.ch_list_lock of the tsg.
+* - For each channel entry in #nvgpu_tsg.ch_list of the tsg,
+* -- Get a reference to the channel.
+* -- If the channel is referenceable,
+* --- Call #nvgpu_channel_set_unserviceable.
+* --- Put the reference to the channel.
+* - Release #nvgpu_tsg.ch_list_lock of the tsg.
+*
+*/
+void nvgpu_tsg_set_unserviceable(struct gk20a *g, struct nvgpu_tsg *tsg);
+
+/**
+* @brief Wake up waiters for all the referenceable channels in the
+* tsg's channel list.
+*
+* @param g [in] The GPU driver struct.
+* @param tsg [in] Pointer to TSG struct.
+*
+* Unblock pending waits (semaphore and error notifier wait queues)
+* on each channel.
+* - Acquire #nvgpu_tsg.ch_list_lock of the tsg.
+* - For each channel entry in #nvgpu_tsg.ch_list of the tsg,
+* -- Get a reference to the channel.
+* -- If the channel is referenceable,
+* --- Call #nvgpu_channel_wakeup_wqs.
+* --- Put the reference to the channel.
+* - Release #nvgpu_tsg.ch_list_lock of the tsg.
+*
+*/
+void nvgpu_tsg_wakeup_wqs(struct gk20a *g, struct nvgpu_tsg *tsg);
#ifdef CONFIG_NVGPU_DEBUGGER
int nvgpu_tsg_set_sm_exception_type_mask(struct nvgpu_channel *ch,
u32 exception_mask);