gpu: nvgpu: decouple channel watchdog dependencies

The channel code needs the watchdog code and vice versa. Cut this
circular dependency with a few simplifications so that the watchdog
wouldn't depend on so much.

When calling watchdog APIs that cause stores or comparisons of channel
progress, provide a snapshot of the current progress instead of a whole
channel pointer. struct nvgpu_channel_wdt_state is added as an interface
for this to track gp_get and pb_get.

When periodically checking the watchdog state, make the channel code ask
whether a hang has been detected and abort the channel from within
channel code instead of asking the watchdog to abort the channel. The
debug dump verbosity flag is also moved back to the channel data.

Move the functionality to restart all channels' watchdogs to channel
code from watchdog code. Looping over active channels is not a good
feature for the watchdog; it's better for the channel handling to just
use the watchdog as a tracking tool.

Move a few unserviceable checks up in the stack to the callers of the
wdt code. They're a kludge but this will do for now and demonstrates
what needs to be eventually fixed.

This does not leave much code in the watchdog unit. Now the purpose of
the watchdog is to only isolate the logic to couple a timer and progress
snapshots with careful locking to start and stop the tracking.

Jira NVGPU-5582

Change-Id: I7c728542ff30d88b1414500210be3fbaf61e6e8a
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2369820
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 02:22:34 +03:00 · 2020-08-12 18:10:40 +03:00
parent 281006ae7d
commit e8201d6ce3
6 changed files with 214 additions and 125 deletions
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -459,6 +459,114 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 };

 #ifdef CONFIG_NVGPU_CHANNEL_WDT
+void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump)
+{
+	ch->wdt_debug_dump = dump; /* consulted when the wdt recovers ch */
+}
+
+static struct nvgpu_channel_wdt_state nvgpu_channel_collect_wdt_state(
+		struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+	struct nvgpu_channel_wdt_state state = { 0, 0 };
+
+	/*
+	 * Note: just checking for nvgpu_channel_wdt_enabled() is not enough at
+	 * the moment because system suspend puts g->regs away but doesn't stop
+	 * the worker thread that runs the watchdog. This might need to be
+	 * cleared up in the future. The snapshot stays zeroed if not running.
+	 */
+	if (nvgpu_channel_wdt_running(ch->wdt)) {
+		/*
+		 * Read the state only if the wdt is on to avoid unnecessary
+		 * accesses. The kernel mem for userd may not even exist; this
+		 * channel could be in usermode submit mode.
+		 */
+		state.gp_get = g->ops.userd.gp_get(g, ch);
+		state.pb_get = g->ops.userd.pb_get(g, ch);
+	}
+
+	return state;
+}
+
+static void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch)
+{
+	struct nvgpu_channel_wdt_state state = nvgpu_channel_collect_wdt_state(ch);
+
+	/*
+	 * FIXME: channel recovery can race the submit path and can start even
+	 * after this check passes, but it is the best we can do for now.
+	 */
+	if (!nvgpu_channel_check_unserviceable(ch)) {
+		nvgpu_channel_wdt_start(ch->wdt, &state);
+	}
+}
+
+
+void nvgpu_channel_restart_all_wdts(struct gk20a *g)
+{
+	struct nvgpu_fifo *f = &g->fifo;
+	u32 chid;
+
+	for (chid = 0; chid < f->num_channels; chid++) {
+		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);
+
+		if (ch != NULL) { /* ids that yield no channel are skipped */
+			if ((ch->wdt != NULL) &&
+			    !nvgpu_channel_check_unserviceable(ch)) {
+				struct nvgpu_channel_wdt_state state =
+					nvgpu_channel_collect_wdt_state(ch);
+
+				nvgpu_channel_wdt_rewind(ch->wdt, &state);
+			}
+			nvgpu_channel_put(ch);
+		}
+	}
+}
+
+static void nvgpu_channel_recover_from_wdt(struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+
+	nvgpu_log_fn(g, " ");
+
+	if (nvgpu_channel_check_unserviceable(ch)) {
+		/* channel is already recovered */
+		nvgpu_info(g, "chid: %d unserviceable but wdt was ON", ch->chid);
+		return;
+	}
+
+	nvgpu_err(g, "Job on channel %d timed out", ch->chid);
+
+	/* force reset calls gk20a_debug_dump but not gk20a_gr_debug_dump */
+	if (ch->wdt_debug_dump) {
+		gk20a_gr_debug_dump(g);
+	}
+
+#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
+	if (g->ops.tsg.force_reset(ch,
+	    NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
+	    ch->wdt_debug_dump) != 0) {
+		nvgpu_err(g, "failed tsg force reset for chid: %d", ch->chid);
+	}
+#endif
+}
+
+/*
+ * Feed a fresh gp_get/pb_get snapshot to the watchdog; recover if stuck.
+ *
+ * The gpu is implicitly on at this point because the watchdog can only run on
+ * channels that have submitted jobs pending for cleanup.
+ */
+static void nvgpu_channel_check_wdt(struct nvgpu_channel *ch)
+{
+	struct nvgpu_channel_wdt_state state = nvgpu_channel_collect_wdt_state(ch);
+
+	if (nvgpu_channel_wdt_check(ch->wdt, &state)) {
+		nvgpu_channel_recover_from_wdt(ch);
+	}
+}
+
 static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
 {
 	struct nvgpu_channel_worker *ch_worker =
@@ -486,7 +594,7 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)

 		if (ch != NULL) {
 			if (!nvgpu_channel_check_unserviceable(ch)) {
-				nvgpu_channel_wdt_check(ch->wdt, ch);
+				nvgpu_channel_check_wdt(ch);
 			}
 			nvgpu_channel_put(ch);
 		}
@@ -521,6 +629,8 @@ static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(

 	return ch_worker->watchdog_interval;
 }
+#else
+static void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch) {} /* wdt off: no-op */
 #endif /* CONFIG_NVGPU_CHANNEL_WDT */

 static inline struct nvgpu_channel *
@@ -635,7 +745,7 @@ int nvgpu_channel_add_job(struct nvgpu_channel *c,
 		job->num_mapped_buffers = num_mapped_buffers;
 		job->mapped_buffers = mapped_buffers;

-		nvgpu_channel_wdt_start(c->wdt, c);
+		nvgpu_channel_launch_wdt(c);

 		nvgpu_channel_joblist_lock(c);
 		nvgpu_channel_joblist_add(c, job);
@@ -1456,11 +1566,12 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
 	ch->unserviceable = true;

 #ifdef CONFIG_NVGPU_CHANNEL_WDT
-	ch->wdt = nvgpu_channel_wdt_alloc(ch);
+	ch->wdt = nvgpu_channel_wdt_alloc(g);
 	if (ch->wdt == NULL) {
 		nvgpu_err(g, "wdt alloc failed");
 		goto clean_up;
 	}
+	ch->wdt_debug_dump = true;
 #endif

 	ch->obj_class = 0;