gpu: nvgpu: decouple channel watchdog dependencies

The channel code needs the watchdog code and vice versa. Cut this circular dependency with a few simplifications so that the watchdog wouldn't depend on so much.

When calling watchdog APIs that cause stores or comparisons of channel progress, provide a snapshot of the current progress instead of a whole channel pointer. struct nvgpu_channel_wdt_state is added as an interface for this to track gp_get and pb_get.

When periodically checking the watchdog state, make the channel code ask whether a hang has been detected and abort the channel from within channel code instead of asking the watchdog to abort the channel. The debug dump verbosity flag is also moved back to the channel data.

Move the functionality to restart all channels' watchdogs to channel code from watchdog code. Looping over active channels is not a good feature for the watchdog; it's better for the channel handling to just use the watchdog as a tracking tool.

Move a few unserviceable checks up in the stack to the callers of the wdt code. They're a kludge but this will do for now and demonstrates what needs to be eventually fixed.

This does not leave much code in the watchdog unit. Now the purpose of the watchdog is to only isolate the logic to couple a timer and progress snapshots with careful locking to start and stop the tracking.

Jira NVGPU-5582

Change-Id: I7c728542ff30d88b1414500210be3fbaf61e6e8a
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2369820
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 02:22:34 +03:00 · 2020-08-12 18:10:40 +03:00
parent 281006ae7d
commit e8201d6ce3
6 changed files with 214 additions and 125 deletions
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -377,6 +377,7 @@ struct nvgpu_channel {

 	/* kernel watchdog to kill stuck jobs */
 	struct nvgpu_channel_wdt *wdt;
+	bool wdt_debug_dump;

 	/** Fence allocator in case of deterministic submit. */
 	struct nvgpu_allocator fence_allocator;
@@ -1161,4 +1162,27 @@ int nvgpu_channel_deferred_reset_engines(struct gk20a *g,
 		struct nvgpu_channel *ch);
 #endif

+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+/**
+ * @brief Rewind the timeout on each non-dormant channel.
+ *
+ * Reschedule the timeout of each active channel for which timeouts are running
+ * as if something had happened on each channel right now. This should be
+ * called when a global hang is detected that could cause a false positive on
+ * other innocent channels.
+ */
+void nvgpu_channel_restart_all_wdts(struct gk20a *g);
+/**
+ * @brief Enable or disable full debug dump on wdt error.
+ *
+ * Set the policy on whether or not to do the verbose channel and gr debug dump
+ * when the channel gets recovered as a result of a watchdog timeout.
+ */
+void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump);
+#else
+static inline void nvgpu_channel_restart_all_wdts(struct gk20a *g) {}
+static inline void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch,
+		bool dump) {}
+#endif
+
 #endif
--- a/drivers/gpu/nvgpu/include/nvgpu/watchdog.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/watchdog.h
@@ -23,14 +23,19 @@
 #ifndef NVGPU_WATCHDOG_H
 #define NVGPU_WATCHDOG_H

-#ifdef CONFIG_NVGPU_CHANNEL_WDT
+#include <nvgpu/types.h>

 struct gk20a;
-struct nvgpu_channel;
-struct nvgpu_worker;
 struct nvgpu_channel_wdt;

-struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct nvgpu_channel *ch);
+struct nvgpu_channel_wdt_state {
+	u64 gp_get;
+	u64 pb_get;
+};
+
+#ifdef CONFIG_NVGPU_CHANNEL_WDT
+
+struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(struct gk20a *g);
 void nvgpu_channel_wdt_destroy(struct nvgpu_channel_wdt *wdt);

 void nvgpu_channel_wdt_enable(struct nvgpu_channel_wdt *wdt);
@@ -39,21 +44,21 @@ bool nvgpu_channel_wdt_enabled(struct nvgpu_channel_wdt *wdt);

 void nvgpu_channel_wdt_set_limit(struct nvgpu_channel_wdt *wdt, u32 limit_ms);
 u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt);
-void nvgpu_channel_wdt_set_debug_dump(struct nvgpu_channel_wdt *wdt, bool dump);

 void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch);
-void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt);
+		struct nvgpu_channel_wdt_state *state);
 bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt);
-void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch);
-
-void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g);
+void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt);
+void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state);
+bool nvgpu_channel_wdt_running(struct nvgpu_channel_wdt *wdt);
+bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state);

 #else /* CONFIG_NVGPU_CHANNEL_WDT */

 static inline struct nvgpu_channel_wdt *nvgpu_channel_wdt_alloc(
-		struct nvgpu_channel *ch)
+		struct gk20a *g)
 {
 	return NULL;
 }
@@ -71,21 +76,19 @@ static inline u32 nvgpu_channel_wdt_limit(struct nvgpu_channel_wdt *wdt)
 {
 	return 0U;
 }
-static inline void nvgpu_channel_wdt_set_debug_dump(
-		struct nvgpu_channel_wdt *wdt,
-		bool dump) {}
-
 static inline void nvgpu_channel_wdt_start(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch) {}
-static inline void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) {}
+		struct nvgpu_channel_wdt_state *state) {}
 static inline bool nvgpu_channel_wdt_stop(struct nvgpu_channel_wdt *wdt)
 {
 	return false;
 }
-static inline void nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
-		struct nvgpu_channel *ch) {}
-
-static inline void nvgpu_channel_wdt_restart_all_channels(struct gk20a *g) {}
+static inline void nvgpu_channel_wdt_continue(struct nvgpu_channel_wdt *wdt) {}
+static inline void nvgpu_channel_wdt_rewind(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state) {}
+static inline bool nvgpu_channel_wdt_check(struct nvgpu_channel_wdt *wdt,
+		struct nvgpu_channel_wdt_state *state) {
+	return false;
+}

 #endif /* CONFIG_NVGPU_CHANNEL_WDT */