gpu: nvgpu: add domain scheduler worker

Move away from the prototype call in channel wdt worker and create a
separate worker thread for the domain scheduler. The details of runlist
domains are still encapsulated in the runlist code; the domain scheduler
controls when to switch domains. Switching happens based on domain
timeslices or when the current domain is deleted.

The worker thread is paused on railgate and spun back up on poweron. The
scheduler data was also left dangling, so fix that by deinitializing all
nvs-related state when gk20a_remove_support() is called. The runlist domains
already get freed as part of fifo removal.

Jira NVGPU-6427

Change-Id: I64f42498f8789448d9becdd209b7878ef0fdb124
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2632579
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Konsta Hölttä
2021-11-25 10:33:57 +02:00
committed by mobile promotions
parent 8ce1f48e2e
commit d086c678fd
6 changed files with 228 additions and 18 deletions

View File

@@ -25,7 +25,6 @@
#include <nvgpu/watchdog.h>
#include <nvgpu/channel.h>
#include <nvgpu/runlist.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/gk20a.h>
@@ -176,7 +175,6 @@ void nvgpu_channel_worker_poll_wakeup_post_process_item(
if (nvgpu_timeout_peek_expired(&ch_worker->timeout)) {
nvgpu_channel_poll_wdt(g);
nvgpu_runlist_tick(g);
nvgpu_timeout_init_cpu_timer(g, &ch_worker->timeout,
ch_worker->watchdog_interval);
}

View File

@@ -339,6 +339,10 @@ int nvgpu_prepare_poweroff(struct gk20a *g)
if (tmp_ret != 0) {
ret = tmp_ret;
}
tmp_ret = nvgpu_nvs_suspend(g);
if (tmp_ret != 0) {
ret = tmp_ret;
}
tmp_ret = g->ops.fifo.fifo_suspend(g);
if (tmp_ret != 0) {
ret = tmp_ret;

View File

@@ -33,6 +33,135 @@ static struct nvs_sched_ops nvgpu_nvs_ops = {
.recover = NULL,
};
/*
* TODO: make use of worker items when
* 1) the active domain gets modified
* - currently updates happen asynchronously elsewhere
* - either resubmit the domain or do the updates later
* 2) recovery gets triggered
* - currently it just locks all affected runlists
* - consider pausing the scheduler logic and signaling users
*/
/*
 * A queued unit of work for the nvs worker thread. Currently just a list
 * anchor; no work item types are submitted yet (see the TODO above).
 */
struct nvgpu_nvs_worker_item {
	struct nvgpu_list_node list;	/* entry in the worker's work queue */
};
/*
 * container_of-style downcast: recover the nvgpu_nvs_worker_item that
 * embeds the given list node.
 *
 * Fix: dropped the stray ';' after the function body — an empty top-level
 * declaration is not valid ISO C (compilers only accept it as an extension).
 */
static inline struct nvgpu_nvs_worker_item *
nvgpu_nvs_worker_item_from_worker_item(struct nvgpu_list_node *node)
{
	return (struct nvgpu_nvs_worker_item *)
		((uintptr_t)node - offsetof(struct nvgpu_nvs_worker_item, list));
}
/*
 * container_of-style downcast: recover the nvgpu_nvs_worker that embeds the
 * given generic worker.
 *
 * Fix: dropped the stray ';' after the function body — an empty top-level
 * declaration is not valid ISO C (compilers only accept it as an extension).
 */
static inline struct nvgpu_nvs_worker *
nvgpu_nvs_worker_from_worker(struct nvgpu_worker *worker)
{
	return (struct nvgpu_nvs_worker *)
		((uintptr_t)worker - offsetof(struct nvgpu_nvs_worker, worker));
}
/*
 * Worker framework pre_process hook: arm the first tick timer with a
 * default period before the poll loop starts running.
 */
static void nvgpu_nvs_worker_poll_init(struct nvgpu_worker *worker)
{
	struct gk20a *g = worker->g;
	struct nvgpu_nvs_worker *nvs_worker =
		nvgpu_nvs_worker_from_worker(worker);

	/* 100 ms is a nice arbitrary timeout for default status */
	nvs_worker->current_timeout = 100U;
	nvgpu_timeout_init_cpu_timer(g, &nvs_worker->timeout,
			nvs_worker->current_timeout);
}
static u32 nvgpu_nvs_worker_wakeup_timeout(struct nvgpu_worker *worker)
{
struct nvgpu_nvs_worker *nvs_worker =
nvgpu_nvs_worker_from_worker(worker);
return nvs_worker->current_timeout;
}
/*
 * Worker framework hook for queued work items. No item types exist yet, so
 * this is a placeholder that is never called (see the TODO above).
 */
static void nvgpu_nvs_worker_wakeup_process_item(
		struct nvgpu_list_node *work_item)
{
	(void)nvgpu_nvs_worker_item_from_worker_item(work_item);
}
/*
 * Advance the domain schedule by one step under the scheduler lock.
 *
 * Picks the successor of the active domain (wrapping to the head of the
 * domain list), kicks the runlist code via nvgpu_runlist_tick() and marks
 * the new domain active.
 *
 * Returns the new domain's timeslice in microseconds, i.e. how long to wait
 * before the next tick; returns a 100 ms poll interval when no domain
 * exists yet.
 *
 * Fix: use the local 'sched' consistently instead of re-reading
 * g->scheduler for the wraparound lookup.
 */
static u32 nvgpu_nvs_tick(struct gk20a *g)
{
	struct nvgpu_nvs_scheduler *sched = g->scheduler;
	struct nvgpu_nvs_domain *domain;
	struct nvs_domain *nvs_domain;
	u32 timeslice;

	nvs_dbg(g, "nvs tick");

	nvgpu_mutex_acquire(&g->sched_mutex);

	domain = sched->active_domain;
	if (domain == NULL) {
		nvgpu_mutex_release(&g->sched_mutex);
		/* nothing to schedule, TODO wait for an event instead */
		return 100000U; /* 100 ms in microseconds */
	}

	/* round-robin: next domain in the list, wrapping at the end */
	nvs_domain = domain->parent->next;
	if (nvs_domain == NULL) {
		nvs_domain = sched->sched->domain_list->domains;
	}
	timeslice = nvs_domain->timeslice_us;

	nvgpu_runlist_tick(g);
	sched->active_domain = nvs_domain->priv;

	nvgpu_mutex_release(&g->sched_mutex);

	return timeslice;
}
/*
 * Worker framework post-process hook: when the tick timer has expired, run
 * one scheduler tick and re-arm the timer with the new domain's timeslice.
 */
static void nvgpu_nvs_worker_wakeup_post_process(struct nvgpu_worker *worker)
{
	struct nvgpu_nvs_worker *nvs_worker =
		nvgpu_nvs_worker_from_worker(worker);
	struct gk20a *g = worker->g;
	u32 tick_us;

	if (!nvgpu_timeout_peek_expired(&nvs_worker->timeout)) {
		return;
	}

	tick_us = nvgpu_nvs_tick(g);
	if (tick_us != 0U) {
		/* convert to ms, rounding up; keep old period on zero */
		nvs_worker->current_timeout = (tick_us + 999U) / 1000U;
	}
	nvgpu_timeout_init_cpu_timer(g, &nvs_worker->timeout,
			nvs_worker->current_timeout);
}
/*
 * Hooks wiring the nvs scheduler into the generic nvgpu worker framework.
 * Only periodic timeout-driven wakeups do real work for now; the queued
 * work item path is a placeholder (see the TODO above).
 */
static const struct nvgpu_worker_ops nvs_worker_ops = {
	.pre_process = nvgpu_nvs_worker_poll_init,
	.wakeup_timeout = nvgpu_nvs_worker_wakeup_timeout,
	.wakeup_process_item = nvgpu_nvs_worker_wakeup_process_item,
	.wakeup_post_process = nvgpu_nvs_worker_wakeup_post_process,
};
static int nvgpu_nvs_worker_init(struct gk20a *g)
{
struct nvgpu_worker *worker = &g->scheduler->worker.worker;
nvgpu_worker_init_name(worker, "nvgpu_nvs", g->name);
return nvgpu_worker_init(g, worker, &nvs_worker_ops);
}
/* Stop the scheduler worker thread. */
static void nvgpu_nvs_worker_deinit(struct gk20a *g)
{
	nvgpu_worker_deinit(&g->scheduler->worker.worker);
}
int nvgpu_nvs_init(struct gk20a *g)
{
struct nvgpu_nvs_domain *domain;
@@ -55,6 +184,43 @@ int nvgpu_nvs_init(struct gk20a *g)
return 0;
}
/*
 * Tear down all nvs scheduler state when driver support is removed.
 *
 * NOTE(review): assumes the worker thread has already been stopped (via
 * nvgpu_nvs_suspend() on the poweroff path) before this runs — confirm.
 */
void nvgpu_nvs_remove_support(struct gk20a *g)
{
	struct nvgpu_nvs_scheduler *sched = g->scheduler;
	struct nvs_domain *nvs_dom;

	if (sched == NULL) {
		/* never powered on to init anything */
		return;
	}

	/*
	 * Free the nvgpu-side wrapper of every domain. All references should
	 * have been dropped by now; warn (but continue) if one is still held.
	 */
	nvs_domain_for_each(sched->sched, nvs_dom) {
		struct nvgpu_nvs_domain *nvgpu_dom = nvs_dom->priv;
		if (nvgpu_dom->ref != 1U) {
			nvgpu_warn(g,
				"domain %llu is still in use during shutdown! refs: %u",
				nvgpu_dom->id, nvgpu_dom->ref);
		}
		/* runlist removal will clear the rl domains */
		nvgpu_kfree(g, nvgpu_dom);
	}

	/* close the core nvs scheduler, then drop the per-GPU bookkeeping */
	nvs_sched_close(sched->sched);
	nvgpu_kfree(g, sched->sched);
	nvgpu_kfree(g, sched);
	g->scheduler = NULL;
	nvgpu_mutex_destroy(&g->sched_mutex);
}
/*
 * Pause the domain scheduler across railgate by stopping its worker thread;
 * nvgpu_nvs_open() restarts it on resume. Always succeeds.
 */
int nvgpu_nvs_suspend(struct gk20a *g)
{
	nvgpu_nvs_worker_deinit(g);

	nvs_dbg(g, "NVS worker suspended");
	return 0;
}
int nvgpu_nvs_open(struct gk20a *g)
{
int err = 0;
@@ -63,11 +229,10 @@ int nvgpu_nvs_open(struct gk20a *g)
nvgpu_mutex_acquire(&g->sched_mutex);
/*
* If there's already a scheduler present, we are done; no need for
* further action.
*/
if (g->scheduler != NULL) {
/* resuming from railgate */
err = nvgpu_nvs_worker_init(g);
nvs_dbg(g, "NVS worker resume, err=%d", err);
goto unlock;
}
@@ -84,12 +249,19 @@ int nvgpu_nvs_open(struct gk20a *g)
goto unlock;
}
nvs_dbg(g, " Creating scheduler.");
err = nvgpu_nvs_worker_init(g);
if (err != 0) {
goto unlock;
}
nvs_dbg(g, " Creating NVS scheduler.");
err = nvs_sched_create(g->scheduler->sched, &nvgpu_nvs_ops, g);
if (err != 0) {
nvgpu_nvs_worker_deinit(g);
goto unlock;
}
unlock:
nvgpu_mutex_release(&g->sched_mutex);
if (err) {
nvs_dbg(g, " Failed! Error code: %d", err);
if (g->scheduler) {
@@ -99,6 +271,8 @@ unlock:
}
}
nvgpu_mutex_release(&g->sched_mutex);
return err;
}
@@ -154,6 +328,10 @@ int nvgpu_nvs_add_domain(struct gk20a *g, const char *name, u32 timeslice,
nvgpu_dom->parent = nvs_dom;
if (g->scheduler->active_domain == NULL) {
g->scheduler->active_domain = nvgpu_dom;
}
*pdomain = nvgpu_dom;
unlock:
nvgpu_mutex_release(&g->sched_mutex);
@@ -208,15 +386,16 @@ void nvgpu_nvs_domain_put(struct gk20a *g, struct nvgpu_nvs_domain *dom)
int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id)
{
struct nvgpu_nvs_scheduler *s = g->scheduler;
struct nvgpu_nvs_domain *nvgpu_dom;
struct nvs_domain *nvs_dom;
struct nvs_domain *nvs_dom, *nvs_next;
int err = 0;
nvgpu_mutex_acquire(&g->sched_mutex);
nvs_dbg(g, "Attempting to remove domain: %llu", dom_id);
nvgpu_dom = nvgpu_nvs_get_dom_by_id(g, g->scheduler->sched, dom_id);
nvgpu_dom = nvgpu_nvs_get_dom_by_id(g, s->sched, dom_id);
if (nvgpu_dom == NULL) {
nvs_dbg(g, "domain %llu does not exist!", dom_id);
err = -ENOENT;
@@ -245,7 +424,16 @@ int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id)
nvgpu_dom->ref = 0U;
nvs_domain_destroy(g->scheduler->sched, nvs_dom);
/* note: same wraparound logic as in RL domains to keep in sync */
if (s->active_domain == nvgpu_dom) {
nvs_next = nvs_dom->next;
if (nvs_next == NULL) {
nvs_next = s->sched->domain_list->domains;
}
s->active_domain = nvs_next->priv;
}
nvs_domain_destroy(s->sched, nvs_dom);
nvgpu_kfree(g, nvgpu_dom);
unlock:

View File

@@ -29,6 +29,8 @@
#include <nvgpu/atomic.h>
#include <nvgpu/lock.h>
#include <nvgpu/worker.h>
#include <nvgpu/timers.h>
/*
* Max size we'll parse from an NVS log entry.
@@ -65,14 +67,24 @@ struct nvgpu_nvs_domain {
u32 ref;
};
/* State of the nvs scheduler's worker thread. */
struct nvgpu_nvs_worker {
	struct nvgpu_worker worker;	/* embedded generic worker */
	struct nvgpu_timeout timeout;	/* expires when the next tick is due */
	u32 current_timeout;		/* tick period in milliseconds */
};

/* Per-GPU nvs scheduler state. */
struct nvgpu_nvs_scheduler {
	struct nvs_sched *sched;	/* core nvs scheduler instance */
	nvgpu_atomic64_t id_counter;	/* source of unique domain ids */
	struct nvgpu_nvs_worker worker;	/* scheduler tick thread */
	struct nvgpu_nvs_domain *active_domain;	/* currently scheduled domain */
};
#ifdef CONFIG_NVS_PRESENT
int nvgpu_nvs_init(struct gk20a *g);
int nvgpu_nvs_open(struct gk20a *g);
void nvgpu_nvs_remove_support(struct gk20a *g);
int nvgpu_nvs_suspend(struct gk20a *g);
void nvgpu_nvs_get_log(struct gk20a *g, s64 *timestamp, const char **msg);
u32 nvgpu_nvs_domain_count(struct gk20a *g);
int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id);
@@ -96,11 +108,22 @@ static inline int nvgpu_nvs_init(struct gk20a *g)
{
return 0;
}
/* No-op stubs for builds without nvs support (CONFIG_NVS_PRESENT unset). */
static inline void nvgpu_nvs_remove_support(struct gk20a *g)
{
}

static inline int nvgpu_nvs_suspend(struct gk20a *g)
{
	return 0;
}

static inline struct nvgpu_nvs_domain *
nvgpu_nvs_domain_get(struct gk20a *g, const char *name)
{
	return NULL;
}

static inline void nvgpu_nvs_domain_put(struct gk20a *g, struct nvgpu_nvs_domain *dom)
{
}

View File

@@ -30,16 +30,10 @@ int nvgpu_nvs_dev_open(struct inode *inode, struct file *filp)
{
struct nvgpu_cdev *cdev;
struct gk20a *g;
int err;
cdev = container_of(inode->i_cdev, struct nvgpu_cdev, cdev);
g = nvgpu_get_gk20a_from_cdev(cdev);
err = nvgpu_nvs_open(g);
if (err != 0) {
return err;
}
filp->private_data = g;
return 0;

View File

@@ -66,6 +66,7 @@
#include <nvgpu/cic_mon.h>
#include <nvgpu/cic_rm.h>
#include <nvgpu/fb.h>
#include <nvgpu/nvs.h>
#include "platform_gk20a.h"
#include "sysfs.h"
@@ -1027,6 +1028,8 @@ void gk20a_remove_support(struct gk20a *g)
g->mm.remove_ce_support(&g->mm);
#endif
nvgpu_nvs_remove_support(g);
if (g->fifo.remove_support)
g->fifo.remove_support(&g->fifo);