diff --git a/drivers/gpu/nvgpu/common/fifo/channel_wdt.c b/drivers/gpu/nvgpu/common/fifo/channel_wdt.c
index 645cc8365..4cb06ec04 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel_wdt.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel_wdt.c
@@ -25,7 +25,6 @@
 
 #include <nvgpu/watchdog.h>
 #include <nvgpu/channel.h>
-#include <nvgpu/runlist.h>
 #include <nvgpu/error_notifier.h>
 #include <nvgpu/gk20a.h>
 
@@ -176,7 +175,6 @@ void nvgpu_channel_worker_poll_wakeup_post_process_item(
 
 	if (nvgpu_timeout_peek_expired(&ch_worker->timeout)) {
 		nvgpu_channel_poll_wdt(g);
-		nvgpu_runlist_tick(g);
 		nvgpu_timeout_init_cpu_timer(g, &ch_worker->timeout,
 				ch_worker->watchdog_interval);
 	}
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index f254a535f..f7d698abd 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -339,6 +339,10 @@ int nvgpu_prepare_poweroff(struct gk20a *g)
 	if (tmp_ret != 0) {
 		ret = tmp_ret;
 	}
+	tmp_ret = nvgpu_nvs_suspend(g);
+	if (tmp_ret != 0) {
+		ret = tmp_ret;
+	}
 	tmp_ret = g->ops.fifo.fifo_suspend(g);
 	if (tmp_ret != 0) {
 		ret = tmp_ret;
diff --git a/drivers/gpu/nvgpu/common/nvs/nvs_sched.c b/drivers/gpu/nvgpu/common/nvs/nvs_sched.c
index 333ca665c..19c92bdcc 100644
--- a/drivers/gpu/nvgpu/common/nvs/nvs_sched.c
+++ b/drivers/gpu/nvgpu/common/nvs/nvs_sched.c
@@ -33,6 +33,135 @@ static struct nvs_sched_ops nvgpu_nvs_ops = {
 	.recover = NULL,
 };
 
+/*
+ * TODO: make use of worker items when
+ * 1) the active domain gets modified
+ *    - currently updates happen asynchronously elsewhere
+ *    - either resubmit the domain or do the updates later
+ * 2) recovery gets triggered
+ *    - currently it just locks all affected runlists
+ *    - consider pausing the scheduler logic and signaling users
+ */
+struct nvgpu_nvs_worker_item {
+	struct nvgpu_list_node list;
+};
+
+static inline struct nvgpu_nvs_worker_item *
+nvgpu_nvs_worker_item_from_worker_item(struct nvgpu_list_node *node)
+{
+	return (struct nvgpu_nvs_worker_item *)
+	   ((uintptr_t)node - offsetof(struct nvgpu_nvs_worker_item, list));
+};
+
+static inline struct nvgpu_nvs_worker *
+nvgpu_nvs_worker_from_worker(struct nvgpu_worker *worker)
+{
+	return (struct nvgpu_nvs_worker *)
+	   ((uintptr_t)worker - offsetof(struct nvgpu_nvs_worker, worker));
+};
+
+static void nvgpu_nvs_worker_poll_init(struct nvgpu_worker *worker)
+{
+	struct nvgpu_nvs_worker *nvs_worker =
+		nvgpu_nvs_worker_from_worker(worker);
+
+	/* 100 ms is a nice arbitrary timeout for default status */
+	nvs_worker->current_timeout = 100;
+	nvgpu_timeout_init_cpu_timer(worker->g, &nvs_worker->timeout,
+			nvs_worker->current_timeout);
+}
+
+static u32 nvgpu_nvs_worker_wakeup_timeout(struct nvgpu_worker *worker)
+{
+	struct nvgpu_nvs_worker *nvs_worker =
+		nvgpu_nvs_worker_from_worker(worker);
+
+	return nvs_worker->current_timeout;
+}
+
+static void nvgpu_nvs_worker_wakeup_process_item(
+		struct nvgpu_list_node *work_item)
+{
+	struct nvgpu_nvs_worker_item *item =
+		nvgpu_nvs_worker_item_from_worker_item(work_item);
+	(void)item;
+	/* placeholder; never called yet */
+}
+
+static u32 nvgpu_nvs_tick(struct gk20a *g)
+{
+	struct nvgpu_nvs_scheduler *sched = g->scheduler;
+	struct nvgpu_nvs_domain *domain;
+	struct nvs_domain *nvs_domain;
+	u32 timeslice;
+
+	nvs_dbg(g, "nvs tick");
+
+	nvgpu_mutex_acquire(&g->sched_mutex);
+
+	domain = sched->active_domain;
+
+	if (domain == NULL) {
+		/* nothing to schedule, TODO wait for an event instead */
+		nvgpu_mutex_release(&g->sched_mutex);
+		return 100000;
+	}
+
+	nvs_domain = domain->parent->next;
+	if (nvs_domain == NULL) {
+		nvs_domain = g->scheduler->sched->domain_list->domains;
+	}
+	timeslice = nvs_domain->timeslice_us;
+
+	nvgpu_runlist_tick(g);
+	sched->active_domain = nvs_domain->priv;
+
+	nvgpu_mutex_release(&g->sched_mutex);
+
+	return timeslice;
+}
+
+static void nvgpu_nvs_worker_wakeup_post_process(struct nvgpu_worker *worker)
+{
+	struct gk20a *g = worker->g;
+	struct nvgpu_nvs_worker *nvs_worker =
+		nvgpu_nvs_worker_from_worker(worker);
+
+	if (nvgpu_timeout_peek_expired(&nvs_worker->timeout)) {
+		u32 next_timeout_us = nvgpu_nvs_tick(g);
+
+		if (next_timeout_us != 0U) {
+			nvs_worker->current_timeout = (next_timeout_us + 999U) / 1000U;
+		}
+
+		nvgpu_timeout_init_cpu_timer(g, &nvs_worker->timeout,
+				nvs_worker->current_timeout);
+	}
+}
+
+static const struct nvgpu_worker_ops nvs_worker_ops = {
+	.pre_process = nvgpu_nvs_worker_poll_init,
+	.wakeup_timeout = nvgpu_nvs_worker_wakeup_timeout,
+	.wakeup_process_item = nvgpu_nvs_worker_wakeup_process_item,
+	.wakeup_post_process = nvgpu_nvs_worker_wakeup_post_process,
+};
+
+static int nvgpu_nvs_worker_init(struct gk20a *g)
+{
+	struct nvgpu_worker *worker = &g->scheduler->worker.worker;
+
+	nvgpu_worker_init_name(worker, "nvgpu_nvs", g->name);
+
+	return nvgpu_worker_init(g, worker, &nvs_worker_ops);
+}
+
+static void nvgpu_nvs_worker_deinit(struct gk20a *g)
+{
+	struct nvgpu_worker *worker = &g->scheduler->worker.worker;
+
+	nvgpu_worker_deinit(worker);
+}
+
 int nvgpu_nvs_init(struct gk20a *g)
 {
 	struct nvgpu_nvs_domain *domain;
@@ -55,6 +184,43 @@ int nvgpu_nvs_init(struct gk20a *g)
 	return 0;
 }
 
+void nvgpu_nvs_remove_support(struct gk20a *g)
+{
+	struct nvgpu_nvs_scheduler *sched = g->scheduler;
+	struct nvs_domain *nvs_dom;
+
+	if (sched == NULL) {
+		/* never powered on to init anything */
+		return;
+	}
+
+	nvs_domain_for_each(sched->sched, nvs_dom) {
+		struct nvgpu_nvs_domain *nvgpu_dom = nvs_dom->priv;
+		if (nvgpu_dom->ref != 1U) {
+			nvgpu_warn(g,
+				   "domain %llu is still in use during shutdown! refs: %u",
+				   nvgpu_dom->id, nvgpu_dom->ref);
+		}
+
+		/* runlist removal will clear the rl domains */
+		nvgpu_kfree(g, nvgpu_dom);
+	}
+
+	nvs_sched_close(sched->sched);
+	nvgpu_kfree(g, sched->sched);
+	nvgpu_kfree(g, sched);
+	g->scheduler = NULL;
+	nvgpu_mutex_destroy(&g->sched_mutex);
+}
+
+int nvgpu_nvs_suspend(struct gk20a *g)
+{
+	nvgpu_nvs_worker_deinit(g);
+	nvs_dbg(g, "NVS worker suspended");
+
+	return 0;
+}
+
 int nvgpu_nvs_open(struct gk20a *g)
 {
 	int err = 0;
@@ -63,11 +229,10 @@ int nvgpu_nvs_open(struct gk20a *g)
 
 	nvgpu_mutex_acquire(&g->sched_mutex);
 
-	/*
-	 * If there's already a scheduler present, we are done; no need for
-	 * further action.
-	 */
 	if (g->scheduler != NULL) {
+		/* resuming from railgate */
+		err = nvgpu_nvs_worker_init(g);
+		nvs_dbg(g, "NVS worker resume, err=%d", err);
 		goto unlock;
 	}
 
@@ -84,12 +249,19 @@ int nvgpu_nvs_open(struct gk20a *g)
 		goto unlock;
 	}
 
-	nvs_dbg(g, "  Creating scheduler.");
+	err = nvgpu_nvs_worker_init(g);
+	if (err != 0) {
+		goto unlock;
+	}
+
+	nvs_dbg(g, "  Creating NVS scheduler.");
 	err = nvs_sched_create(g->scheduler->sched, &nvgpu_nvs_ops, g);
+	if (err != 0) {
+		nvgpu_nvs_worker_deinit(g);
+		goto unlock;
+	}
 
 unlock:
-	nvgpu_mutex_release(&g->sched_mutex);
-
 	if (err) {
 		nvs_dbg(g, "  Failed! Error code: %d", err);
 		if (g->scheduler) {
@@ -99,6 +271,8 @@ unlock:
 		}
 	}
 
+	nvgpu_mutex_release(&g->sched_mutex);
+
 	return err;
 }
 
@@ -154,6 +328,10 @@ int nvgpu_nvs_add_domain(struct gk20a *g, const char *name, u32 timeslice,
 
 	nvgpu_dom->parent = nvs_dom;
 
+	if (g->scheduler->active_domain == NULL) {
+		g->scheduler->active_domain = nvgpu_dom;
+	}
+
 	*pdomain = nvgpu_dom;
 unlock:
 	nvgpu_mutex_release(&g->sched_mutex);
@@ -208,15 +386,16 @@ void nvgpu_nvs_domain_put(struct gk20a *g, struct nvgpu_nvs_domain *dom)
 
 int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id)
 {
+	struct nvgpu_nvs_scheduler *s = g->scheduler;
 	struct nvgpu_nvs_domain *nvgpu_dom;
-	struct nvs_domain *nvs_dom;
+	struct nvs_domain *nvs_dom, *nvs_next;
 	int err = 0;
 
 	nvgpu_mutex_acquire(&g->sched_mutex);
 
 	nvs_dbg(g, "Attempting to remove domain: %llu", dom_id);
 
-	nvgpu_dom = nvgpu_nvs_get_dom_by_id(g, g->scheduler->sched, dom_id);
+	nvgpu_dom = nvgpu_nvs_get_dom_by_id(g, s->sched, dom_id);
 	if (nvgpu_dom == NULL) {
 		nvs_dbg(g, "domain %llu does not exist!", dom_id);
 		err = -ENOENT;
@@ -245,7 +424,16 @@ int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id)
 
 	nvgpu_dom->ref = 0U;
 
-	nvs_domain_destroy(g->scheduler->sched, nvs_dom);
+	/* note: same wraparound logic as in RL domains to keep in sync */
+	if (s->active_domain == nvgpu_dom) {
+		nvs_next = nvs_dom->next;
+		if (nvs_next == NULL) {
+			nvs_next = s->sched->domain_list->domains;
+		}
+		s->active_domain = nvs_next->priv;
+	}
+
+	nvs_domain_destroy(s->sched, nvs_dom);
 	nvgpu_kfree(g, nvgpu_dom);
 
 unlock:
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvs.h b/drivers/gpu/nvgpu/include/nvgpu/nvs.h
index 3debccafc..ddc147052 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvs.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvs.h
@@ -29,6 +29,8 @@
 
 #include <nvgpu/atomic.h>
 #include <nvgpu/lock.h>
+#include <nvgpu/worker.h>
+#include <nvgpu/timers.h>
 
 /*
  * Max size we'll parse from an NVS log entry.
@@ -65,14 +67,24 @@ struct nvgpu_nvs_domain {
 	u32 ref;
 };
 
+struct nvgpu_nvs_worker {
+	struct nvgpu_worker worker;
+	struct nvgpu_timeout timeout;
+	u32 current_timeout;
+};
+
 struct nvgpu_nvs_scheduler {
 	struct nvs_sched *sched;
 	nvgpu_atomic64_t id_counter;
+	struct nvgpu_nvs_worker worker;
+	struct nvgpu_nvs_domain *active_domain;
 };
 
 #ifdef CONFIG_NVS_PRESENT
 int nvgpu_nvs_init(struct gk20a *g);
 int nvgpu_nvs_open(struct gk20a *g);
+void nvgpu_nvs_remove_support(struct gk20a *g);
+int nvgpu_nvs_suspend(struct gk20a *g);
 void nvgpu_nvs_get_log(struct gk20a *g, s64 *timestamp, const char **msg);
 u32 nvgpu_nvs_domain_count(struct gk20a *g);
 int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id);
@@ -96,11 +108,22 @@ static inline int nvgpu_nvs_init(struct gk20a *g)
 {
 	return 0;
 }
+
+static inline void nvgpu_nvs_remove_support(struct gk20a *g)
+{
+}
+
+static inline int nvgpu_nvs_suspend(struct gk20a *g)
+{
+	return 0;
+}
+
 static inline struct nvgpu_nvs_domain *
 nvgpu_nvs_domain_get(struct gk20a *g, const char *name)
 {
 	return NULL;
 }
+
 static inline void nvgpu_nvs_domain_put(struct gk20a *g, struct nvgpu_nvs_domain *dom)
 {
 }
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_nvs.c b/drivers/gpu/nvgpu/os/linux/ioctl_nvs.c
index 0e5f906c4..56e871219 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_nvs.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_nvs.c
@@ -30,16 +30,10 @@ int nvgpu_nvs_dev_open(struct inode *inode, struct file *filp)
 {
 	struct nvgpu_cdev *cdev;
 	struct gk20a *g;
-	int err;
 
 	cdev = container_of(inode->i_cdev, struct nvgpu_cdev, cdev);
 	g = nvgpu_get_gk20a_from_cdev(cdev);
 
-	err = nvgpu_nvs_open(g);
-	if (err != 0) {
-		return err;
-	}
-
 	filp->private_data = g;
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c
index 21ffbe243..cde92284a 100644
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -66,6 +66,7 @@
 #include <nvgpu/cic_mon.h>
 #include <nvgpu/cic_rm.h>
 #include <nvgpu/fb.h>
+#include <nvgpu/nvs.h>
 
 #include "platform_gk20a.h"
 #include "sysfs.h"
@@ -1027,6 +1028,8 @@ void gk20a_remove_support(struct gk20a *g)
 		g->mm.remove_ce_support(&g->mm);
 #endif
 
+	nvgpu_nvs_remove_support(g);
+
 	if (g->fifo.remove_support)
 		g->fifo.remove_support(&g->fifo);