diff --git a/drivers/gpu/nvgpu/common/fifo/runlist.c b/drivers/gpu/nvgpu/common/fifo/runlist.c index 59f2b4b6b..a5a95526b 100644 --- a/drivers/gpu/nvgpu/common/fifo/runlist.c +++ b/drivers/gpu/nvgpu/common/fifo/runlist.c @@ -432,7 +432,7 @@ static int nvgpu_runlist_reconstruct_locked(struct gk20a *g, return 0; } -static void nvgpu_runlist_swap_mem(struct nvgpu_runlist_domain *domain) +void nvgpu_runlist_swap_mem(struct gk20a *g, struct nvgpu_runlist_domain *domain) { struct nvgpu_runlist_mem *mem_tmp; @@ -440,43 +440,36 @@ static void nvgpu_runlist_swap_mem(struct nvgpu_runlist_domain *domain) * mem becomes the previously scheduled buffer and it can be modified once * the runlist lock is released. */ + rl_dbg(g, "Swapping mem for rl domain[%s]", domain->name); mem_tmp = domain->mem; domain->mem = domain->mem_hw; domain->mem_hw = mem_tmp; } -static int nvgpu_runlist_submit_locked(struct gk20a *g, struct nvgpu_runlist *rl, - struct nvgpu_runlist_domain *domain, bool wait_for_finish) +static int nvgpu_runlist_domain_actual_submit(struct gk20a *g, struct nvgpu_runlist *rl, + bool swap_buffer, bool wait_for_finish) { int ret = 0; - /* - * hw_submit updates mem_hw to hardware; swap the buffers now. - */ - nvgpu_runlist_swap_mem(domain); + rl_dbg(g, "Runlist[%u]: submitting domain %s", + rl->id, rl->domain->name); + + if (swap_buffer) { + nvgpu_runlist_swap_mem(g, rl->domain); + } + + nvgpu_atomic_set(&rl->domain->pending_update, 0); - /* - * A non-active domain may be updated, but submit still the currently - * active one just for simplicity. - * - * TODO: Later on, updates and submits will need to be totally - * decoupled so that submits are done only in the domain scheduler. - */ g->ops.runlist.hw_submit(g, rl); if (wait_for_finish) { ret = g->ops.runlist.wait_pending(g, rl); - if (ret == -ETIMEDOUT) { nvgpu_err(g, "runlist %d update timeout", rl->id); /* trigger runlist update timeout recovery */ return ret; - } else { - if (ret == -EINTR) { - nvgpu_err(g, "runlist update interrupted"); - } } } @@ -515,6 +508,8 @@ static int nvgpu_runlist_update_mem_locked(struct gk20a *g, struct nvgpu_runlist return ret; } + nvgpu_atomic_set(&domain->pending_update, 1); + return ret; } @@ -524,7 +519,7 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl, bool wait_for_finish) { int ret = 0; - + (void)wait_for_finish; /* * Certain use-cases might not have existing user rl domains, fall * back to shadow domain. @@ -539,16 +534,6 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl, if (ret != 0) { return ret; } - - /* - * A submit assumes domain->mem_hw to be the active buffer, - * and the reconstruction above updates domain->mem, and the swap happens - * in nvgpu_runlist_submit_locked which is done below for only the user - * domain so calling swap_mem here is "equivalent" to nvgpu_runlist_submit_locked - * to keep the ordering for any shadow rl domain submits that may happen in the - * future without going via this nvgpu_runlist_update_locked path. 
- */ - nvgpu_runlist_swap_mem(rl->shadow_rl_domain); } ret = nvgpu_runlist_update_mem_locked(g, rl, domain, ch, add, true); @@ -556,8 +541,6 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl, return ret; } - ret = nvgpu_runlist_submit_locked(g, rl, domain, wait_for_finish); - return ret; } @@ -578,18 +561,29 @@ int nvgpu_runlist_reschedule(struct nvgpu_channel *ch, bool preempt_next, if (nvgpu_mutex_tryacquire(&runlist->runlist_lock) == 0) { return -EBUSY; } + #ifdef CONFIG_NVGPU_LS_PMU mutex_ret = nvgpu_pmu_lock_acquire( g, g->pmu, PMU_MUTEX_ID_FIFO, &token); #endif - /* - * Note that the runlist memory is not rewritten; the currently active - * buffer is just resubmitted so that scheduling begins from the first - * entry in it. - */ - g->ops.runlist.hw_submit(g, runlist); + nvgpu_atomic_set(&runlist->domain->pending_update, 1); +#ifdef CONFIG_NVS_PRESENT + ret = g->nvs_worker_submit(g, runlist, runlist->domain, false, wait_preempt); +#else + ret = nvgpu_rl_domain_sync_submit(g, runlist, runlist->domain, false, wait_preempt); +#endif + if (ret != 0) { + if (ret == 1) { + ret = 0; + } + goto done; + } + + /* Acquiring runlist lock above guarantees that the current + * domain won't be switched. + */ if (preempt_next) { if (g->ops.runlist.reschedule_preempt_next_locked(ch, wait_preempt) != 0) { @@ -601,6 +595,7 @@ int nvgpu_runlist_reschedule(struct nvgpu_channel *ch, bool preempt_next, nvgpu_err(g, "wait pending failed for runlist %u", runlist->id); } +done: #ifdef CONFIG_NVGPU_LS_PMU if (mutex_ret == 0) { if (nvgpu_pmu_lock_release(g, g->pmu, @@ -624,11 +619,12 @@ static int nvgpu_runlist_do_update(struct gk20a *g, struct nvgpu_runlist *rl, struct nvgpu_channel *ch, bool add, bool wait_for_finish) { + int ret = 0; + #ifdef CONFIG_NVGPU_LS_PMU u32 token = PMU_INVALID_MUTEX_OWNER_ID; int mutex_ret = 0; #endif - int ret = 0; nvgpu_log_fn(g, " "); @@ -638,6 +634,17 @@ static int nvgpu_runlist_do_update(struct gk20a *g, struct nvgpu_runlist *rl, PMU_MUTEX_ID_FIFO, &token); #endif ret = nvgpu_runlist_update_locked(g, rl, domain, ch, add, wait_for_finish); + if (ret == 0) { + #ifdef CONFIG_NVS_PRESENT + ret = g->nvs_worker_submit(g, rl, domain, true, wait_for_finish); + #else + ret = nvgpu_rl_domain_sync_submit(g, rl, domain, true, wait_for_finish); + #endif + /* Deferred Update */ + if (ret == 1) { + ret = 0; + } + } #ifdef CONFIG_NVGPU_LS_PMU if (mutex_ret == 0) { if (nvgpu_pmu_lock_release(g, g->pmu, @@ -655,8 +662,27 @@ static int nvgpu_runlist_do_update(struct gk20a *g, struct nvgpu_runlist *rl, return ret; } -static void runlist_select_locked(struct gk20a *g, struct nvgpu_runlist *runlist, - struct nvgpu_runlist_domain *next_domain) +/* + * This is expected to be called only when device is powered on. 
+ */ +static int runlist_submit_powered(struct gk20a *g, struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *next_domain, bool swap_buffer, + bool wait_for_finish) +{ + int err; + + runlist->domain = next_domain; + + rl_dbg(g, "Runlist[%u]: switching to domain %s", + runlist->id, next_domain->name); + + err = nvgpu_runlist_domain_actual_submit(g, runlist, swap_buffer, wait_for_finish); + + return err; +} + +static int runlist_select_and_submit(struct gk20a *g, struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *next_domain, bool wait_for_finish) { int err; @@ -670,7 +696,7 @@ static void runlist_select_locked(struct gk20a *g, struct nvgpu_runlist *runlist rl_dbg(g, "Runlist[%u]: power is off, skip submit", runlist->id); gk20a_idle_nosuspend(g); - return; + return 0; } err = gk20a_busy(g); @@ -683,67 +709,71 @@ static void runlist_select_locked(struct gk20a *g, struct nvgpu_runlist *runlist * the error. Power is already on when the domain scheduler is * actually in use. */ - return; + return err; } - /* - * Just submit the previously built mem (in nvgpu_runlist_update_locked) - * of the active domain to hardware. In the future, the main scheduling - * loop will get signaled when the RL mem is modified and the same domain - * with new data needs to be submitted (typically triggered by a channel - * getting opened or closed). For now, that code path executes separately. - */ - g->ops.runlist.hw_submit(g, runlist); + err = runlist_submit_powered(g, runlist, next_domain, false, wait_for_finish); gk20a_idle(g); + + return err; } -static void runlist_switch_domain_locked(struct gk20a *g, - struct nvgpu_runlist *runlist) +int nvgpu_rl_domain_sync_submit(struct gk20a *g, struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *next_domain, bool swap_buffers, + bool wait_for_finish) { - struct nvgpu_runlist_domain *domain; - struct nvgpu_runlist_domain *last; + int err = 0; - /* - * When the last of user created rl domains is removed, - * driver switches to the default domain. Hence, exit. + if (next_domain == NULL) { + next_domain = runlist->shadow_rl_domain; + } + + if (nvgpu_atomic_read(&next_domain->pending_update) == 1) { + err = runlist_submit_powered(g, runlist, next_domain, swap_buffers, + wait_for_finish); + } + + return err; +} + +static int runlist_switch_domain_and_submit(struct gk20a *g, + struct nvgpu_runlist *runlist, struct nvgpu_runlist_domain *rl_domain) +{ + int ret = 0; + struct nvgpu_runlist_domain *prev_rl_domain = runlist->domain; + + /* If no user domains exist, submit the shadow_rl_domain if + * pending is set to true. When the last user domain is removed, + * shadow_rl_domain will have pending_update set to true. + * Eventually, this logic will change. For manual mode, this needs + * to be submitted irrespective of the status of pending_update. */ if (nvgpu_list_empty(&runlist->user_rl_domains)) { - return; - } - - /* - * If there are user created rl domains available, - * runlist->domain always points to one of them. - */ - domain = runlist->domain; - last = nvgpu_list_last_entry(&runlist->user_rl_domains, - nvgpu_runlist_domain, domains_list); - - if (domain == last) { - domain = nvgpu_list_first_entry(&runlist->user_rl_domains, - nvgpu_runlist_domain, domains_list); + if (nvgpu_atomic_read(&rl_domain->pending_update) == 0) { + return 0; + } } else { - domain = nvgpu_list_next_entry(domain, - nvgpu_runlist_domain, domains_list); + /* If only one user domain exists, return if no pending + * update exists. 
+ */ + if (prev_rl_domain == rl_domain) { + if (nvgpu_atomic_read(&prev_rl_domain->pending_update) == 0) { + return 0; + } + } } - if (domain != runlist->domain) { - runlist_select_locked(g, runlist, domain); - } + ret = runlist_select_and_submit(g, runlist, rl_domain, false); + + return ret; } -static void runlist_switch_domain(struct gk20a *g, struct nvgpu_runlist *runlist) -{ - nvgpu_mutex_acquire(&runlist->runlist_lock); - runlist_switch_domain_locked(g, runlist); - nvgpu_mutex_release(&runlist->runlist_lock); -} - -void nvgpu_runlist_tick(struct gk20a *g) +void nvgpu_runlist_tick(struct gk20a *g, struct nvgpu_runlist_domain **rl_domain) { struct nvgpu_fifo *f = &g->fifo; u32 i; + int err = 0; rl_dbg(g, "domain tick"); @@ -751,7 +781,10 @@ void nvgpu_runlist_tick(struct gk20a *g) struct nvgpu_runlist *runlist; runlist = &f->active_runlists[i]; - runlist_switch_domain(g, runlist); + err = runlist_switch_domain_and_submit(g, runlist, rl_domain[i]); + if (err != 0) { + nvgpu_err(g, "Failed to schedule domain [%s]", rl_domain[i]->name); + } } } @@ -864,13 +897,13 @@ static void free_rl_mem(struct gk20a *g, struct nvgpu_runlist_mem *mem) nvgpu_kfree(g, mem); } -static void nvgpu_runlist_domain_unlink(struct nvgpu_runlist_domain *domain) +static void nvgpu_runlist_domain_unlink_locked(struct nvgpu_runlist_domain *domain) { /* added in nvgpu_runlist_domain_alloc() */ nvgpu_list_del(&domain->domains_list); } -static void nvgpu_runlist_domain_free(struct gk20a *g, +void nvgpu_runlist_domain_free(struct gk20a *g, struct nvgpu_runlist_domain *domain) { free_rl_mem(g, domain->mem); @@ -885,51 +918,12 @@ static void nvgpu_runlist_domain_free(struct gk20a *g, nvgpu_kfree(g, domain); } -static void nvgpu_runlist_domain_unlink_and_free(struct gk20a *g, +void nvgpu_runlist_unlink_domain(struct nvgpu_runlist *runlist, struct nvgpu_runlist_domain *domain) { - nvgpu_runlist_domain_unlink(domain); - nvgpu_runlist_domain_free(g, domain); -} - -int nvgpu_rl_domain_delete(struct gk20a *g, const char *name) -{ - struct nvgpu_fifo *f = &g->fifo; - u32 i; - - for (i = 0; i < f->num_runlists; i++) { - struct nvgpu_runlist *runlist; - struct nvgpu_runlist_domain *domain; - - runlist = &f->active_runlists[i]; - - nvgpu_mutex_acquire(&runlist->runlist_lock); - domain = nvgpu_rl_domain_get(g, runlist->id, name); - if (domain != NULL) { - struct nvgpu_runlist_domain *first; - struct nvgpu_runlist_domain *last; - - rl_dbg(g, "deleting rl domain [%s]", domain->name); - - first = nvgpu_list_first_entry(&runlist->user_rl_domains, - nvgpu_runlist_domain, domains_list); - - last = nvgpu_list_last_entry(&runlist->user_rl_domains, - nvgpu_runlist_domain, domains_list); - - if (first == last) { - /* Last of the user created rl domains, switch to default rl domain */ - runlist_select_locked(g, runlist, runlist->shadow_rl_domain); - } else if (domain == runlist->domain) { - /* Don't let the HW access this anymore, switch to another rl domain */ - runlist_switch_domain_locked(g, runlist); - } - nvgpu_runlist_domain_unlink_and_free(g, domain); - } - nvgpu_mutex_release(&runlist->runlist_lock); - } - - return 0; + nvgpu_mutex_acquire(&runlist->runlist_lock); + nvgpu_runlist_domain_unlink_locked(domain); + nvgpu_mutex_release(&runlist->runlist_lock); } void nvgpu_runlist_cleanup_sw(struct gk20a *g) @@ -954,7 +948,7 @@ void nvgpu_runlist_cleanup_sw(struct gk20a *g) nvgpu_runlist_domain, domains_list); - nvgpu_runlist_domain_unlink_and_free(g, domain); + nvgpu_runlist_domain_unlink_locked(domain); } /* this isn't an owning pointer, 
just reset */ runlist->domain = NULL; @@ -1110,19 +1104,16 @@ static struct nvgpu_runlist_mem *init_rl_mem(struct gk20a *g, u32 runlist_size) return mem; } -static void nvgpu_runlist_link_domain(struct nvgpu_runlist *runlist, +void nvgpu_runlist_link_domain(struct nvgpu_runlist *runlist, struct nvgpu_runlist_domain *domain) { - /* deleted in nvgpu_runlist_domain_unlink() */ + nvgpu_mutex_acquire(&runlist->runlist_lock); + /* deleted in nvgpu_runlist_domain_unlink_locked() */ nvgpu_list_add_tail(&domain->domains_list, &runlist->user_rl_domains); - - /* Select the first created domain as the boot-time default */ - if (runlist->domain == runlist->shadow_rl_domain) { - runlist->domain = domain; - } + nvgpu_mutex_release(&runlist->runlist_lock); } -static struct nvgpu_runlist_domain *nvgpu_runlist_domain_alloc(struct gk20a *g, +struct nvgpu_runlist_domain *nvgpu_runlist_domain_alloc(struct gk20a *g, const char *name) { struct nvgpu_runlist_domain *domain = nvgpu_kzalloc(g, sizeof(*domain)); @@ -1160,6 +1151,8 @@ static struct nvgpu_runlist_domain *nvgpu_runlist_domain_alloc(struct gk20a *g, goto free_active_channels; } + nvgpu_atomic_set(&domain->pending_update, 0); + return domain; free_active_channels: nvgpu_kfree(g, domain->active_channels); @@ -1189,43 +1182,6 @@ struct nvgpu_runlist_domain *nvgpu_rl_domain_get(struct gk20a *g, u32 runlist_id return NULL; } -int nvgpu_rl_domain_alloc(struct gk20a *g, const char *name) -{ - struct nvgpu_fifo *f = &g->fifo; - int err; - u32 i; - - for (i = 0U; i < f->num_runlists; i++) { - struct nvgpu_runlist *runlist; - struct nvgpu_runlist_domain *domain; - - runlist = &f->active_runlists[i]; - - nvgpu_mutex_acquire(&runlist->runlist_lock); - /* this may only happen on the very first runlist */ - if (nvgpu_rl_domain_get(g, runlist->id, name) != NULL) { - nvgpu_mutex_release(&runlist->runlist_lock); - return -EEXIST; - } - - domain = nvgpu_runlist_domain_alloc(g, name); - if (domain == NULL) { - nvgpu_mutex_release(&runlist->runlist_lock); - err = -ENOMEM; - goto clear; - } - - nvgpu_runlist_link_domain(runlist, domain); - nvgpu_mutex_release(&runlist->runlist_lock); - } - - return 0; -clear: - /* deletion skips runlists where the domain isn't found */ - (void)nvgpu_rl_domain_delete(g, name); - return err; -} - static void nvgpu_init_active_runlist_mapping(struct gk20a *g) { struct nvgpu_fifo *f = &g->fifo; diff --git a/drivers/gpu/nvgpu/common/nvs/nvs_sched.c b/drivers/gpu/nvgpu/common/nvs/nvs_sched.c index e11ad8668..131e21312 100644 --- a/drivers/gpu/nvgpu/common/nvs/nvs_sched.c +++ b/drivers/gpu/nvgpu/common/nvs/nvs_sched.c @@ -43,8 +43,18 @@ static struct nvs_sched_ops nvgpu_nvs_ops = { * - currently it just locks all affected runlists * - consider pausing the scheduler logic and signaling users */ + struct nvgpu_nvs_worker_item { + struct gk20a *g; + struct nvgpu_runlist *rl; + struct nvgpu_runlist_domain *rl_domain; + struct nvgpu_cond cond; + bool swap_buffer; + bool wait_for_finish; + bool locked; + int status; struct nvgpu_list_node list; + nvgpu_atomic_t state; }; static inline struct nvgpu_nvs_worker_item * @@ -70,6 +80,9 @@ static void nvgpu_nvs_worker_poll_init(struct nvgpu_worker *worker) nvs_worker->current_timeout = 100; nvgpu_timeout_init_cpu_timer_sw(worker->g, &nvs_worker->timeout, nvs_worker->current_timeout); + + nvgpu_atomic_set(&nvs_worker->nvs_sched_init, 1); + nvgpu_cond_signal(&nvs_worker->worker.wq); } static u32 nvgpu_nvs_worker_wakeup_timeout(struct nvgpu_worker *worker) @@ -80,20 +93,12 @@ static u32 
nvgpu_nvs_worker_wakeup_timeout(struct nvgpu_worker *worker) return nvs_worker->current_timeout; } -static void nvgpu_nvs_worker_wakeup_process_item( - struct nvgpu_list_node *work_item) -{ - struct nvgpu_nvs_worker_item *item = - nvgpu_nvs_worker_item_from_worker_item(work_item); - (void)item; - /* placeholder; never called yet */ -} - static u64 nvgpu_nvs_tick(struct gk20a *g) { struct nvgpu_nvs_scheduler *sched = g->scheduler; struct nvgpu_nvs_domain *domain; struct nvs_domain *nvs_next; + struct nvgpu_nvs_domain *nvgpu_domain_next; u64 timeslice; nvs_dbg(g, "nvs tick"); @@ -109,8 +114,9 @@ static u64 nvgpu_nvs_tick(struct gk20a *g) } timeslice = nvs_next->timeslice_ns; + nvgpu_domain_next = nvs_next->priv; - nvgpu_runlist_tick(g); + nvgpu_runlist_tick(g, nvgpu_domain_next->rl_domains); sched->active_domain = nvs_next->priv; nvgpu_mutex_release(&g->sched_mutex); @@ -118,6 +124,113 @@ static u64 nvgpu_nvs_tick(struct gk20a *g) return timeslice; } +static void nvgpu_nvs_worker_wakeup_process_item(struct nvgpu_list_node *work_item) +{ + struct nvgpu_nvs_worker_item *work = + nvgpu_nvs_worker_item_from_worker_item(work_item); + struct gk20a *g = work->g; + int ret = 0; + struct nvgpu_nvs_scheduler *sched = g->scheduler; + struct nvs_domain *nvs_domain; + struct nvgpu_runlist *runlist = work->rl; + struct nvgpu_runlist_domain *rl_domain = work->rl_domain; + + nvgpu_mutex_acquire(&g->sched_mutex); + + if (rl_domain == NULL) { + nvs_domain = sched->shadow_domain->parent; + rl_domain = runlist->shadow_rl_domain; + } else if (strcmp(rl_domain->name, SHADOW_DOMAIN_NAME) == 0) { + nvs_domain = sched->shadow_domain->parent; + } else { + nvs_domain = nvs_domain_by_name(sched->sched, rl_domain->name); + if (nvs_domain == NULL) { + nvgpu_err(g, "Unable to find domain[%s]", rl_domain->name); + ret = -EINVAL; + goto done; + } + } + + if (sched->active_domain == nvs_domain->priv) { + /* Instantly switch domain and force runlist updates */ + ret = nvgpu_rl_domain_sync_submit(g, runlist, rl_domain, work->swap_buffer, work->wait_for_finish); + } else { + /* Swap buffers here even if its deferred for correctness */ + if (work->swap_buffer) { + nvgpu_runlist_swap_mem(g, rl_domain); + } + ret = 1; + } + + nvs_dbg(g, " "); + +done: + nvgpu_mutex_release(&g->sched_mutex); + work->status = ret; + nvgpu_atomic_set(&work->state, 1); + /* Wakeup threads waiting on runlist submit */ + nvgpu_cond_signal(&work->cond); +} + +static int nvgpu_nvs_worker_submit(struct gk20a *g, struct nvgpu_runlist *rl, + struct nvgpu_runlist_domain *rl_domain, bool swap_buffer, + bool wait_for_finish) +{ + struct nvgpu_nvs_scheduler *sched = g->scheduler; + struct nvgpu_nvs_worker *worker = &sched->worker; + struct nvgpu_nvs_worker_item *work; + int ret = 0; + + if (sched == NULL) { + return -ENODEV; + } + + nvs_dbg(g, " "); + + work = nvgpu_kzalloc(g, sizeof(*work)); + if (work == NULL) { + nvgpu_err(g, "Unable to allocate memory for runlist job"); + ret = -ENOMEM; + goto free_domain; + } + + work->g = g; + work->rl = rl; + work->rl_domain = rl_domain; + nvgpu_cond_init(&work->cond); + nvgpu_init_list_node(&work->list); + work->swap_buffer = swap_buffer; + work->wait_for_finish = wait_for_finish; + nvgpu_atomic_set(&work->state, 0); + + nvs_dbg(g, " enqueueing runlist submit"); + + ret = nvgpu_worker_enqueue(&worker->worker, &work->list); + if (ret != 0) { + goto fail; + } + + nvs_dbg(g, " "); + + ret = NVGPU_COND_WAIT(&work->cond, nvgpu_atomic_read(&work->state) == 1, 0U); + if (ret != 0) { + nvgpu_err(g, "Runlist submit interrupted 
while waiting for submit"); + goto fail; + } + + nvs_dbg(g, " "); + + ret = work->status; + +fail: + nvgpu_cond_destroy(&work->cond); + nvgpu_kfree(g, work); + +free_domain: + + return ret; +} + static void nvgpu_nvs_worker_wakeup_post_process(struct nvgpu_worker *worker) { struct gk20a *g = worker->g; @@ -146,19 +259,40 @@ static const struct nvgpu_worker_ops nvs_worker_ops = { static int nvgpu_nvs_worker_init(struct gk20a *g) { + int err = 0; struct nvgpu_worker *worker = &g->scheduler->worker.worker; + struct nvgpu_nvs_worker *nvs_worker = &g->scheduler->worker; + + nvgpu_cond_init(&nvs_worker->wq_init); + nvgpu_atomic_set(&nvs_worker->nvs_sched_init, 0); nvgpu_worker_init_name(worker, "nvgpu_nvs", g->name); - return nvgpu_worker_init(g, worker, &nvs_worker_ops); + err = nvgpu_worker_init(g, worker, &nvs_worker_ops); + if (err == 0) { + /* Ensure that the scheduler thread is started as soon as possible + * to keep application startup latency minimal. + */ + err = NVGPU_COND_WAIT(&nvs_worker->worker.wq, + nvgpu_atomic_read(&nvs_worker->nvs_sched_init) == 1, 0); + if (err != 0) { + nvgpu_err(g, "Interrupted while waiting for scheduler thread"); + } + } + + return err; } static void nvgpu_nvs_worker_deinit(struct gk20a *g) { struct nvgpu_worker *worker = &g->scheduler->worker.worker; + struct nvgpu_nvs_worker *nvs_worker = &g->scheduler->worker; nvgpu_worker_deinit(worker); + nvgpu_atomic_set(&nvs_worker->nvs_sched_init, 0); + nvgpu_cond_destroy(&nvs_worker->wq_init); + nvs_dbg(g, "NVS worker suspended"); } @@ -166,8 +300,10 @@ static struct nvgpu_nvs_domain * nvgpu_nvs_gen_domain(struct gk20a *g, const char *name, u64 id, u64 timeslice, u64 preempt_grace) { + struct nvgpu_fifo *f = &g->fifo; struct nvs_domain *nvs_dom = NULL; struct nvgpu_nvs_domain *nvgpu_dom = NULL; + u32 num_runlists = f->num_runlists; nvs_dbg(g, "Adding new domain: %s", name); @@ -177,6 +313,14 @@ static struct nvgpu_nvs_domain * return nvgpu_dom; } + nvgpu_dom->rl_domains = nvgpu_kzalloc(g, sizeof(*nvgpu_dom->rl_domains) * num_runlists); + if (nvgpu_dom->rl_domains == NULL) { + nvs_dbg(g, "failed to allocate memory for domain->rl_domains"); + nvgpu_kfree(g, nvgpu_dom); + nvgpu_dom = NULL; + return nvgpu_dom; + } + nvgpu_dom->id = id; nvgpu_dom->ref = 1U; @@ -185,6 +329,7 @@ static struct nvgpu_nvs_domain * if (nvs_dom == NULL) { nvs_dbg(g, "failed to create nvs domain for %s", name); + nvgpu_kfree(g, nvgpu_dom->rl_domains); nvgpu_kfree(g, nvgpu_dom); nvgpu_dom = NULL; return nvgpu_dom; @@ -195,6 +340,19 @@ static struct nvgpu_nvs_domain * return nvgpu_dom; } +static void nvgpu_nvs_link_shadow_rl_domains(struct gk20a *g, + struct nvgpu_nvs_domain *nvgpu_dom) +{ + struct nvgpu_fifo *f = &g->fifo; + u32 num_runlists = f->num_runlists; + u32 i; + + for (i = 0U; i < num_runlists; i++) { + struct nvgpu_runlist *runlist = &f->active_runlists[i]; + nvgpu_dom->rl_domains[i] = runlist->shadow_rl_domain; + } +} + static int nvgpu_nvs_gen_shadow_domain(struct gk20a *g) { int err = 0; @@ -211,6 +369,8 @@ static int nvgpu_nvs_gen_shadow_domain(struct gk20a *g) goto error; } + nvgpu_nvs_link_shadow_rl_domains(g, nvgpu_dom); + g->scheduler->shadow_domain = nvgpu_dom; /* Set active_domain to shadow_domain during Init */ @@ -243,6 +403,8 @@ static void nvgpu_nvs_remove_shadow_domain(struct gk20a *g) nvs_dom = sched->shadow_domain->parent; nvs_domain_destroy(sched->sched, nvs_dom); + nvgpu_kfree(g, sched->shadow_domain->rl_domains); + sched->shadow_domain->rl_domains = NULL; nvgpu_kfree(g, sched->shadow_domain); sched->shadow_domain = NULL;
} @@ -339,6 +501,7 @@ int nvgpu_nvs_open(struct gk20a *g) goto unlock; } + g->nvs_worker_submit = nvgpu_nvs_worker_submit; unlock: if (err) { nvs_dbg(g, " Failed! Error code: %d", err); @@ -362,12 +525,52 @@ static u64 nvgpu_nvs_new_id(struct gk20a *g) return nvgpu_atomic64_inc_return(&g->scheduler->id_counter); } +static int nvgpu_nvs_create_rl_domain_mem(struct gk20a *g, + struct nvgpu_nvs_domain *domain, const char *name) +{ + struct nvgpu_fifo *f = &g->fifo; + u32 i, j; + int err = 0; + + for (i = 0U; i < f->num_runlists; i++) { + domain->rl_domains[i] = nvgpu_runlist_domain_alloc(g, name); + if (domain->rl_domains[i] == NULL) { + err = -ENOMEM; + break; + } + } + + if (err != 0) { + for (j = 0; j != i; j++) { + nvgpu_runlist_domain_free(g, domain->rl_domains[j]); + domain->rl_domains[j] = NULL; + } + } + + return err; +} + +static void nvgpu_nvs_link_rl_domains(struct gk20a *g, + struct nvgpu_nvs_domain *domain) +{ + struct nvgpu_fifo *f = &g->fifo; + u32 i; + + for (i = 0U; i < f->num_runlists; i++) { + struct nvgpu_runlist *runlist; + + runlist = &f->active_runlists[i]; + nvgpu_runlist_link_domain(runlist, domain->rl_domains[i]); + } +} + int nvgpu_nvs_add_domain(struct gk20a *g, const char *name, u64 timeslice, u64 preempt_grace, struct nvgpu_nvs_domain **pdomain) { int err = 0; struct nvs_domain *nvs_dom; struct nvgpu_nvs_domain *nvgpu_dom; + struct nvgpu_nvs_scheduler *sched = g->scheduler; nvgpu_mutex_acquire(&g->sched_mutex); @@ -383,28 +586,26 @@ int nvgpu_nvs_add_domain(struct gk20a *g, const char *name, u64 timeslice, goto unlock; } - nvs_dom = nvgpu_dom->parent; - - nvs_domain_scheduler_attach(g->scheduler->sched, nvs_dom); - - err = nvgpu_rl_domain_alloc(g, name); + err = nvgpu_nvs_create_rl_domain_mem(g, nvgpu_dom, name); if (err != 0) { - nvs_dbg(g, "failed to alloc rl domain for %s", name); - nvs_domain_unlink_and_destroy(g->scheduler->sched, nvs_dom); + nvs_domain_destroy(sched->sched, nvgpu_dom->parent); + nvgpu_kfree(g, nvgpu_dom->rl_domains); nvgpu_kfree(g, nvgpu_dom); goto unlock; } - nvgpu_dom->parent = nvs_dom; + nvgpu_nvs_link_rl_domains(g, nvgpu_dom); - /* Set the first user created domain as active domain */ - if (g->scheduler->active_domain == g->scheduler->shadow_domain) { - g->scheduler->active_domain = nvgpu_dom; - } + nvs_dom = nvgpu_dom->parent; + + nvs_domain_scheduler_attach(g->scheduler->sched, nvs_dom); + + nvgpu_dom->parent = nvs_dom; *pdomain = nvgpu_dom; unlock: nvgpu_mutex_release(&g->sched_mutex); + return err; } @@ -492,6 +693,30 @@ void nvgpu_nvs_domain_put(struct gk20a *g, struct nvgpu_nvs_domain *dom) nvgpu_mutex_release(&g->sched_mutex); } +static void nvgpu_nvs_delete_rl_domain_mem(struct gk20a *g, struct nvgpu_nvs_domain *dom) +{ + struct nvgpu_fifo *f = &g->fifo; + u32 i; + + for (i = 0U; i < f->num_runlists; i++) { + nvgpu_runlist_domain_free(g, dom->rl_domains[i]); + dom->rl_domains[i] = NULL; + } +} + +static void nvgpu_nvs_unlink_rl_domains(struct gk20a *g, struct nvgpu_nvs_domain *domain) +{ + struct nvgpu_fifo *f = &g->fifo; + u32 i; + + for (i = 0; i < f->num_runlists; i++) { + struct nvgpu_runlist *runlist; + runlist = &f->active_runlists[i]; + + nvgpu_runlist_unlink_domain(runlist, domain->rl_domains[i]); + } +} + int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id) { struct nvgpu_nvs_scheduler *s = g->scheduler; @@ -519,23 +744,13 @@ int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id) nvs_dom = nvgpu_dom->parent; - err = nvgpu_rl_domain_delete(g, nvs_dom->name); - if (err != 0) { - nvs_dbg(g, "failed to delete RL domains on 
%llu!", dom_id); - /* - * The RL domains require the existence of at least one domain; - * this path inherits that logic until it's been made more - * flexible. - */ - goto unlock; - } - + nvgpu_nvs_unlink_rl_domains(g, nvgpu_dom); + nvgpu_nvs_delete_rl_domain_mem(g, nvgpu_dom); nvgpu_dom->ref = 0U; - /* note: same wraparound logic as in RL domains to keep in sync */ if (s->active_domain == nvgpu_dom) { nvs_next = nvs_domain_get_next_domain(s->sched, nvs_dom); - /* Its the only entry in the list. Set the default domain as the active domain */ + /* It's the only entry in the list. Set the shadow domain as the active domain. */ if (nvs_next == nvs_dom) { nvs_next = s->shadow_domain->parent; } @@ -543,6 +758,8 @@ int nvgpu_nvs_del_domain(struct gk20a *g, u64 dom_id) } nvs_domain_unlink_and_destroy(s->sched, nvs_dom); + + nvgpu_kfree(g, nvgpu_dom->rl_domains); nvgpu_kfree(g, nvgpu_dom); unlock: diff --git a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c index 45728cb52..f36c8bd95 100644 --- a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c @@ -123,6 +123,16 @@ static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g, nvgpu_err(g, "runlist id %d is not cleaned up", runlist->id); } +#ifdef CONFIG_NVS_PRESENT + /* Special case. Submit the recovery runlist now */ + err = g->nvs_worker_submit(g, runlist, runlist->domain, true, false); +#else + err = nvgpu_rl_domain_sync_submit(g, runlist, runlist->domain, true, false); +#endif + if (err != 0 && err != 1) { + nvgpu_err(g, "runlist id %d is not cleaned up", + runlist->id); + } nvgpu_tsg_abort(g, tsg, false); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index c1313a493..8928ff797 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -74,6 +74,8 @@ struct nvgpu_fifo; struct nvgpu_channel; struct nvgpu_gr; struct nvgpu_fbp; +struct nvgpu_runlist; +struct nvgpu_runlist_domain; #ifdef CONFIG_NVGPU_SIM struct sim_nvgpu; #endif @@ -900,6 +902,20 @@ struct gk20a { #ifdef CONFIG_NVS_PRESENT struct nvgpu_nvs_scheduler *scheduler; struct nvgpu_mutex sched_mutex; + + /** + * A global interface to notify the NVS thread about a domain + * modification. Wakes up the worker thread to process the domain + * submission synchronously. If the submitted rl_domain is currently + * active, it is updated immediately; otherwise the call returns and + * the NVS thread eventually schedules the domain update. + * + * The NVS lock must not be held while invoking this interface. + * The runlist lock must be held while invoking this interface. + */ + int (*nvs_worker_submit)(struct gk20a *g, struct nvgpu_runlist *rl, + struct nvgpu_runlist_domain *rl_domain, bool swap_buffer, + bool wait_for_finish); #endif #ifdef CONFIG_NVGPU_ENABLE_MISC_EC diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvs.h b/drivers/gpu/nvgpu/include/nvgpu/nvs.h index 1ba508d6e..2c2ac81a3 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvs.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvs.h @@ -39,6 +39,8 @@ struct gk20a; struct nvgpu_nvs_domain_ioctl; +struct nvgpu_runlist; +struct nvgpu_runlist_domain; /* * NvGPU KMD domain implementation details for nvsched. @@ -71,9 +73,16 @@ struct nvgpu_nvs_domain { * Userspace API on the device nodes.
*/ struct nvgpu_nvs_domain_ioctl *ioctl; + + /* + * One rl domain corresponding to every runlist. + */ + struct nvgpu_runlist_domain **rl_domains; }; struct nvgpu_nvs_worker { + nvgpu_atomic_t nvs_sched_init; + struct nvgpu_cond wq_init; struct nvgpu_worker worker; struct nvgpu_timeout timeout; u32 current_timeout; diff --git a/drivers/gpu/nvgpu/include/nvgpu/runlist.h b/drivers/gpu/nvgpu/include/nvgpu/runlist.h index 6a2da5d2d..a90192f45 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/runlist.h +++ b/drivers/gpu/nvgpu/include/nvgpu/runlist.h @@ -26,6 +26,7 @@ #include #include #include +#include /** * @file */ @@ -139,6 +140,14 @@ struct nvgpu_runlist_domain { /** Currently active buffer submitted for hardware. */ struct nvgpu_runlist_mem *mem_hw; + + /** + * When a channel is added or removed, this value is set to true. + * When this rl domain is scheduled for submission to the h/w, + * mem and mem_hw are swapped, mem_hw is submitted, and the value + * is set back to false. + */ + nvgpu_atomic_t pending_update; }; struct nvgpu_runlist { @@ -190,11 +199,34 @@ struct nvgpu_runlist { /** @endcond DOXYGEN_SHOULD_SKIP_THIS */ }; -int nvgpu_rl_domain_alloc(struct gk20a *g, const char *name); -int nvgpu_rl_domain_delete(struct gk20a *g, const char *name); +bool nvgpu_rl_domain_exists(struct gk20a *g, const char *name); +struct nvgpu_runlist_domain *nvgpu_runlist_domain_alloc(struct gk20a *g, + const char *name); +void nvgpu_runlist_domain_free(struct gk20a *g, + struct nvgpu_runlist_domain *domain); +void nvgpu_runlist_swap_mem(struct gk20a *g, struct nvgpu_runlist_domain *domain); +void nvgpu_runlist_link_domain(struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *domain); +void nvgpu_runlist_unlink_domain(struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *domain); struct nvgpu_runlist_domain *nvgpu_rl_domain_get(struct gk20a *g, u32 runlist_id, const char *name); +/** + * @brief Schedule a runlist domain + * + * @param g Global gk20a struct + * @param runlist Runlist context + * @param next_domain Domain to be scheduled; if NULL, the shadow domain is used + * @param swap_buffers Swap the domain's mem and mem_hw buffers before the submit + * @param wait_for_finish Wait for the runlist update to complete + * @return 0 in case of success, less than 0 otherwise. + * + * Submit next_domain if there is a pending update.
+ */ +int nvgpu_rl_domain_sync_submit(struct gk20a *g, struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *next_domain, bool swap_buffers, + bool wait_for_finish); + static inline struct nvgpu_runlist_domain * nvgpu_runlist_domain_from_domains_list(struct nvgpu_list_node *node) { @@ -202,7 +234,7 @@ nvgpu_runlist_domain_from_domains_list(struct nvgpu_list_node *node) ((uintptr_t)node - offsetof(struct nvgpu_runlist_domain, domains_list)); } -void nvgpu_runlist_tick(struct gk20a *g); +void nvgpu_runlist_tick(struct gk20a *g, struct nvgpu_runlist_domain **rl_domain); /** * @brief Rebuild runlist diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export b/libs/dgpu/libnvgpu-drv-dgpu_safe.export index 3dd8481c0..0be8774aa 100644 --- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export +++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export @@ -673,7 +673,6 @@ nvgpu_runlist_unlock_active_runlists nvgpu_runlist_unlock_runlists nvgpu_runlist_update nvgpu_runlist_update_locked -nvgpu_rl_domain_alloc nvgpu_rwsem_init nvgpu_rwsem_down_read nvgpu_rwsem_down_write diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export index 8adf0a58a..041be4cde 100644 --- a/libs/igpu/libnvgpu-drv-igpu_safe.export +++ b/libs/igpu/libnvgpu-drv-igpu_safe.export @@ -691,7 +691,6 @@ nvgpu_runlist_unlock_active_runlists nvgpu_runlist_unlock_runlists nvgpu_runlist_update nvgpu_runlist_update_locked -nvgpu_rl_domain_alloc nvgpu_rwsem_init nvgpu_rwsem_down_read nvgpu_rwsem_down_write diff --git a/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c b/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c index 4de5b083b..7435cee70 100644 --- a/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c +++ b/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -81,8 +81,6 @@ int test_gk20a_runlist_hw_submit(struct unit_module *m, struct nvgpu_runlist *runlist = g->fifo.runlists[runlist_id]; u32 count; - nvgpu_rl_domain_alloc(g, "(default)"); - for (count = 0; count < 2; count++) { nvgpu_writel(g, fifo_runlist_r(), 0);
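The hunks above change the runlist update contract: nvgpu_runlist_update_locked() now only rebuilds domain->mem and sets pending_update, while the actual hardware submit is routed through g->nvs_worker_submit (or nvgpu_rl_domain_sync_submit when CONFIG_NVS_PRESENT is not set), which returns 1 when the submit is deferred to the NVS worker. The sketch below is a minimal illustration of that calling convention, closely following nvgpu_runlist_do_update() in this patch; it is not part of the patch, and the wrapper name example_update_and_submit() is hypothetical.

/*
 * Illustrative sketch only (not part of the patch): how a caller that
 * already holds runlist->runlist_lock is expected to combine the rebuild
 * step with the new submit hook. The wrapper name is hypothetical.
 */
static int example_update_and_submit(struct gk20a *g, struct nvgpu_runlist *rl,
		struct nvgpu_runlist_domain *domain, struct nvgpu_channel *ch,
		bool add, bool wait)
{
	int ret;

	/* Rebuild domain->mem and mark the domain as having a pending update. */
	ret = nvgpu_runlist_update_locked(g, rl, domain, ch, add, wait);
	if (ret != 0) {
		return ret;
	}

#ifdef CONFIG_NVS_PRESENT
	/* Wake the NVS worker; it swaps mem/mem_hw and submits immediately
	 * only if the domain is currently active.
	 */
	ret = g->nvs_worker_submit(g, rl, domain, true, wait);
#else
	/* No scheduler thread: submit synchronously when pending_update is set. */
	ret = nvgpu_rl_domain_sync_submit(g, rl, domain, true, wait);
#endif
	if (ret == 1) {
		/* Deferred: the NVS worker will submit the domain later. */
		ret = 0;
	}

	return ret;
}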