diff --git a/drivers/gpu/nvgpu/common/fifo/runlist.c b/drivers/gpu/nvgpu/common/fifo/runlist.c
index dab8be749..41bee11ec 100644
--- a/drivers/gpu/nvgpu/common/fifo/runlist.c
+++ b/drivers/gpu/nvgpu/common/fifo/runlist.c
@@ -461,15 +461,18 @@ static int nvgpu_runlist_domain_actual_submit(struct gk20a *g, struct nvgpu_runl
 	nvgpu_atomic_set(&rl->domain->pending_update, 0);
 
-	g->ops.runlist.hw_submit(g, rl);
+	/* No submit exists for VGPU */
+	if (g->ops.runlist.hw_submit != NULL) {
+		g->ops.runlist.hw_submit(g, rl);
 
-	if (wait_for_finish) {
-		ret = g->ops.runlist.wait_pending(g, rl);
-		if (ret == -ETIMEDOUT) {
-			nvgpu_err(g, "runlist %d update timeout", rl->id);
-			/* trigger runlist update timeout recovery */
-			return ret;
+		if (wait_for_finish) {
+			ret = g->ops.runlist.wait_pending(g, rl);
+			if (ret == -ETIMEDOUT) {
+				nvgpu_err(g, "runlist %d update timeout", rl->id);
+				/* trigger runlist update timeout recovery */
+				return ret;
+			}
 		}
 	}
 
@@ -681,44 +684,6 @@ static int runlist_submit_powered(struct gk20a *g, struct nvgpu_runlist *runlist
 	return err;
 }
 
-static int runlist_select_and_submit(struct gk20a *g, struct nvgpu_runlist *runlist,
-		struct nvgpu_runlist_domain *next_domain, bool wait_for_finish)
-{
-	int err;
-
-	rl_dbg(g, "Runlist[%u]: switching to domain %llu",
-		runlist->id, next_domain->domain_id);
-
-	runlist->domain = next_domain;
-
-	gk20a_busy_noresume(g);
-	if (nvgpu_is_powered_off(g)) {
-		rl_dbg(g, "Runlist[%u]: power is off, skip submit",
-			runlist->id);
-		gk20a_idle_nosuspend(g);
-		return 0;
-	}
-
-	err = gk20a_busy(g);
-	gk20a_idle_nosuspend(g);
-
-	if (err != 0) {
-		nvgpu_err(g, "failed to hold power for runlist submit");
-		/*
-		 * probably shutting down though, so don't bother propagating
-		 * the error. Power is already on when the domain scheduler is
-		 * actually in use.
-		 */
-		return err;
-	}
-
-	err = runlist_submit_powered(g, runlist, next_domain, false, wait_for_finish);
-
-	gk20a_idle(g);
-
-	return err;
-}
-
 int nvgpu_rl_domain_sync_submit(struct gk20a *g, struct nvgpu_runlist *runlist,
 		struct nvgpu_runlist_domain *next_domain, bool swap_buffers,
 		bool wait_for_finish)
@@ -764,7 +729,7 @@ static int runlist_switch_domain_and_submit(struct gk20a *g,
 		}
 	}
 
-	ret = runlist_select_and_submit(g, runlist, rl_domain, false);
+	ret = runlist_submit_powered(g, runlist, rl_domain, false, false);
 
 	return ret;
 }
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index cbb099d2a..c07446195 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -313,6 +313,9 @@ int nvgpu_prepare_poweroff(struct gk20a *g)
 		}
 	}
 
+	/* Ensure that the worker thread is paused before the engines are suspended below */
+	nvgpu_nvs_worker_pause(g);
+
 #ifdef CONFIG_NVGPU_LS_PMU
 	/* disable elpg before gr or fifo suspend */
 	if (g->support_ls_pmu) {
diff --git a/drivers/gpu/nvgpu/common/nvs/nvs_sched.c b/drivers/gpu/nvgpu/common/nvs/nvs_sched.c
index 60e340938..2ef9d85ef 100644
--- a/drivers/gpu/nvgpu/common/nvs/nvs_sched.c
+++ b/drivers/gpu/nvgpu/common/nvs/nvs_sched.c
@@ -85,8 +85,11 @@ static void nvgpu_nvs_worker_poll_init(struct nvgpu_worker *worker)
 	nvgpu_timeout_init_cpu_timer_sw(worker->g, &nvs_worker->timeout,
 			nvs_worker->current_timeout);
 
-	nvgpu_atomic_set(&nvs_worker->nvs_sched_init, 1);
-	nvgpu_cond_signal(&nvs_worker->worker.wq);
+	nvgpu_atomic_set(&nvs_worker->nvs_sched_state, NVS_WORKER_STATE_RUNNING);
+
+	/* Atomic set() and read() operations do not have implicit barriers */
+	nvgpu_wmb();
+	nvgpu_cond_signal(&nvs_worker->wq_request);
 }
 
 static u32 nvgpu_nvs_worker_wakeup_timeout(struct nvgpu_worker *worker)
@@ -117,7 +120,8 @@ static u64 nvgpu_nvs_tick(struct gk20a *g)
 		nvs_next = sched->shadow_domain->parent;
 	}
 
-	if (nvs_next->priv == domain) {
+
+	if (nvs_next->priv == sched->shadow_domain) {
 		/*
 		 * This entire thread is going to be changed soon.
 		 * The above check ensures that there are no other domain,
@@ -236,7 +240,10 @@ static int nvgpu_nvs_worker_submit(struct gk20a *g, struct nvgpu_runlist *rl,
 		goto fail;
 	}
 
-	nvs_dbg(g, " ");
+	/* Add a barrier here to ensure that the worker thread is interrupted
+	 * before waiting on the condition below.
+	 */
+	nvgpu_mb();
 	ret = NVGPU_COND_WAIT(&work->cond,
 			nvgpu_atomic_read(&work->state) == 1, 0U);
 	if (ret != 0) {
@@ -257,6 +264,59 @@ free_domain:
 	return ret;
 }
 
+static bool nvgpu_nvs_worker_wakeup_condition(struct nvgpu_worker *worker)
+{
+	struct nvgpu_nvs_worker *nvs_worker =
+		nvgpu_nvs_worker_from_worker(worker);
+	struct gk20a *g = worker->g;
+	int nvs_worker_state;
+
+	nvs_worker_state = nvgpu_atomic_read(&nvs_worker->nvs_sched_state);
+
+	if (nvs_worker_state == NVS_WORKER_STATE_SHOULD_RESUME) {
+		/* Set the state to running. The worker will automatically update the timeout
+		 * in the subsequent if block as the previous timeout is 0.
+		 */
+		nvgpu_atomic_set(&nvs_worker->nvs_sched_state, NVS_WORKER_STATE_RUNNING);
+
+		/* Atomic set does not have an implicit barrier.
+		 * Ensure that the value is updated before invoking the signal below.
+ */ + nvgpu_wmb(); + /* Signal waiting threads about resume */ + nvgpu_cond_signal(&nvs_worker->wq_request); + + nvs_dbg(g, "nvs set for resume"); + } else if (nvs_worker_state == NVS_WORKER_STATE_SHOULD_PAUSE) { + return true; + } + + return false; +} + +static void nvgpu_nvs_handle_pause_requests(struct nvgpu_worker *worker) +{ + struct gk20a *g = worker->g; + struct nvgpu_nvs_worker *nvs_worker = + nvgpu_nvs_worker_from_worker(worker); + int nvs_worker_state = nvgpu_atomic_read(&nvs_worker->nvs_sched_state); + + if (nvs_worker_state == NVS_WORKER_STATE_SHOULD_PAUSE) { + nvgpu_atomic_set(&nvs_worker->nvs_sched_state, NVS_WORKER_STATE_PAUSED); + /* Set the worker->timeout to 0, to allow the worker thread to sleep infinitely. */ + nvgpu_timeout_init_cpu_timer_sw(g, &nvs_worker->timeout, 0); + + /* Atomic_Set doesn't have an implicit barrier. + * Ensure, that value is updated before invoking signal below. + */ + nvgpu_wmb(); + /* Wakeup user threads waiting for pause state */ + nvgpu_cond_signal(&nvs_worker->wq_request); + + nvs_dbg(g, "nvs set for pause"); + } +} + static void nvgpu_nvs_worker_wakeup_post_process(struct nvgpu_worker *worker) { struct gk20a *g = worker->g; @@ -274,23 +334,92 @@ static void nvgpu_nvs_worker_wakeup_post_process(struct nvgpu_worker *worker) nvgpu_timeout_init_cpu_timer_sw(g, &nvs_worker->timeout, nvs_worker->current_timeout); } + + nvgpu_nvs_handle_pause_requests(worker); } static const struct nvgpu_worker_ops nvs_worker_ops = { .pre_process = nvgpu_nvs_worker_poll_init, + .wakeup_condition = nvgpu_nvs_worker_wakeup_condition, .wakeup_timeout = nvgpu_nvs_worker_wakeup_timeout, .wakeup_process_item = nvgpu_nvs_worker_wakeup_process_item, .wakeup_post_process = nvgpu_nvs_worker_wakeup_post_process, }; +void nvgpu_nvs_worker_pause(struct gk20a *g) +{ + struct nvgpu_worker *worker = &g->scheduler->worker.worker; + struct nvgpu_nvs_worker *nvs_worker = &g->scheduler->worker; + int nvs_worker_state; + + if (g->is_virtual) { + return; + } + + nvs_worker_state = nvgpu_atomic_cmpxchg(&nvs_worker->nvs_sched_state, + NVS_WORKER_STATE_RUNNING, NVS_WORKER_STATE_SHOULD_PAUSE); + + if (nvs_worker_state == NVS_WORKER_STATE_RUNNING) { + nvs_dbg(g, "Setting thread state to sleep."); + /* wakeup worker forcibly. */ + nvgpu_cond_signal_interruptible(&worker->wq); + + /* Ensure signal has happened before waiting */ + nvgpu_mb(); + + NVGPU_COND_WAIT(&nvs_worker->wq_request, + nvgpu_atomic_read( + &nvs_worker->nvs_sched_state) == NVS_WORKER_STATE_PAUSED, 0); + + nvs_dbg(g, "Thread is paused"); + } else { + nvs_dbg(g, "Thread state is not running."); + } +} + +void nvgpu_nvs_worker_resume(struct gk20a *g) +{ + struct nvgpu_worker *worker = &g->scheduler->worker.worker; + struct nvgpu_nvs_worker *nvs_worker = &g->scheduler->worker; + int nvs_worker_state; + + if (g->is_virtual) { + return; + } + + nvs_worker_state = nvgpu_atomic_cmpxchg(&nvs_worker->nvs_sched_state, + NVS_WORKER_STATE_PAUSED, NVS_WORKER_STATE_SHOULD_RESUME); + + if (nvs_worker_state == NVS_WORKER_STATE_PAUSED) { + nvs_dbg(g, "Waiting for nvs thread to be resumed"); + /* wakeup worker forcibly. 
*/ + nvgpu_cond_signal_interruptible(&worker->wq); + + /* Ensure signal has happened before waiting */ + nvgpu_mb(); + + NVGPU_COND_WAIT(&nvs_worker->wq_request, + nvgpu_atomic_read( + &nvs_worker->nvs_sched_state) == NVS_WORKER_STATE_RUNNING, 0); + + nvs_dbg(g, "Thread resumed"); + } else { + nvs_dbg(g, "Thread not paused"); + } +} + static int nvgpu_nvs_worker_init(struct gk20a *g) { int err = 0; struct nvgpu_worker *worker = &g->scheduler->worker.worker; struct nvgpu_nvs_worker *nvs_worker = &g->scheduler->worker; - nvgpu_cond_init(&nvs_worker->wq_init); - nvgpu_atomic_set(&nvs_worker->nvs_sched_init, 0); + if (g->is_virtual) { + return 0; + } + + nvgpu_cond_init(&nvs_worker->wq_request); + (void)nvgpu_atomic_xchg(&nvs_worker->nvs_sched_state, NVS_WORKER_STATE_STOPPED); nvgpu_worker_init_name(worker, "nvgpu_nvs", g->name); @@ -299,11 +428,15 @@ static int nvgpu_nvs_worker_init(struct gk20a *g) /* Ensure that scheduler thread is started as soon as possible to handle * minimal uptime for applications. */ - err = NVGPU_COND_WAIT(&nvs_worker->worker.wq, - nvgpu_atomic_read(&nvs_worker->nvs_sched_init) == 1, 0); + err = NVGPU_COND_WAIT(&nvs_worker->wq_request, + nvgpu_atomic_read( + &nvs_worker->nvs_sched_state) == NVS_WORKER_STATE_RUNNING, + 0); if (err != 0) { nvgpu_err(g, "Interrupted while waiting for scheduler thread"); } + + nvs_dbg(g, "Thread started"); } return err; @@ -314,10 +447,14 @@ static void nvgpu_nvs_worker_deinit(struct gk20a *g) struct nvgpu_worker *worker = &g->scheduler->worker.worker; struct nvgpu_nvs_worker *nvs_worker = &g->scheduler->worker; + if (g->is_virtual) { + return; + } + nvgpu_worker_deinit(worker); - nvgpu_atomic_set(&nvs_worker->nvs_sched_init, 0); - nvgpu_cond_destroy(&nvs_worker->wq_init); + nvgpu_atomic_set(&nvs_worker->nvs_sched_state, NVS_WORKER_STATE_STOPPED); + nvgpu_cond_destroy(&nvs_worker->wq_request); nvs_dbg(g, "NVS worker suspended"); } @@ -495,7 +632,9 @@ int nvgpu_nvs_open(struct gk20a *g) if (g->scheduler != NULL) { /* resuming from railgate */ - goto unlock; + nvgpu_mutex_release(&g->sched_mutex); + nvgpu_nvs_worker_resume(g); + return err; } g->scheduler = nvgpu_kzalloc(g, sizeof(*g->scheduler)); @@ -525,6 +664,9 @@ int nvgpu_nvs_open(struct gk20a *g) goto unlock; } + /* Ensure all the previous writes are seen */ + nvgpu_wmb(); + err = nvgpu_nvs_gen_shadow_domain(g); if (err != 0) { goto unlock; diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvs.h b/drivers/gpu/nvgpu/include/nvgpu/nvs.h index 2426a1908..c0937d2b5 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvs.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvs.h @@ -125,9 +125,15 @@ struct nvgpu_nvs_domain { struct nvgpu_runlist_domain **rl_domains; }; +#define NVS_WORKER_STATE_STOPPED 0 +#define NVS_WORKER_STATE_RUNNING 1 +#define NVS_WORKER_STATE_SHOULD_PAUSE 2 +#define NVS_WORKER_STATE_PAUSED 3 +#define NVS_WORKER_STATE_SHOULD_RESUME 4 + struct nvgpu_nvs_worker { - nvgpu_atomic_t nvs_sched_init; - struct nvgpu_cond wq_init; + nvgpu_atomic_t nvs_sched_state; + struct nvgpu_cond wq_request; struct nvgpu_worker worker; struct nvgpu_timeout timeout; u32 current_timeout; @@ -248,6 +254,8 @@ const char *nvgpu_nvs_domain_get_name(struct nvgpu_nvs_domain *dom); void nvgpu_nvs_ctrl_fifo_lock_queues(struct gk20a *g); void nvgpu_nvs_ctrl_fifo_unlock_queues(struct gk20a *g); +void nvgpu_nvs_worker_pause(struct gk20a *g); +void nvgpu_nvs_worker_resume(struct gk20a *g); struct nvgpu_nvs_domain_ctrl_fifo *nvgpu_nvs_ctrl_fifo_create(struct gk20a *g); bool nvgpu_nvs_ctrl_fifo_user_exists(struct 
nvgpu_nvs_domain_ctrl_fifo *sched_ctrl, int pid, bool rw); @@ -288,6 +296,18 @@ void nvgpu_nvs_ctrl_fifo_erase_queue(struct gk20a *g, struct nvgpu_nvs_ctrl_queu void nvgpu_nvs_ctrl_fifo_erase_all_queues(struct gk20a *g); #else + + +static inline void nvgpu_nvs_worker_pause(struct gk20a *g) +{ + (void)g; +} + +static inline void nvgpu_nvs_worker_resume(struct gk20a *g) +{ + (void)g; +} + static inline int nvgpu_nvs_init(struct gk20a *g) { (void)g;