diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index d0d33d110..3556d3ea5 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -4160,6 +4160,21 @@ long gk20a_channel_ioctl(struct file *filp,
 		err = gk20a_fifo_preempt(ch->g, ch);
 		gk20a_idle(g);
 		break;
+	case NVGPU_IOCTL_CHANNEL_PREEMPT_NEXT:
+		if (!capable(CAP_SYS_NICE))
+			return -EPERM;
+		if (!ch->g->ops.fifo.reschedule_preempt_next)
+			return -ENOSYS;
+		err = gk20a_busy(ch->g);
+		if (err) {
+			dev_err(dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+		err = ch->g->ops.fifo.reschedule_preempt_next(ch);
+		gk20a_idle(ch->g);
+		break;
 	case NVGPU_IOCTL_CHANNEL_FORCE_RESET:
 		err = gk20a_busy(g);
 		if (err) {
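Reviewer note: the new ioctl is argument-less (`_IO`), gated on `CAP_SYS_NICE`, and returns -ENOSYS where the `reschedule_preempt_next` op is not wired up (e.g. the vgpu config later in this patch). Below is a minimal userspace sketch of calling it; how the channel fd is obtained (the nvgpu channel-open flow) is an assumption for illustration, not part of this patch.

```c
/* Sketch: driving NVGPU_IOCTL_CHANNEL_PREEMPT_NEXT from userspace.
 * Assumes an already-open nvgpu channel fd; obtaining that fd is
 * outside the scope of this patch.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int channel_preempt_next(int channel_fd)
{
	if (ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_PREEMPT_NEXT) == 0)
		return 0;
	/*
	 * EPERM:  caller lacks CAP_SYS_NICE
	 * ENOSYS: chip config has no reschedule_preempt_next op (e.g. vgpu)
	 * EBUSY:  runlist mutex contended; an update is already in flight
	 */
	fprintf(stderr, "PREEMPT_NEXT failed: %s\n", strerror(errno));
	return -errno;
}
```

Treating -EBUSY as benign is reasonable here: per the driver comment, it means another thread is already rewriting the runlist registers.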
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index d738e6051..22c05c007 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -41,11 +41,13 @@
 #include
 
 #define FECS_METHOD_WFI_RESTORE 0x80000
+#define FECS_MAILBOX_0_ACK_RESTORE 0x4
 
 static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    u32 hw_chid, bool add,
 					    bool wait_for_finish);
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
+static int gk20a_fifo_is_preempt_pending(struct gk20a *g);
 
 #ifdef CONFIG_DEBUG_FS
 static void __gk20a_fifo_profile_free(struct kref *ref);
@@ -2405,10 +2407,30 @@ void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg)
 			fifo_preempt_type_channel_f());
 }
 
-static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
+static int gk20a_fifo_is_preempt_pending(struct gk20a *g)
 {
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
+	int ret = -EBUSY;
+
+	nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
+			   NVGPU_TIMER_CPU_TIMER);
+	do {
+		if (!(gk20a_readl(g, fifo_preempt_r()) &
+				fifo_preempt_pending_true_f())) {
+			ret = 0;
+			break;
+		}
+
+		usleep_range(delay, delay * 2);
+		delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+	} while (!nvgpu_timeout_expired_msg(&timeout, "preempt timeout"));
+
+	return ret;
+}
+
+static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
+{
 	u32 ret = 0;
 
 	gk20a_dbg_fn("%d", id);
@@ -2418,19 +2440,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 	gk20a_dbg_fn("%d", id);
 
 	/* wait for preempt */
-	ret = -EBUSY;
-	nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
-			   NVGPU_TIMER_CPU_TIMER);
-	do {
-		if (!(gk20a_readl(g, fifo_preempt_r()) &
-				fifo_preempt_pending_true_f())) {
-			ret = 0;
-			break;
-		}
-
-		usleep_range(delay, delay * 2);
-		delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
+	ret = gk20a_fifo_is_preempt_pending(g);
 
 	gk20a_dbg_fn("%d", id);
 	if (ret) {
@@ -3011,16 +3021,18 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 		count = 0;
 
 	if (count != 0) {
-		gk20a_writel(g, fifo_runlist_base_r(),
+		runlist->runlist_base_r =
 			fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12)) |
 			gk20a_aperture_mask(g, &runlist->mem[new_buf],
 				fifo_runlist_base_target_sys_mem_ncoh_f(),
-				fifo_runlist_base_target_vid_mem_f()));
+				fifo_runlist_base_target_vid_mem_f());
+		gk20a_writel(g, fifo_runlist_base_r(), runlist->runlist_base_r);
 	}
 
-	gk20a_writel(g, fifo_runlist_r(),
+	runlist->runlist_r =
 		fifo_runlist_engine_f(runlist_id) |
-		fifo_eng_runlist_length_f(count));
+		fifo_eng_runlist_length_f(count);
+	gk20a_writel(g, fifo_runlist_r(), runlist->runlist_r);
 
 	if (wait_for_finish) {
 		ret = gk20a_fifo_runlist_wait_pending(g, runlist_id);
@@ -3089,8 +3101,8 @@ int gk20a_fifo_reschedule_runlist(struct gk20a *g, u32 runlist_id)
 	mutex_ret = pmu_mutex_acquire(
 		&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	gk20a_writel(g, fifo_runlist_r(),
-		gk20a_readl(g, fifo_runlist_r()));
+	gk20a_writel(g, fifo_runlist_base_r(), runlist->runlist_base_r);
+	gk20a_writel(g, fifo_runlist_r(), runlist->runlist_r);
 	gk20a_fifo_runlist_wait_pending(g, runlist_id);
 
 	if (!mutex_ret)
@@ -3104,6 +3116,79 @@ int gk20a_fifo_reschedule_runlist(struct gk20a *g, u32 runlist_id)
 	return ret;
 }
 
+/* trigger host preempt of the GR engine's pending (next) context if it is not ch's */
+int gk20a_fifo_reschedule_preempt_next(struct channel_gk20a *ch)
+{
+	struct gk20a *g = ch->g;
+	struct fifo_runlist_info_gk20a *runlist;
+	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+	u32 mutex_ret;
+	int ret = 0;
+	u32 gr_eng_id = 0;
+
+	runlist = &g->fifo.runlist_info[ch->runlist_id];
+	if (1 != gk20a_fifo_get_engine_ids(g, &gr_eng_id, 1, ENGINE_GR_GK20A))
+		return 0;
+	if (!(runlist->eng_bitmask & (1 << gr_eng_id)))
+		return 0;
+
+	if (!nvgpu_mutex_tryacquire(&runlist->mutex))
+		return -EBUSY; /* someone else is updating the runlist, so no preempt is needed here */
+	mutex_ret = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+	do {
+		u32 engstat, ctxstat, fecsstat0, fecsstat1;
+		s32 preempt_id = -1;
+		u32 preempt_type = 0;
+		bool same_ctx;
+
+		if (gk20a_readl(g, fifo_preempt_r()) &
+				fifo_preempt_pending_true_f())
+			break;
+
+		fecsstat0 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0));
+		engstat = gk20a_readl(g, fifo_engine_status_r(gr_eng_id));
+		ctxstat = fifo_engine_status_ctx_status_v(engstat);
+		if (ctxstat ==
+				fifo_engine_status_ctx_status_ctxsw_switch_v()) {
+			/* host is switching to a new context; preempt it if needed */
+			preempt_id = fifo_engine_status_next_id_v(engstat);
+			preempt_type = fifo_engine_status_next_id_type_v(engstat);
+		} else {
+			break;
+		}
+		if (gk20a_is_channel_marked_as_tsg(ch))
+			same_ctx = (preempt_id == ch->tsgid && preempt_type);
+		else
+			same_ctx = (preempt_id == ch->hw_chid &&
+				!preempt_type);
+		if (same_ctx)
+			break;
+		fecsstat1 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0));
+		if (fecsstat0 != FECS_MAILBOX_0_ACK_RESTORE ||
+				fecsstat1 != FECS_MAILBOX_0_ACK_RESTORE)
+			break; /* preempt is pointless once FECS has acked the save and started the restore */
+
+		gk20a_fifo_issue_preempt(g, preempt_id, preempt_type);
+
+		trace_gk20a_reschedule_preempt_next(ch->hw_chid, fecsstat0,
+			engstat, fecsstat1,
+			gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0)),
+			gk20a_readl(g, fifo_preempt_r()));
+
+		gk20a_fifo_is_preempt_pending(g);
+
+		trace_gk20a_reschedule_preempted_next(ch->hw_chid);
+	} while (false);
+
+	if (!mutex_ret)
+		pmu_mutex_release(
+			&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+	nvgpu_mutex_release(&runlist->mutex);
+
+	return ret;
+}
+
 /* add/remove a channel from runlist
    special cases below: runlist->active_channels will NOT be changed.
    (hw_chid == ~0 && !add) means remove all active channels from runlist.
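Reviewer note: two things happen in this file. First, the preempt-completion poll is factored out of `__locked_fifo_preempt()` into `gk20a_fifo_is_preempt_pending()` so the new reschedule path can reuse it; the helper polls `fifo_preempt_r()` with exponential backoff (`delay` doubles up to GR_IDLE_CHECK_MAX) under a CPU timer. Second, the last-written `fifo_runlist_base_r`/`fifo_runlist_r` values are cached in the runlist struct so `gk20a_fifo_reschedule_runlist()` can resubmit them rather than relying on a register read-back. A standalone sketch of the poll-with-backoff shape follows; every name in it is an illustrative stand-in, not nvgpu API:

```c
/* Sketch of the poll-with-exponential-backoff pattern used by
 * gk20a_fifo_is_preempt_pending(): re-check a "pending" bit, sleeping a
 * little longer each round, until it clears or a deadline passes.
 * poll_pending(), sleep_us() and now_ms() are hypothetical stand-ins.
 */
#include <stdbool.h>

#define DELAY_MIN_US 10U   /* plays the role of GR_IDLE_CHECK_DEFAULT */
#define DELAY_MAX_US 200U  /* plays the role of GR_IDLE_CHECK_MAX */

bool poll_pending(void);         /* e.g. fifo_preempt_r() pending bit */
void sleep_us(unsigned int us);  /* e.g. usleep_range(d, d * 2) */
unsigned long now_ms(void);

int wait_preempt_done(unsigned long timeout_ms)
{
	unsigned long deadline = now_ms() + timeout_ms;
	unsigned int delay = DELAY_MIN_US;

	do {
		if (!poll_pending())
			return 0;        /* preempt completed */
		sleep_us(delay);
		delay = (delay * 2 > DELAY_MAX_US) ? DELAY_MAX_US : delay * 2;
	} while (now_ms() < deadline);

	return -1;                       /* timed out, still pending */
}
```

The backoff keeps the common fast case cheap (first read usually succeeds) while bounding register traffic when the preempt drags on.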
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 03c93a352..e0e57645a 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -52,6 +52,8 @@ struct fifo_runlist_info_gk20a {
 	u32 pbdma_bitmask;      /* pbdmas supported for this runlist*/
 	u32 eng_bitmask;        /* engines using this runlist */
 	u32 reset_eng_bitmask;  /* engines to be reset during recovery */
+	u32 runlist_base_r;     /* cached runlist_base_r */
+	u32 runlist_r;          /* cached runlist_r */
 	bool stopped;
 	bool support_tsg;
 	struct nvgpu_mutex mutex; /* protect channel preempt and runlist update */
@@ -219,6 +221,7 @@ int gk20a_fifo_nonstall_isr(struct gk20a *g);
 int gk20a_fifo_preempt_channel(struct gk20a *g, u32 hw_chid);
 int gk20a_fifo_preempt_tsg(struct gk20a *g, u32 tsgid);
 int gk20a_fifo_preempt(struct gk20a *g, struct channel_gk20a *ch);
+int gk20a_fifo_preempt_next(struct gk20a *g, struct channel_gk20a *ch);
 
 int gk20a_fifo_enable_engine_activity(struct gk20a *g,
 			struct fifo_engine_info_gk20a *eng_info);
@@ -231,6 +234,7 @@ int gk20a_fifo_disable_all_engine_activity(struct gk20a *g,
 u32 gk20a_fifo_engines_on_ch(struct gk20a *g, u32 hw_chid);
 
 int gk20a_fifo_reschedule_runlist(struct gk20a *g, u32 runlist_id);
+int gk20a_fifo_reschedule_preempt_next(struct channel_gk20a *ch);
 
 int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 hw_chid,
 			      bool add, bool wait_for_finish);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 50180551d..33b7b10f4 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -405,6 +405,7 @@ struct gpu_ops {
 		int (*preempt_channel)(struct gk20a *g, u32 hw_chid);
 		int (*preempt_tsg)(struct gk20a *g, u32 tsgid);
 		int (*reschedule_runlist)(struct gk20a *g, u32 runlist_id);
+		int (*reschedule_preempt_next)(struct channel_gk20a *ch);
 		int (*update_runlist)(struct gk20a *g, u32 runlist_id,
 			u32 hw_chid, bool add,
 			bool wait_for_finish);
diff --git a/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c b/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
index 2880244a5..1e0fcc9b6 100644
--- a/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
@@ -241,4 +241,5 @@ void gp10b_init_fifo(struct gpu_ops *gops)
 	gops->fifo.eng_runlist_base_size = fifo_eng_runlist_base__size_1_v;
 	gops->fifo.device_info_fault_id = top_device_info_data_fault_id_enum_v;
 	gops->fifo.reschedule_runlist = gk20a_fifo_reschedule_runlist;
+	gops->fifo.reschedule_preempt_next = gk20a_fifo_reschedule_preempt_next;
 }
diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_fifo_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_fifo_gp10b.c
index 7053739cc..6cf4286d9 100644
--- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_fifo_gp10b.c
+++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_fifo_gp10b.c
@@ -46,4 +46,5 @@ void vgpu_gp10b_init_fifo_ops(struct gpu_ops *gops)
 	gops->fifo.init_engine_info = vgpu_gp10b_fifo_init_engine_info;
 	gops->fifo.resetup_ramfc = NULL;
 	gops->fifo.reschedule_runlist = NULL;
+	gops->fifo.reschedule_preempt_next = NULL;
 }
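Reviewer note: the entry point is published through the per-chip `gpu_ops` HAL table: gp10b points it at the gk20a implementation, while the virtualized gp10b config leaves it NULL, which is what the ioctl's -ENOSYS check keys off. A minimal sketch of that dispatch pattern, with simplified stand-in types rather than the driver's real structs:

```c
/* Sketch of the gpu_ops function-pointer dispatch used above, with
 * stand-in types. "struct channel" is hypothetical shorthand for
 * struct channel_gk20a.
 */
#include <errno.h>
#include <stddef.h>

struct channel;

struct fifo_ops {
	int (*reschedule_preempt_next)(struct channel *ch);
};

/* Per-chip init fills the pointer (gp10b) or leaves it NULL (vgpu),
 * so callers probe before dispatching. */
static int do_preempt_next(const struct fifo_ops *ops, struct channel *ch)
{
	if (!ops->reschedule_preempt_next)
		return -ENOSYS;
	return ops->reschedule_preempt_next(ch);
}
```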
diff --git a/include/trace/events/gk20a.h b/include/trace/events/gk20a.h
index a6939833a..d02ac22af 100644
--- a/include/trace/events/gk20a.h
+++ b/include/trace/events/gk20a.h
@@ -354,6 +354,51 @@ TRACE_EVENT(gk20a_channel_submitted_gpfifo,
 		  __entry->flags, __entry->incr_id, __entry->incr_value)
 );
 
+TRACE_EVENT(gk20a_reschedule_preempt_next,
+	TP_PROTO(u32 chid, u32 fecs0, u32 engstat, u32 fecs1, u32 fecs2,
+		u32 preempt),
+
+	TP_ARGS(chid, fecs0, engstat, fecs1, fecs2, preempt),
+
+	TP_STRUCT__entry(
+		__field(u32, chid)
+		__field(u32, fecs0)
+		__field(u32, engstat)
+		__field(u32, fecs1)
+		__field(u32, fecs2)
+		__field(u32, preempt)
+	),
+
+	TP_fast_assign(
+		__entry->chid = chid;
+		__entry->fecs0 = fecs0;
+		__entry->engstat = engstat;
+		__entry->fecs1 = fecs1;
+		__entry->fecs2 = fecs2;
+		__entry->preempt = preempt;
+	),
+
+	TP_printk("chid=%d, fecs0=%#x, engstat=%#x, fecs1=%#x, fecs2=%#x,"
+		" preempt=%#x", __entry->chid, __entry->fecs0, __entry->engstat,
+		__entry->fecs1, __entry->fecs2, __entry->preempt)
+);
+
+TRACE_EVENT(gk20a_reschedule_preempted_next,
+	TP_PROTO(u32 chid),
+
+	TP_ARGS(chid),
+
+	TP_STRUCT__entry(
+		__field(u32, chid)
+	),
+
+	TP_fast_assign(
+		__entry->chid = chid;
+	),
+
+	TP_printk("chid=%d", __entry->chid)
+);
+
 TRACE_EVENT(gk20a_channel_reset,
 	TP_PROTO(u32 hw_chid, u32 tsgid),
 
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index fd3c1a578..5f02a3189 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -1637,9 +1637,11 @@ struct nvgpu_boosted_ctx_args {
 	_IOW(NVGPU_IOCTL_MAGIC, 123, struct nvgpu_alloc_gpfifo_ex_args)
 #define NVGPU_IOCTL_CHANNEL_SET_BOOSTED_CTX \
 	_IOW(NVGPU_IOCTL_MAGIC, 124, struct nvgpu_boosted_ctx_args)
+#define NVGPU_IOCTL_CHANNEL_PREEMPT_NEXT \
+	_IO(NVGPU_IOCTL_MAGIC, 126)
 
 #define NVGPU_IOCTL_CHANNEL_LAST \
-	_IOC_NR(NVGPU_IOCTL_CHANNEL_SET_BOOSTED_CTX)
+	_IOC_NR(NVGPU_IOCTL_CHANNEL_PREEMPT_NEXT)
 
 #define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_alloc_gpfifo_ex_args)
 
 /*
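Reviewer note: bumping `NVGPU_IOCTL_CHANNEL_LAST` is not cosmetic; the channel ioctl handler range-checks incoming command numbers, so without it the new ioctl would be rejected before ever reaching its `case`. A sketch of that validation, mirroring the usual nvgpu pattern (the actual check sits at the top of `gk20a_channel_ioctl()`):

```c
/* Sketch: why NVGPU_IOCTL_CHANNEL_LAST must track the highest command.
 * _IOC_NR(NVGPU_IOCTL_CHANNEL_PREEMPT_NEXT) is 126; if LAST still named
 * SET_BOOSTED_CTX (124), this filter would return -EINVAL for the new cmd.
 */
#include <errno.h>
#include <linux/ioctl.h>
#include <linux/nvgpu.h>

static int channel_cmd_valid(unsigned int cmd)
{
	if ((_IOC_TYPE(cmd) != NVGPU_IOCTL_MAGIC) ||
	    (_IOC_NR(cmd) == 0) ||
	    (_IOC_NR(cmd) > NVGPU_IOCTL_CHANNEL_LAST) ||
	    (_IOC_SIZE(cmd) > NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE))
		return -EINVAL;
	return 0;
}
```

Since `_IO()` encodes a zero-size argument, the new command also passes the `_IOC_SIZE` bound without touching `NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE`.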