diff --git a/drivers/gpu/nvgpu/common/fifo/runlist.c b/drivers/gpu/nvgpu/common/fifo/runlist.c index 56cd4e6c1..6eb86d71e 100644 --- a/drivers/gpu/nvgpu/common/fifo/runlist.c +++ b/drivers/gpu/nvgpu/common/fifo/runlist.c @@ -302,10 +302,10 @@ static u32 nvgpu_runlist_append_flat(struct nvgpu_fifo *f, u32 nvgpu_runlist_construct_locked(struct nvgpu_fifo *f, struct nvgpu_runlist *runlist, - u32 buf_id, + struct nvgpu_runlist_domain *domain, u32 max_entries) { - u32 *runlist_entry_base = runlist->mem[buf_id].cpu_va; + u32 *runlist_entry_base = domain->mem->mem.cpu_va; /* * The entry pointer and capacity counter that live on the stack here @@ -323,6 +323,7 @@ u32 nvgpu_runlist_construct_locked(struct nvgpu_fifo *f, static bool nvgpu_runlist_modify_active_locked(struct gk20a *g, struct nvgpu_runlist *runlist, + struct nvgpu_runlist_domain *domain, struct nvgpu_channel *ch, bool add) { struct nvgpu_tsg *tsg = NULL; @@ -370,29 +371,31 @@ static bool nvgpu_runlist_modify_active_locked(struct gk20a *g, static int nvgpu_runlist_reconstruct_locked(struct gk20a *g, struct nvgpu_runlist *runlist, - u32 buf_id, bool add_entries) + struct nvgpu_runlist_domain *domain, + bool add_entries) { u32 num_entries; struct nvgpu_fifo *f = &g->fifo; rl_dbg(g, "[%u] switch to new buffer 0x%16llx", - runlist->id, (u64)nvgpu_mem_get_addr(g, &runlist->mem[buf_id])); + runlist->id, (u64)nvgpu_mem_get_addr(g, &domain->mem->mem)); if (!add_entries) { - runlist->count = 0; + domain->mem->count = 0; return 0; } - num_entries = nvgpu_runlist_construct_locked(f, runlist, buf_id, + num_entries = nvgpu_runlist_construct_locked(f, runlist, domain, f->num_runlist_entries); if (num_entries == RUNLIST_APPEND_FAILURE) { return -E2BIG; } - runlist->count = num_entries; + + domain->mem->count = num_entries; NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532") NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532") NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532") - WARN_ON(runlist->count > f->num_runlist_entries); + WARN_ON(domain->mem->count > f->num_runlist_entries); NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3)) NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4)) NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6)) @@ -405,11 +408,12 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl, bool wait_for_finish) { int ret = 0; - u32 buf_id; bool add_entries; + struct nvgpu_runlist_mem *mem_tmp; + struct nvgpu_runlist_domain *domain = rl->domain; if (ch != NULL) { - bool update = nvgpu_runlist_modify_active_locked(g, rl, ch, add); + bool update = nvgpu_runlist_modify_active_locked(g, rl, domain, ch, add); if (!update) { /* no change in runlist contents */ return 0; @@ -421,15 +425,29 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl, add_entries = add; } - /* double buffering, swap to next */ - buf_id = (rl->cur_buffer == 0U) ? 1U : 0U; - - ret = nvgpu_runlist_reconstruct_locked(g, rl, buf_id, add_entries); + ret = nvgpu_runlist_reconstruct_locked(g, rl, domain, add_entries); if (ret != 0) { return ret; } - g->ops.runlist.hw_submit(g, rl->id, rl->count, buf_id); + /* + * hw_submit updates mem_hw to hardware; swap the buffers now. mem + * becomes the previously scheduled buffer and it can be modified once + * the runlist lock is released. 
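+ *
+ * As an illustration: if the buffer just rebuilt above is A and the
+ * buffer most recently handed to HW is B, then before the swap
+ * mem == A and mem_hw == B. After the swap, mem_hw == A is what
+ * hw_submit programs below, and mem == B is free to be rewritten by
+ * the next update.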
+ */ + + mem_tmp = domain->mem; + domain->mem = domain->mem_hw; + domain->mem_hw = mem_tmp; + + /* + * A non-active domain may be updated, but for simplicity we still + * submit the currently active one. + * + * TODO: Later on, updates and submits will need to be totally + * decoupled so that submits are done only in the domain scheduler. + */ + g->ops.runlist.hw_submit(g, rl); if (wait_for_finish) { ret = g->ops.runlist.wait_pending(g, rl->id); @@ -446,8 +464,6 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl, } } - rl->cur_buffer = buf_id; - return ret; } @@ -473,8 +489,12 @@ int nvgpu_runlist_reschedule(struct nvgpu_channel *ch, bool preempt_next, g, g->pmu, PMU_MUTEX_ID_FIFO, &token); #endif - g->ops.runlist.hw_submit( - g, runlist->id, runlist->count, runlist->cur_buffer); + /* + * Note that the runlist memory is not rewritten; the currently active + * buffer is just resubmitted so that scheduling begins from the first + * entry in it. + */ + g->ops.runlist.hw_submit(g, runlist); if (preempt_next) { if (g->ops.runlist.reschedule_preempt_next_locked(ch, @@ -633,10 +653,16 @@ void nvgpu_runlist_set_state(struct gk20a *g, u32 runlists_mask, #endif } +static void free_rl_mem(struct gk20a *g, struct nvgpu_runlist_mem *mem) +{ + nvgpu_dma_free(g, &mem->mem); + nvgpu_kfree(g, mem); +} + void nvgpu_runlist_cleanup_sw(struct gk20a *g) { struct nvgpu_fifo *f = &g->fifo; - u32 i, j; + u32 i; struct nvgpu_runlist *runlist; if ((f->runlists == NULL) || (f->active_runlists == NULL)) { @@ -647,8 +673,14 @@ void nvgpu_runlist_cleanup_sw(struct gk20a *g) for (i = 0; i < f->num_runlists; i++) { runlist = &f->active_runlists[i]; - for (j = 0; j < MAX_RUNLIST_BUFFERS; j++) { - nvgpu_dma_free(g, &runlist->mem[j]); + + if (runlist->domain != NULL) { + free_rl_mem(g, runlist->domain->mem); + runlist->domain->mem = NULL; + free_rl_mem(g, runlist->domain->mem_hw); + runlist->domain->mem_hw = NULL; + nvgpu_kfree(g, runlist->domain); + runlist->domain = NULL; } nvgpu_kfree(g, runlist->active_channels); @@ -787,13 +819,61 @@ void nvgpu_runlist_init_enginfo(struct gk20a *g, struct nvgpu_fifo *f) nvgpu_log_fn(g, "done"); } +static struct nvgpu_runlist_mem *init_rl_mem(struct gk20a *g, u32 runlist_size) +{ + struct nvgpu_runlist_mem *mem = nvgpu_kzalloc(g, sizeof(*mem)); + int err; + + if (mem == NULL) { + return NULL; + } + + err = nvgpu_dma_alloc_flags_sys(g, + g->is_virtual ? 
+ 0ULL : NVGPU_DMA_PHYSICALLY_ADDRESSED, + runlist_size, + &mem->mem); + if (err != 0) { + nvgpu_kfree(g, mem); + mem = NULL; + } + + return mem; +} + +static struct nvgpu_runlist_domain *nvgpu_init_rl_domain(struct gk20a *g, u32 runlist_size) +{ + struct nvgpu_runlist_domain *domain = nvgpu_kzalloc(g, sizeof(*domain)); + + if (domain == NULL) { + return NULL; + } + + domain->mem = init_rl_mem(g, runlist_size); + if (domain->mem == NULL) { + goto free_domain; + } + + domain->mem_hw = init_rl_mem(g, runlist_size); + if (domain->mem_hw == NULL) { + goto free_mem; + } + + return domain; +free_mem: + free_rl_mem(g, domain->mem); +free_domain: + nvgpu_kfree(g, domain); + return NULL; +} + static int nvgpu_init_active_runlist_mapping(struct gk20a *g) { struct nvgpu_runlist *runlist; struct nvgpu_fifo *f = &g->fifo; unsigned int runlist_id; size_t runlist_size; - u32 i, j; + u32 i; int err = 0; rl_dbg(g, "Building active runlist map."); @@ -840,26 +920,14 @@ static int nvgpu_init_active_runlist_mapping(struct gk20a *g) rl_dbg(g, " RL entries: %d", f->num_runlist_entries); rl_dbg(g, " RL size %zu", runlist_size); - for (j = 0; j < MAX_RUNLIST_BUFFERS; j++) { - err = nvgpu_dma_alloc_flags_sys(g, - g->is_virtual ? - 0ULL : NVGPU_DMA_PHYSICALLY_ADDRESSED, - runlist_size, - &runlist->mem[j]); - if (err != 0) { - nvgpu_err(g, "memory allocation failed"); - err = -ENOMEM; - goto clean_up_runlist; - } + runlist->domain = nvgpu_init_rl_domain(g, runlist_size); + if (runlist->domain == NULL) { + nvgpu_err(g, "memory allocation failed"); + err = -ENOMEM; + goto clean_up_runlist; } nvgpu_mutex_init(&runlist->runlist_lock); - - /* - * None of buffers is pinned if this value doesn't change. - * Otherwise, one of them (cur_buffer) must have been pinned. - */ - runlist->cur_buffer = MAX_RUNLIST_BUFFERS; } return 0; diff --git a/drivers/gpu/nvgpu/common/vgpu/fifo/runlist_vgpu.c b/drivers/gpu/nvgpu/common/vgpu/fifo/runlist_vgpu.c index 30b9a48df..edf563ec5 100644 --- a/drivers/gpu/nvgpu/common/vgpu/fifo/runlist_vgpu.c +++ b/drivers/gpu/nvgpu/common/vgpu/fifo/runlist_vgpu.c @@ -74,6 +74,7 @@ done: } static bool vgpu_runlist_modify_active_locked(struct gk20a *g, u32 runlist_id, + struct nvgpu_runlist_domain *domain, struct nvgpu_channel *ch, bool add) { struct nvgpu_fifo *f = &g->fifo; @@ -99,6 +100,7 @@ static bool vgpu_runlist_modify_active_locked(struct gk20a *g, u32 runlist_id, } static void vgpu_runlist_reconstruct_locked(struct gk20a *g, u32 runlist_id, + struct nvgpu_runlist_domain *domain, bool add_entries) { struct nvgpu_fifo *f = &g->fifo; @@ -111,7 +113,7 @@ static void vgpu_runlist_reconstruct_locked(struct gk20a *g, u32 runlist_id, u32 count = 0; unsigned long chid; - runlist_entry = runlist->mem[0].cpu_va; + runlist_entry = domain->mem->mem.cpu_va; nvgpu_assert(f->num_channels <= (unsigned int)U16_MAX); for_each_set_bit(chid, @@ -121,9 +123,9 @@ static void vgpu_runlist_reconstruct_locked(struct gk20a *g, u32 runlist_id, count++; } - runlist->count = count; + domain->mem->count = count; } else { - runlist->count = 0; + domain->mem->count = 0; } } @@ -132,14 +134,15 @@ static int vgpu_runlist_update_locked(struct gk20a *g, u32 runlist_id, bool wait_for_finish) { struct nvgpu_fifo *f = &g->fifo; - struct nvgpu_runlist *runlist; + struct nvgpu_runlist *runlist = f->runlists[runlist_id]; + struct nvgpu_runlist_domain *domain = runlist->domain; bool add_entries; nvgpu_log_fn(g, " "); if (ch != NULL) { bool update = vgpu_runlist_modify_active_locked(g, runlist_id, - ch, add); + domain, ch, add); if (!update) 
{ /* no change in runlist contents */ return 0; @@ -151,12 +154,11 @@ static int vgpu_runlist_update_locked(struct gk20a *g, u32 runlist_id, add_entries = add; } - runlist = f->runlists[runlist_id]; - - vgpu_runlist_reconstruct_locked(g, runlist_id, add_entries); + vgpu_runlist_reconstruct_locked(g, runlist_id, domain, add_entries); return vgpu_submit_runlist(g, vgpu_get_handle(g), runlist_id, - runlist->mem[0].cpu_va, runlist->count); + domain->mem->mem.cpu_va, + domain->mem->count); } /* add/remove a channel from runlist diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100.h b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100.h index 192e8d254..062298722 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100.h +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100.h @@ -30,6 +30,5 @@ struct gk20a; u32 ga100_runlist_count_max(struct gk20a *g); -void ga100_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index); +void ga100_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist); #endif /* NVGPU_RUNLIST_FIFO_GA100_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100_fusa.c b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100_fusa.c index 196eb9a22..b461c10b6 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga100_fusa.c @@ -38,24 +38,20 @@ u32 ga100_runlist_count_max(struct gk20a *g) return nvgpu_get_litter_value(g, GPU_LIT_MAX_RUNLISTS_SUPPORTED); } -void ga100_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index) +void ga100_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist) { - struct nvgpu_runlist *runlist = NULL; u64 runlist_iova; u32 runlist_iova_lo, runlist_iova_hi; - runlist = g->fifo.runlists[runlist_id]; - - runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]); + runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem); runlist_iova_lo = u64_lo32(runlist_iova) >> runlist_submit_base_lo_ptr_align_shift_v(); runlist_iova_hi = u64_hi32(runlist_iova); - if (count != 0U) { + if (runlist->domain->mem_hw->count != 0U) { nvgpu_runlist_writel(g, runlist, runlist_submit_base_lo_r(), runlist_submit_base_lo_ptr_lo_f(runlist_iova_lo) | - nvgpu_aperture_mask(g, &runlist->mem[buffer_index], + nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem, runlist_submit_base_lo_target_sys_mem_noncoherent_f(), runlist_submit_base_lo_target_sys_mem_coherent_f(), runlist_submit_base_lo_target_vid_mem_f())); @@ -67,5 +63,5 @@ void ga100_runlist_hw_submit(struct gk20a *g, u32 runlist_id, /* TODO offset in runlist support */ nvgpu_runlist_writel(g, runlist, runlist_submit_r(), runlist_submit_offset_f(0U) | - runlist_submit_length_f(count)); + runlist_submit_length_f(runlist->domain->mem_hw->count)); } diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b.h b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b.h index faf9fe6ca..9de390ff0 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b.h +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b.h @@ -28,11 +28,11 @@ #include struct gk20a; +struct nvgpu_runlist; u32 ga10b_runlist_count_max(struct gk20a *g); u32 ga10b_runlist_length_max(struct gk20a *g); -void ga10b_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index); +void ga10b_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist); int ga10b_runlist_wait_pending(struct gk20a *g, u32 runlist_id); void ga10b_runlist_write_state(struct gk20a *g, u32 runlists_mask, u32 runlist_state); diff --git 
a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b_fusa.c index a097f7d00..f2081ff52 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_ga10b_fusa.c @@ -48,24 +48,20 @@ u32 ga10b_runlist_length_max(struct gk20a *g) return runlist_submit_length_max_v(); } -void ga10b_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index) +void ga10b_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist) { - struct nvgpu_runlist *runlist = NULL; u64 runlist_iova; u32 runlist_iova_lo, runlist_iova_hi; - runlist = g->fifo.runlists[runlist_id]; - - runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]); + runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem); runlist_iova_lo = u64_lo32(runlist_iova) >> runlist_submit_base_lo_ptr_align_shift_v(); runlist_iova_hi = u64_hi32(runlist_iova); - if (count != 0U) { + if (runlist->domain->mem_hw->count != 0U) { nvgpu_runlist_writel(g, runlist, runlist_submit_base_lo_r(), runlist_submit_base_lo_ptr_lo_f(runlist_iova_lo) | - nvgpu_aperture_mask(g, &runlist->mem[buffer_index], + nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem, runlist_submit_base_lo_target_sys_mem_noncoherent_f(), runlist_submit_base_lo_target_sys_mem_coherent_f(), runlist_submit_base_lo_target_vid_mem_f())); @@ -77,7 +73,7 @@ void ga10b_runlist_hw_submit(struct gk20a *g, u32 runlist_id, /* TODO offset in runlist support */ nvgpu_runlist_writel(g, runlist, runlist_submit_r(), runlist_submit_offset_f(0U) | - runlist_submit_length_f(count)); + runlist_submit_length_f(runlist->domain->mem_hw->count)); } int ga10b_runlist_wait_pending(struct gk20a *g, u32 runlist_id) diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.h b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.h index fc149dc32..9d01fcdb3 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.h +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.h @@ -28,6 +28,7 @@ struct nvgpu_channel; struct nvgpu_tsg; struct gk20a; +struct nvgpu_runlist; #ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING int gk20a_runlist_reschedule(struct nvgpu_channel *ch, bool preempt_next); @@ -39,8 +40,7 @@ u32 gk20a_runlist_count_max(struct gk20a *g); #endif u32 gk20a_runlist_length_max(struct gk20a *g); -void gk20a_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index); +void gk20a_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist); int gk20a_runlist_wait_pending(struct gk20a *g, u32 runlist_id); void gk20a_runlist_write_state(struct gk20a *g, u32 runlists_mask, u32 runlist_state); diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a_fusa.c b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a_fusa.c index ed4ffab54..770f709a7 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a_fusa.c @@ -38,29 +38,24 @@ u32 gk20a_runlist_length_max(struct gk20a *g) return fifo_eng_runlist_length_max_v(); } -void gk20a_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index) +void gk20a_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist) { - struct nvgpu_runlist *runlist = NULL; - u64 runlist_iova; + u64 runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem); - runlist = g->fifo.runlists[runlist_id]; - runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]); + nvgpu_spinlock_acquire(&g->fifo.runlist_submit_lock); - 
nvgpu_spinlock_acquire(&g->fifo.runlist_submit_lock); + if (runlist->domain->mem_hw->count != 0U) { + nvgpu_writel(g, fifo_runlist_base_r(), + fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12U)) | + nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem, + fifo_runlist_base_target_sys_mem_ncoh_f(), + fifo_runlist_base_target_sys_mem_coh_f(), + fifo_runlist_base_target_vid_mem_f())); + } - if (count != 0U) { - nvgpu_writel(g, fifo_runlist_base_r(), - fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12U)) | - nvgpu_aperture_mask(g, &runlist->mem[buffer_index], - fifo_runlist_base_target_sys_mem_ncoh_f(), - fifo_runlist_base_target_sys_mem_coh_f(), - fifo_runlist_base_target_vid_mem_f())); - } - - nvgpu_writel(g, fifo_runlist_r(), - fifo_runlist_engine_f(runlist_id) | - fifo_eng_runlist_length_f(count)); + nvgpu_writel(g, fifo_runlist_r(), + fifo_runlist_engine_f(runlist->id) | + fifo_eng_runlist_length_f(runlist->domain->mem_hw->count)); nvgpu_spinlock_release(&g->fifo.runlist_submit_lock); } diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.c b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.c index a167749a3..ec23916ab 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.c +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.c @@ -34,34 +34,31 @@ u32 tu104_runlist_count_max(struct gk20a *g) return fifo_runlist_base_lo__size_1_v(); } -void tu104_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index) +void tu104_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist) { - struct nvgpu_runlist *runlist = NULL; u64 runlist_iova; u32 runlist_iova_lo, runlist_iova_hi; - runlist = g->fifo.runlists[runlist_id]; - runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]); + runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem); runlist_iova_lo = u64_lo32(runlist_iova) >> fifo_runlist_base_lo_ptr_align_shift_v(); runlist_iova_hi = u64_hi32(runlist_iova); - if (count != 0U) { - nvgpu_writel(g, fifo_runlist_base_lo_r(runlist_id), + if (runlist->domain->mem_hw->count != 0U) { + nvgpu_writel(g, fifo_runlist_base_lo_r(runlist->id), fifo_runlist_base_lo_ptr_lo_f(runlist_iova_lo) | - nvgpu_aperture_mask(g, &runlist->mem[buffer_index], + nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem, fifo_runlist_base_lo_target_sys_mem_ncoh_f(), fifo_runlist_base_lo_target_sys_mem_coh_f(), fifo_runlist_base_lo_target_vid_mem_f())); - nvgpu_writel(g, fifo_runlist_base_hi_r(runlist_id), + nvgpu_writel(g, fifo_runlist_base_hi_r(runlist->id), fifo_runlist_base_hi_ptr_hi_f(runlist_iova_hi)); } - nvgpu_writel(g, fifo_runlist_submit_r(runlist_id), - fifo_runlist_submit_length_f(count)); + nvgpu_writel(g, fifo_runlist_submit_r(runlist->id), + fifo_runlist_submit_length_f(runlist->domain->mem_hw->count)); } int tu104_runlist_wait_pending(struct gk20a *g, u32 runlist_id) diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.h b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.h index 595934391..773ff440f 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.h +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_tu104.h @@ -26,10 +26,10 @@ #include struct gk20a; +struct nvgpu_runlist; u32 tu104_runlist_count_max(struct gk20a *g); -void tu104_runlist_hw_submit(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index); +void tu104_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist); int tu104_runlist_wait_pending(struct gk20a *g, u32 runlist_id); #endif /* NVGPU_RUNLIST_FIFO_TU104_H */ diff --git 
a/drivers/gpu/nvgpu/include/nvgpu/gops/runlist.h b/drivers/gpu/nvgpu/include/nvgpu/gops/runlist.h index f3c7ab169..b22e9eb32 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/runlist.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/runlist.h @@ -32,6 +32,7 @@ */ struct gk20a; struct nvgpu_channel; +struct nvgpu_runlist; /** * Runlist HAL operations. */ struct gops_runlist { void (*get_tsg_entry)(struct nvgpu_tsg *tsg, u32 *runlist, u32 timeslice); void (*get_ch_entry)(struct nvgpu_channel *ch, u32 *runlist); - void (*hw_submit)(struct gk20a *g, u32 runlist_id, - u32 count, u32 buffer_index); + void (*hw_submit)(struct gk20a *g, struct nvgpu_runlist *runlist); int (*wait_pending)(struct gk20a *g, u32 runlist_id); void (*write_state)(struct gk20a *g, u32 runlists_mask, u32 runlist_state); diff --git a/drivers/gpu/nvgpu/include/nvgpu/runlist.h b/drivers/gpu/nvgpu/include/nvgpu/runlist.h index d296a6f6a..ecd857d97 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/runlist.h +++ b/drivers/gpu/nvgpu/include/nvgpu/runlist.h @@ -75,31 +75,50 @@ struct nvgpu_pbdma_info; /** Enable runlist. */ #define RUNLIST_ENABLED 1U -/** Double buffering is used to build runlists */ -#define MAX_RUNLIST_BUFFERS 2U - /** Runlist identifier is invalid. */ #define NVGPU_INVALID_RUNLIST_ID U32_MAX +/* + * Updates to this memory are still serialized by the runlist lock. + * + * TODO: add a mutex when domain updates get more fine-grained. The buffers in + * nvgpu_runlist_domain are deliberately pointer members to make this easier in + * the future; the buffers may get juggled around. + */ +struct nvgpu_runlist_mem { + /** Rendered runlist memory suitable for HW. */ + struct nvgpu_mem mem; + + /** Number of entries written in the buffer. */ + u32 count; +}; + +struct nvgpu_runlist_domain { + /** Runlist buffer that SW is free to modify. Swapped with mem_hw on the next load. */ + struct nvgpu_runlist_mem *mem; + + /** Currently active buffer, as submitted to hardware. */ + struct nvgpu_runlist_mem *mem_hw; +}; + struct nvgpu_runlist { - /** Runlist identifier. */ + /** Runlist identifier. The HW has designated RL IDs that are bound to engines. */ u32 id; + /** Bitmap of active channels in the runlist. One bit per chid. */ unsigned long *active_channels; /** Bitmap of active TSGs in the runlist. One bit per tsgid. */ unsigned long *active_tsgs; - /** Runlist buffers. Double buffering is used for each engine. */ - struct nvgpu_mem mem[MAX_RUNLIST_BUFFERS]; - /** Indicates current runlist buffer used by HW. */ - u32 cur_buffer; + + /* The default domain is the only one that currently exists. */ + struct nvgpu_runlist_domain *domain; + /** Bitmask of PBDMAs supported for this runlist. */ u32 pbdma_bitmask; /** Bitmask of engines using this runlist. */ u32 eng_bitmask; /** Bitmask of engines to be reset during recovery. */ u32 reset_eng_bitmask; - /** Cached hw_submit parameter. */ - u32 count; /** Protect ch/tsg/runlist preempt & runlist update. 
*/ struct nvgpu_mutex runlist_lock; @@ -139,7 +158,8 @@ struct nvgpu_runlist { */ u32 nvgpu_runlist_construct_locked(struct nvgpu_fifo *f, struct nvgpu_runlist *runlist, - u32 buf_id, u32 max_entries); + struct nvgpu_runlist_domain *domain, + u32 max_entries); /** * @brief Add/remove channel to/from runlist (locked) diff --git a/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c b/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c index 0eab01cf2..69904003c 100644 --- a/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c +++ b/userspace/units/fifo/runlist/gk20a/nvgpu-runlist-gk20a.c @@ -75,17 +75,19 @@ done: int test_gk20a_runlist_hw_submit(struct unit_module *m, struct gk20a *g, void *args) { + struct nvgpu_fifo *f = &g->fifo; int ret = UNIT_FAIL; u32 runlist_id = nvgpu_engine_get_gr_runlist_id(g); u32 count; - u32 buffer_index = 0; for (count = 0; count < 2; count++) { nvgpu_writel(g, fifo_runlist_r(), 0); nvgpu_writel(g, fifo_runlist_base_r(), 0); - gk20a_runlist_hw_submit(g, runlist_id, count, buffer_index); + f->runlists[runlist_id]->domain->mem_hw->count = count; + + gk20a_runlist_hw_submit(g, f->runlists[runlist_id]); if (count == 0) { unit_assert(nvgpu_readl(g, fifo_runlist_base_r()) == 0, goto done);
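For reference, the rebuild/swap/submit flow that this patch introduces can be sketched outside the driver as a small self-contained C program. The mem/mem_hw roles and the order of operations mirror nvgpu_runlist_update_locked() and hw_submit() above, but everything else is an illustrative assumption: the demo_* names, the fixed-size calloc'd entry arrays, and the printf stand-in for the real hardware submit are not nvgpu APIs.

/* Minimal sketch (assumed names/types) of the mem/mem_hw double buffering. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_runlist_mem {
	uint32_t *entries;	/* stands in for nvgpu_runlist_mem::mem (cpu_va) */
	uint32_t count;		/* number of valid entries, as in the patch */
};

struct demo_runlist_domain {
	struct demo_runlist_mem *mem;		/* SW-owned buffer, free to rewrite */
	struct demo_runlist_mem *mem_hw;	/* buffer last handed to "HW" */
};

/* Rebuild the SW buffer; stands in for nvgpu_runlist_reconstruct_locked(). */
static void demo_reconstruct(struct demo_runlist_domain *d, uint32_t n)
{
	for (uint32_t i = 0; i < n; i++)
		d->mem->entries[i] = i;		/* fake runlist entries */
	d->mem->count = n;
}

/* Stands in for g->ops.runlist.hw_submit(): reads mem_hw only. */
static void demo_hw_submit(const struct demo_runlist_domain *d)
{
	printf("submit %u entries from buffer %p\n",
	       (unsigned)d->mem_hw->count, (void *)d->mem_hw->entries);
}

/* Mirrors the rebuild -> swap -> submit order of nvgpu_runlist_update_locked(). */
static void demo_update(struct demo_runlist_domain *d, uint32_t n)
{
	struct demo_runlist_mem *tmp;

	demo_reconstruct(d, n);

	tmp = d->mem;		/* freshly built buffer becomes the HW buffer */
	d->mem = d->mem_hw;	/* previously submitted buffer becomes writable */
	d->mem_hw = tmp;

	demo_hw_submit(d);
}

int main(void)
{
	/* Error checking omitted for brevity. */
	struct demo_runlist_mem a = { calloc(8, sizeof(uint32_t)), 0 };
	struct demo_runlist_mem b = { calloc(8, sizeof(uint32_t)), 0 };
	struct demo_runlist_domain d = { &a, &b };

	demo_update(&d, 3);	/* first submit is served from buffer 'a' */
	demo_update(&d, 5);	/* second submit is served from buffer 'b' */

	free(a.entries);
	free(b.entries);
	return 0;
}

The point the sketch makes is the one the patch relies on: hw_submit() only ever dereferences mem_hw, so the buffer that rotates back into mem can be rewritten for the next update without touching what the hardware was last given.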