gpu: nvgpu: introduce internal runlist domain

The current runlist code assumes a single runlist buffer to hold all TSG
and channel entries. Create separate RL domain and domain memory types
to hold data that is related only to a scheduling domain and not
directly to the runlist hardware; in the future, more than one domain
may exist, with one of them enabled at a time.

The domain is used only internally by the runlist code at this point and
is functionally equivalent to the current runlist memory that houses the
round robin entries.
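
In outline, the new types look like this (the full definitions with
comments are in the runlist header diff below):

	/* Rendered runlist memory plus the number of entries written in it. */
	struct nvgpu_runlist_mem {
		struct nvgpu_mem mem;
		u32 count;
	};

	/* One scheduling domain: a buffer free to edit in SW, and the one
	 * currently submitted to HW. */
	struct nvgpu_runlist_domain {
		struct nvgpu_runlist_mem *mem;
		struct nvgpu_runlist_mem *mem_hw;
	};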

The double buffering is still kept, although multiple domains might
benefit from some cleverness here. Although any number of created
domains may be edited at runtime, only one runlist memory is accessed by
the hardware at a time, so sharing buffers to save some contiguous
memory should be considered an opportunity for future optimization.
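
As a simplified sketch of the new flow in nvgpu_runlist_update_locked
(see the diff below), the update path renders into domain->mem, swaps
the two buffer pointers, and then submits domain->mem_hw:

	struct nvgpu_runlist_domain *domain = rl->domain;
	struct nvgpu_runlist_mem *mem_tmp;

	/* Render the new entries into the CPU-side buffer. */
	ret = nvgpu_runlist_reconstruct_locked(g, rl, domain, add_entries);
	if (ret != 0) {
		return ret;
	}

	/* Swap: the freshly written buffer becomes mem_hw; the previously
	 * scheduled one becomes mem and may be edited again once the
	 * runlist lock is released. */
	mem_tmp = domain->mem;
	domain->mem = domain->mem_hw;
	domain->mem_hw = mem_tmp;

	/* Point the hardware at domain->mem_hw. */
	g->ops.runlist.hw_submit(g, rl);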

Jira NVGPU-6425

Change-Id: Id99c55f058ad56daa48b732240f05b3195debfb1
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2618386
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

@@ -302,10 +302,10 @@ static u32 nvgpu_runlist_append_flat(struct nvgpu_fifo *f,
u32 nvgpu_runlist_construct_locked(struct nvgpu_fifo *f,
struct nvgpu_runlist *runlist,
u32 buf_id,
struct nvgpu_runlist_domain *domain,
u32 max_entries)
{
u32 *runlist_entry_base = runlist->mem[buf_id].cpu_va;
u32 *runlist_entry_base = domain->mem->mem.cpu_va;
/*
* The entry pointer and capacity counter that live on the stack here
@@ -323,6 +323,7 @@ u32 nvgpu_runlist_construct_locked(struct nvgpu_fifo *f,
static bool nvgpu_runlist_modify_active_locked(struct gk20a *g,
struct nvgpu_runlist *runlist,
struct nvgpu_runlist_domain *domain,
struct nvgpu_channel *ch, bool add)
{
struct nvgpu_tsg *tsg = NULL;
@@ -370,29 +371,31 @@ static bool nvgpu_runlist_modify_active_locked(struct gk20a *g,
static int nvgpu_runlist_reconstruct_locked(struct gk20a *g,
struct nvgpu_runlist *runlist,
u32 buf_id, bool add_entries)
struct nvgpu_runlist_domain *domain,
bool add_entries)
{
u32 num_entries;
struct nvgpu_fifo *f = &g->fifo;
rl_dbg(g, "[%u] switch to new buffer 0x%16llx",
runlist->id, (u64)nvgpu_mem_get_addr(g, &runlist->mem[buf_id]));
runlist->id, (u64)nvgpu_mem_get_addr(g, &domain->mem->mem));
if (!add_entries) {
runlist->count = 0;
domain->mem->count = 0;
return 0;
}
num_entries = nvgpu_runlist_construct_locked(f, runlist, buf_id,
num_entries = nvgpu_runlist_construct_locked(f, runlist, domain,
f->num_runlist_entries);
if (num_entries == RUNLIST_APPEND_FAILURE) {
return -E2BIG;
}
runlist->count = num_entries;
domain->mem->count = num_entries;
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON(runlist->count > f->num_runlist_entries);
WARN_ON(domain->mem->count > f->num_runlist_entries);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
@@ -405,11 +408,12 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl,
bool wait_for_finish)
{
int ret = 0;
u32 buf_id;
bool add_entries;
struct nvgpu_runlist_mem *mem_tmp;
struct nvgpu_runlist_domain *domain = rl->domain;
if (ch != NULL) {
bool update = nvgpu_runlist_modify_active_locked(g, rl, ch, add);
bool update = nvgpu_runlist_modify_active_locked(g, rl, domain, ch, add);
if (!update) {
/* no change in runlist contents */
return 0;
@@ -421,15 +425,29 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl,
add_entries = add;
}
/* double buffering, swap to next */
buf_id = (rl->cur_buffer == 0U) ? 1U : 0U;
ret = nvgpu_runlist_reconstruct_locked(g, rl, buf_id, add_entries);
ret = nvgpu_runlist_reconstruct_locked(g, rl, domain, add_entries);
if (ret != 0) {
return ret;
}
g->ops.runlist.hw_submit(g, rl->id, rl->count, buf_id);
/*
* hw_submit updates mem_hw to hardware; swap the buffers now. mem
* becomes the previously scheduled buffer and it can be modified once
* the runlist lock is released.
*/
mem_tmp = domain->mem;
domain->mem = domain->mem_hw;
domain->mem_hw = mem_tmp;
/*
* A non-active domain may be updated, but submit still the currently
* active one just for simplicity.
*
* TODO: Later on, updates and submits will need to be totally
* decoupled so that submits are done only in the domain scheduler.
*/
g->ops.runlist.hw_submit(g, rl);
if (wait_for_finish) {
ret = g->ops.runlist.wait_pending(g, rl->id);
@@ -446,8 +464,6 @@ int nvgpu_runlist_update_locked(struct gk20a *g, struct nvgpu_runlist *rl,
}
}
rl->cur_buffer = buf_id;
return ret;
}
@@ -473,8 +489,12 @@ int nvgpu_runlist_reschedule(struct nvgpu_channel *ch, bool preempt_next,
g, g->pmu, PMU_MUTEX_ID_FIFO, &token);
#endif
g->ops.runlist.hw_submit(
g, runlist->id, runlist->count, runlist->cur_buffer);
/*
* Note that the runlist memory is not rewritten; the currently active
* buffer is just resubmitted so that scheduling begins from the first
* entry in it.
*/
g->ops.runlist.hw_submit(g, runlist);
if (preempt_next) {
if (g->ops.runlist.reschedule_preempt_next_locked(ch,
@@ -633,10 +653,16 @@ void nvgpu_runlist_set_state(struct gk20a *g, u32 runlists_mask,
#endif
}
static void free_rl_mem(struct gk20a *g, struct nvgpu_runlist_mem *mem)
{
nvgpu_dma_free(g, &mem->mem);
nvgpu_kfree(g, mem);
}
void nvgpu_runlist_cleanup_sw(struct gk20a *g)
{
struct nvgpu_fifo *f = &g->fifo;
u32 i, j;
u32 i;
struct nvgpu_runlist *runlist;
if ((f->runlists == NULL) || (f->active_runlists == NULL)) {
@@ -647,8 +673,14 @@ void nvgpu_runlist_cleanup_sw(struct gk20a *g)
for (i = 0; i < f->num_runlists; i++) {
runlist = &f->active_runlists[i];
for (j = 0; j < MAX_RUNLIST_BUFFERS; j++) {
nvgpu_dma_free(g, &runlist->mem[j]);
if (runlist->domain != NULL) {
free_rl_mem(g, runlist->domain->mem);
runlist->domain->mem = NULL;
free_rl_mem(g, runlist->domain->mem_hw);
runlist->domain->mem_hw = NULL;
nvgpu_kfree(g, runlist->domain);
runlist->domain = NULL;
}
nvgpu_kfree(g, runlist->active_channels);
@@ -787,13 +819,61 @@ void nvgpu_runlist_init_enginfo(struct gk20a *g, struct nvgpu_fifo *f)
nvgpu_log_fn(g, "done");
}
static struct nvgpu_runlist_mem *init_rl_mem(struct gk20a *g, u32 runlist_size)
{
struct nvgpu_runlist_mem *mem = nvgpu_kzalloc(g, sizeof(*mem));
int err;
if (mem == NULL) {
return NULL;
}
err = nvgpu_dma_alloc_flags_sys(g,
g->is_virtual ?
0ULL : NVGPU_DMA_PHYSICALLY_ADDRESSED,
runlist_size,
&mem->mem);
if (err != 0) {
nvgpu_kfree(g, mem);
mem = NULL;
}
return mem;
}
static struct nvgpu_runlist_domain *nvgpu_init_rl_domain(struct gk20a *g, u32 runlist_size)
{
struct nvgpu_runlist_domain *domain = nvgpu_kzalloc(g, sizeof(*domain));
if (domain == NULL) {
return NULL;
}
domain->mem = init_rl_mem(g, runlist_size);
if (domain->mem == NULL) {
goto free_domain;
}
domain->mem_hw = init_rl_mem(g, runlist_size);
if (domain->mem_hw == NULL) {
goto free_mem;
}
return domain;
free_mem:
free_rl_mem(g, domain->mem);
free_domain:
nvgpu_kfree(g, domain);
return NULL;
}
static int nvgpu_init_active_runlist_mapping(struct gk20a *g)
{
struct nvgpu_runlist *runlist;
struct nvgpu_fifo *f = &g->fifo;
unsigned int runlist_id;
size_t runlist_size;
u32 i, j;
u32 i;
int err = 0;
rl_dbg(g, "Building active runlist map.");
@@ -840,26 +920,14 @@ static int nvgpu_init_active_runlist_mapping(struct gk20a *g)
rl_dbg(g, " RL entries: %d", f->num_runlist_entries);
rl_dbg(g, " RL size %zu", runlist_size);
for (j = 0; j < MAX_RUNLIST_BUFFERS; j++) {
err = nvgpu_dma_alloc_flags_sys(g,
g->is_virtual ?
0ULL : NVGPU_DMA_PHYSICALLY_ADDRESSED,
runlist_size,
&runlist->mem[j]);
if (err != 0) {
nvgpu_err(g, "memory allocation failed");
err = -ENOMEM;
goto clean_up_runlist;
}
runlist->domain = nvgpu_init_rl_domain(g, runlist_size);
if (runlist->domain == NULL) {
nvgpu_err(g, "memory allocation failed");
err = -ENOMEM;
goto clean_up_runlist;
}
nvgpu_mutex_init(&runlist->runlist_lock);
/*
* None of buffers is pinned if this value doesn't change.
* Otherwise, one of them (cur_buffer) must have been pinned.
*/
runlist->cur_buffer = MAX_RUNLIST_BUFFERS;
}
return 0;


@@ -74,6 +74,7 @@ done:
}
static bool vgpu_runlist_modify_active_locked(struct gk20a *g, u32 runlist_id,
struct nvgpu_runlist_domain *domain,
struct nvgpu_channel *ch, bool add)
{
struct nvgpu_fifo *f = &g->fifo;
@@ -99,6 +100,7 @@ static bool vgpu_runlist_modify_active_locked(struct gk20a *g, u32 runlist_id,
}
static void vgpu_runlist_reconstruct_locked(struct gk20a *g, u32 runlist_id,
struct nvgpu_runlist_domain *domain,
bool add_entries)
{
struct nvgpu_fifo *f = &g->fifo;
@@ -111,7 +113,7 @@ static void vgpu_runlist_reconstruct_locked(struct gk20a *g, u32 runlist_id,
u32 count = 0;
unsigned long chid;
runlist_entry = runlist->mem[0].cpu_va;
runlist_entry = domain->mem->mem.cpu_va;
nvgpu_assert(f->num_channels <= (unsigned int)U16_MAX);
for_each_set_bit(chid,
@@ -121,9 +123,9 @@ static void vgpu_runlist_reconstruct_locked(struct gk20a *g, u32 runlist_id,
count++;
}
runlist->count = count;
domain->mem->count = count;
} else {
runlist->count = 0;
domain->mem->count = 0;
}
}
@@ -132,14 +134,15 @@ static int vgpu_runlist_update_locked(struct gk20a *g, u32 runlist_id,
bool wait_for_finish)
{
struct nvgpu_fifo *f = &g->fifo;
struct nvgpu_runlist *runlist;
struct nvgpu_runlist *runlist = f->runlists[runlist_id];
struct nvgpu_runlist_domain *domain = runlist->domain;
bool add_entries;
nvgpu_log_fn(g, " ");
if (ch != NULL) {
bool update = vgpu_runlist_modify_active_locked(g, runlist_id,
ch, add);
domain, ch, add);
if (!update) {
/* no change in runlist contents */
return 0;
@@ -151,12 +154,11 @@ static int vgpu_runlist_update_locked(struct gk20a *g, u32 runlist_id,
add_entries = add;
}
runlist = f->runlists[runlist_id];
vgpu_runlist_reconstruct_locked(g, runlist_id, add_entries);
vgpu_runlist_reconstruct_locked(g, runlist_id, domain, add_entries);
return vgpu_submit_runlist(g, vgpu_get_handle(g), runlist_id,
runlist->mem[0].cpu_va, runlist->count);
domain->mem->mem.cpu_va,
domain->mem->count);
}
/* add/remove a channel from runlist


@@ -30,6 +30,5 @@
struct gk20a;
u32 ga100_runlist_count_max(struct gk20a *g);
void ga100_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index);
void ga100_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist);
#endif /* NVGPU_RUNLIST_FIFO_GA100_H */


@@ -38,24 +38,20 @@ u32 ga100_runlist_count_max(struct gk20a *g)
return nvgpu_get_litter_value(g, GPU_LIT_MAX_RUNLISTS_SUPPORTED);
}
void ga100_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index)
void ga100_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist)
{
struct nvgpu_runlist *runlist = NULL;
u64 runlist_iova;
u32 runlist_iova_lo, runlist_iova_hi;
runlist = g->fifo.runlists[runlist_id];
runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]);
runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem);
runlist_iova_lo = u64_lo32(runlist_iova) >>
runlist_submit_base_lo_ptr_align_shift_v();
runlist_iova_hi = u64_hi32(runlist_iova);
if (count != 0U) {
if (runlist->domain->mem_hw->count != 0U) {
nvgpu_runlist_writel(g, runlist, runlist_submit_base_lo_r(),
runlist_submit_base_lo_ptr_lo_f(runlist_iova_lo) |
nvgpu_aperture_mask(g, &runlist->mem[buffer_index],
nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem,
runlist_submit_base_lo_target_sys_mem_noncoherent_f(),
runlist_submit_base_lo_target_sys_mem_coherent_f(),
runlist_submit_base_lo_target_vid_mem_f()));
@@ -67,5 +63,5 @@ void ga100_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
/* TODO offset in runlist support */
nvgpu_runlist_writel(g, runlist, runlist_submit_r(),
runlist_submit_offset_f(0U) |
runlist_submit_length_f(count));
runlist_submit_length_f(runlist->domain->mem_hw->count));
}


@@ -28,11 +28,11 @@
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_runlist;
u32 ga10b_runlist_count_max(struct gk20a *g);
u32 ga10b_runlist_length_max(struct gk20a *g);
void ga10b_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index);
void ga10b_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist);
int ga10b_runlist_wait_pending(struct gk20a *g, u32 runlist_id);
void ga10b_runlist_write_state(struct gk20a *g, u32 runlists_mask,
u32 runlist_state);


@@ -48,24 +48,20 @@ u32 ga10b_runlist_length_max(struct gk20a *g)
return runlist_submit_length_max_v();
}
void ga10b_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index)
void ga10b_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist)
{
struct nvgpu_runlist *runlist = NULL;
u64 runlist_iova;
u32 runlist_iova_lo, runlist_iova_hi;
runlist = g->fifo.runlists[runlist_id];
runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]);
runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem);
runlist_iova_lo = u64_lo32(runlist_iova) >>
runlist_submit_base_lo_ptr_align_shift_v();
runlist_iova_hi = u64_hi32(runlist_iova);
if (count != 0U) {
if (runlist->domain->mem_hw->count != 0U) {
nvgpu_runlist_writel(g, runlist, runlist_submit_base_lo_r(),
runlist_submit_base_lo_ptr_lo_f(runlist_iova_lo) |
nvgpu_aperture_mask(g, &runlist->mem[buffer_index],
nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem,
runlist_submit_base_lo_target_sys_mem_noncoherent_f(),
runlist_submit_base_lo_target_sys_mem_coherent_f(),
runlist_submit_base_lo_target_vid_mem_f()));
@@ -77,7 +73,7 @@ void ga10b_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
/* TODO offset in runlist support */
nvgpu_runlist_writel(g, runlist, runlist_submit_r(),
runlist_submit_offset_f(0U) |
runlist_submit_length_f(count));
runlist_submit_length_f(runlist->domain->mem_hw->count));
}
int ga10b_runlist_wait_pending(struct gk20a *g, u32 runlist_id)


@@ -28,6 +28,7 @@
struct nvgpu_channel;
struct nvgpu_tsg;
struct gk20a;
struct nvgpu_runlist;
#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
int gk20a_runlist_reschedule(struct nvgpu_channel *ch, bool preempt_next);
@@ -39,8 +40,7 @@ u32 gk20a_runlist_count_max(struct gk20a *g);
#endif
u32 gk20a_runlist_length_max(struct gk20a *g);
void gk20a_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index);
void gk20a_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist);
int gk20a_runlist_wait_pending(struct gk20a *g, u32 runlist_id);
void gk20a_runlist_write_state(struct gk20a *g, u32 runlists_mask,
u32 runlist_state);


@@ -38,29 +38,24 @@ u32 gk20a_runlist_length_max(struct gk20a *g)
return fifo_eng_runlist_length_max_v();
}
void gk20a_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index)
void gk20a_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist)
{
struct nvgpu_runlist *runlist = NULL;
u64 runlist_iova;
u64 runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem);
runlist = g->fifo.runlists[runlist_id];
runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]);
nvgpu_spinlock_acquire(&g->fifo.runlist_submit_lock);
nvgpu_spinlock_acquire(&g->fifo.runlist_submit_lock);
if (runlist->domain->mem_hw->count != 0U) {
nvgpu_writel(g, fifo_runlist_base_r(),
fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12U)) |
nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem,
fifo_runlist_base_target_sys_mem_ncoh_f(),
fifo_runlist_base_target_sys_mem_coh_f(),
fifo_runlist_base_target_vid_mem_f()));
}
if (count != 0U) {
nvgpu_writel(g, fifo_runlist_base_r(),
fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12U)) |
nvgpu_aperture_mask(g, &runlist->mem[buffer_index],
fifo_runlist_base_target_sys_mem_ncoh_f(),
fifo_runlist_base_target_sys_mem_coh_f(),
fifo_runlist_base_target_vid_mem_f()));
}
nvgpu_writel(g, fifo_runlist_r(),
fifo_runlist_engine_f(runlist_id) |
fifo_eng_runlist_length_f(count));
nvgpu_writel(g, fifo_runlist_r(),
fifo_runlist_engine_f(runlist->id) |
fifo_eng_runlist_length_f(runlist->domain->mem_hw->count));
nvgpu_spinlock_release(&g->fifo.runlist_submit_lock);
}


@@ -34,34 +34,31 @@ u32 tu104_runlist_count_max(struct gk20a *g)
return fifo_runlist_base_lo__size_1_v();
}
void tu104_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index)
void tu104_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist)
{
struct nvgpu_runlist *runlist = NULL;
u64 runlist_iova;
u32 runlist_iova_lo, runlist_iova_hi;
runlist = g->fifo.runlists[runlist_id];
runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]);
runlist_iova = nvgpu_mem_get_addr(g, &runlist->domain->mem_hw->mem);
runlist_iova_lo = u64_lo32(runlist_iova) >>
fifo_runlist_base_lo_ptr_align_shift_v();
runlist_iova_hi = u64_hi32(runlist_iova);
if (count != 0U) {
nvgpu_writel(g, fifo_runlist_base_lo_r(runlist_id),
if (runlist->domain->mem_hw->count != 0U) {
nvgpu_writel(g, fifo_runlist_base_lo_r(runlist->id),
fifo_runlist_base_lo_ptr_lo_f(runlist_iova_lo) |
nvgpu_aperture_mask(g, &runlist->mem[buffer_index],
nvgpu_aperture_mask(g, &runlist->domain->mem_hw->mem,
fifo_runlist_base_lo_target_sys_mem_ncoh_f(),
fifo_runlist_base_lo_target_sys_mem_coh_f(),
fifo_runlist_base_lo_target_vid_mem_f()));
nvgpu_writel(g, fifo_runlist_base_hi_r(runlist_id),
nvgpu_writel(g, fifo_runlist_base_hi_r(runlist->id),
fifo_runlist_base_hi_ptr_hi_f(runlist_iova_hi));
}
nvgpu_writel(g, fifo_runlist_submit_r(runlist_id),
fifo_runlist_submit_length_f(count));
nvgpu_writel(g, fifo_runlist_submit_r(runlist->id),
fifo_runlist_submit_length_f(runlist->domain->mem_hw->count));
}
int tu104_runlist_wait_pending(struct gk20a *g, u32 runlist_id)


@@ -26,10 +26,10 @@
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_runlist;
u32 tu104_runlist_count_max(struct gk20a *g);
void tu104_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index);
void tu104_runlist_hw_submit(struct gk20a *g, struct nvgpu_runlist *runlist);
int tu104_runlist_wait_pending(struct gk20a *g, u32 runlist_id);
#endif /* NVGPU_RUNLIST_FIFO_TU104_H */


@@ -32,6 +32,7 @@
*/
struct gk20a;
struct nvgpu_channel;
struct nvgpu_runlist;
/**
* Runlist HAL operations.
@@ -89,8 +90,7 @@ struct gops_runlist {
void (*get_tsg_entry)(struct nvgpu_tsg *tsg,
u32 *runlist, u32 timeslice);
void (*get_ch_entry)(struct nvgpu_channel *ch, u32 *runlist);
void (*hw_submit)(struct gk20a *g, u32 runlist_id,
u32 count, u32 buffer_index);
void (*hw_submit)(struct gk20a *g, struct nvgpu_runlist *runlist);
int (*wait_pending)(struct gk20a *g, u32 runlist_id);
void (*write_state)(struct gk20a *g, u32 runlists_mask,
u32 runlist_state);


@@ -75,31 +75,50 @@ struct nvgpu_pbdma_info;
/** Enable runlist. */
#define RUNLIST_ENABLED 1U
/** Double buffering is used to build runlists */
#define MAX_RUNLIST_BUFFERS 2U
/** Runlist identifier is invalid. */
#define NVGPU_INVALID_RUNLIST_ID U32_MAX
/*
* Updates to this memory are still serialized by the runlist lock.
*
* TODO: add a mutex when domain updates get more fine-grained. The buffers in
* nvgpu_runlist_domain are pointer members for a reason to make this easier in
* the future; the buffers may get juggled around.
*/
struct nvgpu_runlist_mem {
/** Rendered runlist memory suitable for HW. */
struct nvgpu_mem mem;
/** Number of entries written in the buffer. */
u32 count;
};
struct nvgpu_runlist_domain {
/** Runlist buffer free to use in sw. Swapped with another mem on next load. */
struct nvgpu_runlist_mem *mem;
/** Currently active buffer submitted for hardware. */
struct nvgpu_runlist_mem *mem_hw;
};
struct nvgpu_runlist {
/** Runlist identifier. */
/** The HW has some designated RL IDs that are bound to engines. */
u32 id;
/** Bitmap of active channels in the runlist. One bit per chid. */
unsigned long *active_channels;
/** Bitmap of active TSGs in the runlist. One bit per tsgid. */
unsigned long *active_tsgs;
/** Runlist buffers. Double buffering is used for each engine. */
struct nvgpu_mem mem[MAX_RUNLIST_BUFFERS];
/** Indicates current runlist buffer used by HW. */
u32 cur_buffer;
/* The default domain is the only one that currently exists. */
struct nvgpu_runlist_domain *domain;
/** Bitmask of PBDMAs supported for this runlist. */
u32 pbdma_bitmask;
/** Bitmask of engines using this runlist. */
u32 eng_bitmask;
/** Bitmask of engines to be reset during recovery. */
u32 reset_eng_bitmask;
/** Cached hw_submit parameter. */
u32 count;
/** Protect ch/tsg/runlist preempt & runlist update. */
struct nvgpu_mutex runlist_lock;
@@ -139,7 +158,8 @@ struct nvgpu_runlist {
*/
u32 nvgpu_runlist_construct_locked(struct nvgpu_fifo *f,
struct nvgpu_runlist *runlist,
u32 buf_id, u32 max_entries);
struct nvgpu_runlist_domain *domain,
u32 max_entries);
/**
* @brief Add/remove channel to/from runlist (locked)


@@ -75,17 +75,19 @@ done:
int test_gk20a_runlist_hw_submit(struct unit_module *m,
struct gk20a *g, void *args)
{
struct nvgpu_fifo *f = &g->fifo;
int ret = UNIT_FAIL;
u32 runlist_id = nvgpu_engine_get_gr_runlist_id(g);
u32 count;
u32 buffer_index = 0;
for (count = 0; count < 2; count++) {
nvgpu_writel(g, fifo_runlist_r(), 0);
nvgpu_writel(g, fifo_runlist_base_r(), 0);
gk20a_runlist_hw_submit(g, runlist_id, count, buffer_index);
f->runlists[runlist_id]->domain->mem_hw->count = count;
gk20a_runlist_hw_submit(g, f->runlists[runlist_id]);
if (count == 0) {
unit_assert(nvgpu_readl(g, fifo_runlist_base_r()) == 0,
goto done);