diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 1b2b76ed6..e59d8777c 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -60,8 +60,6 @@
 #define FECS_METHOD_WFI_RESTORE 0x80000
 #define FECS_MAILBOX_0_ACK_RESTORE 0x4

-#define RUNLIST_APPEND_FAILURE 0xffffffffU
-
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);

 static const char *const pbdma_intr_fault_type_desc[] = {
@@ -3325,7 +3323,7 @@ void gk20a_get_ch_runlist_entry(struct channel_gk20a *ch, u32 *runlist)

 static u32 nvgpu_runlist_append_tsg(struct gk20a *g,
 		struct fifo_runlist_info_gk20a *runlist,
-		u32 *runlist_entry,
+		u32 **runlist_entry,
 		u32 *entries_left,
 		struct tsg_gk20a *tsg)
 {
@@ -3334,16 +3332,19 @@ static u32 nvgpu_runlist_append_tsg(struct gk20a *g,
 	struct channel_gk20a *ch;
 	u32 count = 0;

+	nvgpu_log_fn(f->g, " ");
+
 	if (*entries_left == 0U) {
 		return RUNLIST_APPEND_FAILURE;
 	}

 	/* add TSG entry */
 	nvgpu_log_info(g, "add TSG %d to runlist", tsg->tsgid);
-	g->ops.fifo.get_tsg_runlist_entry(tsg, runlist_entry);
+	g->ops.fifo.get_tsg_runlist_entry(tsg, *runlist_entry);
 	nvgpu_log_info(g, "tsg rl entries left %d runlist [0] %x [1] %x",
-		*entries_left, runlist_entry[0], runlist_entry[1]);
-	runlist_entry += runlist_entry_words;
+		*entries_left,
+		(*runlist_entry)[0], (*runlist_entry)[1]);
+	*runlist_entry += runlist_entry_words;
 	count++;
 	(*entries_left)--;

@@ -3363,11 +3364,12 @@ static u32 nvgpu_runlist_append_tsg(struct gk20a *g,

 		nvgpu_log_info(g, "add channel %d to runlist",
 			ch->chid);
-		g->ops.fifo.get_ch_runlist_entry(ch, runlist_entry);
+		g->ops.fifo.get_ch_runlist_entry(ch, *runlist_entry);
 		nvgpu_log_info(g, "rl entries left %d runlist [0] %x [1] %x",
-			*entries_left, runlist_entry[0], runlist_entry[1]);
+			*entries_left,
+			(*runlist_entry)[0], (*runlist_entry)[1]);
 		count++;
-		runlist_entry += runlist_entry_words;
+		*runlist_entry += runlist_entry_words;
 		(*entries_left)--;
 	}
 	nvgpu_rwsem_up_read(&tsg->ch_list_lock);
@@ -3375,81 +3377,196 @@ static u32 nvgpu_runlist_append_tsg(struct gk20a *g,

 	return count;
 }

-/* recursively construct a runlist with interleaved bare channels and TSGs */
-u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
+
+static u32 nvgpu_runlist_append_prio(struct fifo_gk20a *f,
 		struct fifo_runlist_info_gk20a *runlist,
-		u32 cur_level,
-		u32 *runlist_entry,
-		bool interleave_enabled,
-		bool prev_empty,
-		u32 *entries_left)
+		u32 **runlist_entry,
+		u32 *entries_left,
+		u32 interleave_level)
 {
-	bool last_level = cur_level == NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH;
-	bool skip_next = false;
-	unsigned long tsgid;
 	u32 count = 0;
-	struct gk20a *g = f->g;
+	unsigned long tsgid;

-	nvgpu_log_fn(g, " ");
+	nvgpu_log_fn(f->g, " ");

-	/* for each TSG, T, on this level, insert all higher-level channels
-	   and TSGs before inserting T. */
 	for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
 		struct tsg_gk20a *tsg = &f->tsg[tsgid];
-		u32 n;
+		u32 entries;

-		if (tsg->interleave_level != cur_level) {
+		if (tsg->interleave_level == interleave_level) {
+			entries = nvgpu_runlist_append_tsg(f->g, runlist,
+					runlist_entry, entries_left, tsg);
+			if (entries == RUNLIST_APPEND_FAILURE) {
+				return RUNLIST_APPEND_FAILURE;
+			}
+			count += entries;
+		}
+	}
+
+	return count;
+}
+
+static u32 nvgpu_runlist_append_hi(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	nvgpu_log_fn(f->g, " ");
+
+	/*
+	 * No higher levels - this is where the "recursion" ends; just add all
+	 * active TSGs at this level.
+	 */
+	return nvgpu_runlist_append_prio(f, runlist, runlist_entry,
+			entries_left,
+			NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH);
+}
+
+static u32 nvgpu_runlist_append_med(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	u32 count = 0;
+	unsigned long tsgid;
+
+	nvgpu_log_fn(f->g, " ");
+
+	for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
+		struct tsg_gk20a *tsg = &f->tsg[tsgid];
+		u32 entries;
+
+		if (tsg->interleave_level !=
+				NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM) {
 			continue;
 		}

-		if (!last_level && !skip_next) {
-			runlist_entry = gk20a_runlist_construct_locked(f,
-				runlist,
-				cur_level + 1,
-				runlist_entry,
-				interleave_enabled,
-				false,
-				entries_left);
-			if (!interleave_enabled) {
-				skip_next = true;
-			}
-		}
+		/* LEVEL_MEDIUM list starts with a LEVEL_HIGH, if any */

-		n = nvgpu_runlist_append_tsg(g, runlist, runlist_entry,
-			entries_left, tsg);
-		if (n == RUNLIST_APPEND_FAILURE) {
-			return NULL;
+		entries = nvgpu_runlist_append_hi(f, runlist,
+				runlist_entry, entries_left);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
 		}
-		count += n;
+		count += entries;
+
+		entries = nvgpu_runlist_append_tsg(f->g, runlist,
+				runlist_entry, entries_left, tsg);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
 	}

-	/* append entries from higher level if this level is empty */
-	if ((count == 0U) && !last_level) {
-		runlist_entry = gk20a_runlist_construct_locked(f,
-			runlist,
-			cur_level + 1,
-			runlist_entry,
-			interleave_enabled,
-			true,
-			entries_left);
+	return count;
+}
+
+static u32 nvgpu_runlist_append_low(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	u32 count = 0;
+	unsigned long tsgid;
+
+	nvgpu_log_fn(f->g, " ");
+
+	for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
+		struct tsg_gk20a *tsg = &f->tsg[tsgid];
+		u32 entries;
+
+		if (tsg->interleave_level !=
+				NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW) {
+			continue;
+		}
+
+		/* The medium level starts with the highs, if any. */
+
+		entries = nvgpu_runlist_append_med(f, runlist,
+				runlist_entry, entries_left);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+
+		entries = nvgpu_runlist_append_hi(f, runlist,
+				runlist_entry, entries_left);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+
+		entries = nvgpu_runlist_append_tsg(f->g, runlist,
+				runlist_entry, entries_left, tsg);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
 	}

+	if (count == 0U) {
+		/*
+		 * No transitions to fill with higher levels, so add
+		 * the next level once. If that's empty too, we have only
+		 * LEVEL_HIGH jobs.
+		 */
+		count = nvgpu_runlist_append_med(f, runlist,
+				runlist_entry, entries_left);
+		if (count == 0U) {
+			count = nvgpu_runlist_append_hi(f, runlist,
+					runlist_entry, entries_left);
+		}
+	}
+
+	return count;
+}
+
+static u32 nvgpu_runlist_append_flat(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	u32 count = 0, entries, i;
+
+	nvgpu_log_fn(f->g, " ");
+
+	/* Group by priority but don't interleave. High comes first. */
+
+	for (i = 0; i < NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS; i++) {
+		u32 level = NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH - i;
+
+		entries = nvgpu_runlist_append_prio(f, runlist, runlist_entry,
+				entries_left, level);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+	}
+
+	return count;
+}
+
+u32 nvgpu_runlist_construct_locked(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 buf_id,
+		u32 max_entries)
+{
+	u32 *runlist_entry_base = runlist->mem[buf_id].cpu_va;
+
+	nvgpu_log_fn(f->g, " ");
+
 	/*
-	 * if previous and this level have entries, append
-	 * entries from higher level.
-	 *
-	 * ex. dropping from MEDIUM to LOW, need to insert HIGH
+	 * The entry pointer and capacity counter that live on the stack here
+	 * keep track of the current position and the remaining space when tsg
+	 * and channel entries are ultimately appended.
 	 */
-	if (interleave_enabled && (count != 0U) && !prev_empty && !last_level) {
-		runlist_entry = gk20a_runlist_construct_locked(f,
-			runlist,
-			cur_level + 1,
-			runlist_entry,
-			interleave_enabled,
-			false,
-			entries_left);
+	if (f->g->runlist_interleave) {
+		return nvgpu_runlist_append_low(f, runlist,
+				&runlist_entry_base, &max_entries);
+	} else {
+		return nvgpu_runlist_append_flat(f, runlist,
+				&runlist_entry_base, &max_entries);
 	}
-	return runlist_entry;
 }

 int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
@@ -3515,12 +3632,10 @@ int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 	int ret = 0;
 	struct fifo_gk20a *f = &g->fifo;
 	struct fifo_runlist_info_gk20a *runlist = NULL;
-	u32 *runlist_entry_base = NULL;
 	u64 runlist_iova;
 	u32 new_buf;
 	struct channel_gk20a *ch = NULL;
 	struct tsg_gk20a *tsg = NULL;
-	u32 runlist_entry_words = f->runlist_entry_size / (u32)sizeof(u32);

 	runlist = &f->runlist_info[runlist_id];

@@ -3567,30 +3682,19 @@ int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 		goto clean_up;
 	}

-	runlist_entry_base = runlist->mem[new_buf].cpu_va;
-	if (runlist_entry_base == NULL) {
-		ret = -ENOMEM;
-		goto clean_up;
-	}
-
 	if (chid != FIFO_INVAL_CHANNEL_ID || /* add/remove a valid channel */
 			add /* resume to add all channels back */) {
-		u32 max_entries = f->num_runlist_entries;
-		u32 *runlist_end;
+		u32 num_entries;

-		runlist_end = gk20a_runlist_construct_locked(f,
+		num_entries = nvgpu_runlist_construct_locked(f,
 						runlist,
-						0,
-						runlist_entry_base,
-						g->runlist_interleave,
-						true,
-						&max_entries);
-		if (runlist_end == NULL) {
+						new_buf,
+						f->num_runlist_entries);
+		if (num_entries == RUNLIST_APPEND_FAILURE) {
 			ret = -E2BIG;
 			goto clean_up;
 		}
-		runlist->count = (runlist_end - runlist_entry_base) /
-				runlist_entry_words;
+		runlist->count = num_entries;
 		WARN_ON(runlist->count > f->num_runlist_entries);
 	} else {
 		/* suspend to remove all channels */
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 70f40c463..5da5e2ac0 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -33,10 +33,10 @@ struct channel_gk20a;
 struct tsg_gk20a;

 enum {
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW = 0,
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM,
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH,
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW = 0U,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM = 1U,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH = 2U,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS = 3U,
 };

 #define MAX_RUNLIST_BUFFERS 2
@@ -406,13 +406,11 @@ int gk20a_fifo_setup_userd(struct channel_gk20a *c);

 u32 gk20a_fifo_pbdma_acquire_val(u64 timeout);

-u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
+#define RUNLIST_APPEND_FAILURE 0xffffffffU
+u32 nvgpu_runlist_construct_locked(struct fifo_gk20a *f,
 		struct fifo_runlist_info_gk20a *runlist,
-		u32 cur_level,
-		u32 *runlist_entry,
-		bool interleave_enabled,
-		bool prev_empty,
-		u32 *entries_left);
+		u32 buf_id,
+		u32 max_entries);

 void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
 	u32 count, u32 buffer_index);
 int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id);
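
Reviewer's note: the ordering produced by the new nvgpu_runlist_append_{hi,med,low}() helpers can be checked in isolation. Below is a minimal standalone sketch (plain C, compilable in userspace) that mirrors only their control flow; the names, the tiny TSG table, and the printf-based "runlist" are illustrative assumptions, not driver code, and the real helpers additionally emit per-channel entries and track the remaining buffer capacity.

/* Standalone sketch of the interleave order; illustrative names only. */
#include <stdio.h>

#define NTSG 4

/* Mirrors NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_{LOW,MEDIUM,HIGH}. */
enum level { LOW, MEDIUM, HIGH };

struct tsg {
	const char *name;
	enum level level;
};

static const struct tsg tsgs[NTSG] = {
	{ "h1", HIGH }, { "h2", HIGH }, { "m1", MEDIUM }, { "l1", LOW },
};

/* Emit every TSG at exactly one level, like nvgpu_runlist_append_prio(). */
static unsigned int append_prio(enum level level)
{
	unsigned int count = 0;
	int i;

	for (i = 0; i < NTSG; i++) {
		if (tsgs[i].level == level) {
			printf("%s ", tsgs[i].name);
			count++;
		}
	}
	return count;
}

static unsigned int append_hi(void)
{
	/* No higher levels: the "recursion" bottoms out here. */
	return append_prio(HIGH);
}

static unsigned int append_med(void)
{
	unsigned int count = 0;
	int i;

	for (i = 0; i < NTSG; i++) {
		if (tsgs[i].level != MEDIUM)
			continue;
		count += append_hi();	/* highs precede each medium entry */
		printf("%s ", tsgs[i].name);
		count++;
	}
	return count;
}

static unsigned int append_low(void)
{
	unsigned int count = 0;
	int i;

	for (i = 0; i < NTSG; i++) {
		if (tsgs[i].level != LOW)
			continue;
		count += append_med();	/* mediums, each led by the highs */
		count += append_hi();	/* one more pass for med->low */
		printf("%s ", tsgs[i].name);
		count++;
	}
	if (count == 0) {
		/* No lows at all: fall back to mediums, then highs. */
		count = append_med();
		if (count == 0)
			count = append_hi();
	}
	return count;
}

int main(void)
{
	unsigned int n = append_low();

	printf("\n%u entries\n", n);	/* prints: h1 h2 m1 h1 h2 l1 */
	return 0;
}

With the table above, the interleaved order comes out as "h1 h2 m1 h1 h2 l1": every LOW entry is preceded by the full MEDIUM list (itself prefixed with the HIGH list) plus one extra HIGH pass for the MEDIUM-to-LOW transition, matching the comments in the patch. With interleaving disabled, nvgpu_runlist_append_flat() would instead group strictly by priority, giving "h1 h2 m1 l1".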