mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: avoid recursion in runlist construction
MISRA rule 17.2 forbids recursion, as it is a hazard to stack space. To comply, and additionally to make the code more straightforward to read, rewrite the runlist construction with three explicit functions that stand in for the three levels of the earlier recursion. These levels map to the three priority levels of TSGs, and having more than that is unlikely.

When "runlist interleaving" is enabled, TSGs with higher priorities are interleaved between each pair of lower-priority TSGs, so that the latency for a job at priority level X is no more than the sum of all jobs' timeslices at priority X and higher, plus at most one job at a lower level. This can be illustrated as follows (low, medium, high TSGs 1 and 2):

  L1 L2                                                   (only low-priority TSGs)
  H1 H2                                                   (only high-priority TSGs)
  H1 H2 M1 H1 H2 M2                                       (no low-priority TSGs)
  M1 M2 L1 M1 M2 L2                                       (no high-priority TSGs)
  H1 H2 L1 H1 H2 L2                                       (no medium-priority TSGs)
  H1 H2 M1 H1 H2 M2 H1 H2 L1 H1 H2 M1 H1 H2 M2 H1 H2 L2   (no empty levels)

Without interleaving, the items are simply grouped by priority.

Jira NVGPU-1174

Change-Id: Ic3b5106945df7105633730ecd1d150af770a5e83
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1918226
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
committed by mobile promotions
parent 998bf379df
commit 0fbc1a2652
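The interleaving pattern above is easy to reproduce in isolation. The following standalone C sketch is illustrative only: the toy arrays and the append_level() helper are invented here, and the real driver appends TSG and channel runlist entries instead of printing labels. It generates the "no empty levels" line from the table in the commit message:

#include <stdio.h>

/* Hypothetical helper: stands in for appending all TSGs of one level. */
static void append_level(const char **items, int n)
{
	for (int i = 0; i < n; i++) {
		printf("%s ", items[i]);
	}
}

int main(void)
{
	const char *hi[]  = { "H1", "H2" };
	const char *med[] = { "M1", "M2" };
	const char *low[] = { "L1", "L2" };

	/* Before each low-priority item, emit every medium-priority item,
	 * and before each medium item (and the low itself), emit all highs. */
	for (int l = 0; l < 2; l++) {
		for (int m = 0; m < 2; m++) {
			append_level(hi, 2);
			printf("%s ", med[m]);
		}
		append_level(hi, 2);
		printf("%s ", low[l]);
	}
	printf("\n");
	/* Output: H1 H2 M1 H1 H2 M2 H1 H2 L1 H1 H2 M1 H1 H2 M2 H1 H2 L2 */
	return 0;
}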
@@ -60,8 +60,6 @@
 #define FECS_METHOD_WFI_RESTORE 0x80000
 #define FECS_MAILBOX_0_ACK_RESTORE 0x4
 
-#define RUNLIST_APPEND_FAILURE 0xffffffffU
-
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 
 static const char *const pbdma_intr_fault_type_desc[] = {
@@ -3325,7 +3323,7 @@ void gk20a_get_ch_runlist_entry(struct channel_gk20a *ch, u32 *runlist)
 
 static u32 nvgpu_runlist_append_tsg(struct gk20a *g,
 		struct fifo_runlist_info_gk20a *runlist,
-		u32 *runlist_entry,
+		u32 **runlist_entry,
 		u32 *entries_left,
 		struct tsg_gk20a *tsg)
 {
@@ -3334,16 +3332,19 @@ static u32 nvgpu_runlist_append_tsg(struct gk20a *g,
 	struct channel_gk20a *ch;
 	u32 count = 0;
 
 	nvgpu_log_fn(f->g, " ");
 
 	if (*entries_left == 0U) {
 		return RUNLIST_APPEND_FAILURE;
 	}
 
 	/* add TSG entry */
 	nvgpu_log_info(g, "add TSG %d to runlist", tsg->tsgid);
-	g->ops.fifo.get_tsg_runlist_entry(tsg, runlist_entry);
+	g->ops.fifo.get_tsg_runlist_entry(tsg, *runlist_entry);
 	nvgpu_log_info(g, "tsg rl entries left %d runlist [0] %x [1] %x",
-		*entries_left, runlist_entry[0], runlist_entry[1]);
-	runlist_entry += runlist_entry_words;
+		*entries_left,
+		(*runlist_entry)[0], (*runlist_entry)[1]);
+	*runlist_entry += runlist_entry_words;
 	count++;
 	(*entries_left)--;
 
@@ -3363,11 +3364,12 @@ static u32 nvgpu_runlist_append_tsg(struct gk20a *g,
 
 		nvgpu_log_info(g, "add channel %d to runlist",
 			ch->chid);
-		g->ops.fifo.get_ch_runlist_entry(ch, runlist_entry);
+		g->ops.fifo.get_ch_runlist_entry(ch, *runlist_entry);
 		nvgpu_log_info(g, "rl entries left %d runlist [0] %x [1] %x",
-			*entries_left, runlist_entry[0], runlist_entry[1]);
+			*entries_left,
+			(*runlist_entry)[0], (*runlist_entry)[1]);
 		count++;
-		runlist_entry += runlist_entry_words;
+		*runlist_entry += runlist_entry_words;
 		(*entries_left)--;
 	}
 	nvgpu_rwsem_up_read(&tsg->ch_list_lock);
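The signature change from u32 *runlist_entry to u32 **runlist_entry in the two hunks above is the usual C out-parameter idiom for a shared write cursor: the helper writes through the caller's pointer and advances it, so nested helpers all move the same position and capacity counter. A minimal sketch of the idiom (the emit() helper and buffer are hypothetical, not driver code):

#include <stdio.h>

/* The callee writes through *cursor and advances it, so the caller sees
 * the new position and the updated remaining capacity afterwards. */
static unsigned int emit(unsigned int **cursor, unsigned int *left,
			 unsigned int value)
{
	if (*left == 0U) {
		return 0xffffffffU; /* sentinel, like RUNLIST_APPEND_FAILURE */
	}
	**cursor = value;
	(*cursor)++;	/* caller's pointer moves forward */
	(*left)--;	/* caller's capacity shrinks */
	return 1U;
}

int main(void)
{
	unsigned int buf[4];
	unsigned int *pos = buf;
	unsigned int left = 4U;

	(void)emit(&pos, &left, 0xaU);
	(void)emit(&pos, &left, 0xbU);
	printf("wrote %td entries, %u slots left\n", pos - buf, left);
	return 0;
}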
@@ -3375,81 +3377,196 @@ static u32 nvgpu_runlist_append_tsg(struct gk20a *g,
 	return count;
 }
 
-/* recursively construct a runlist with interleaved bare channels and TSGs */
-u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
+static u32 nvgpu_runlist_append_prio(struct fifo_gk20a *f,
 		struct fifo_runlist_info_gk20a *runlist,
-		u32 cur_level,
-		u32 *runlist_entry,
-		bool interleave_enabled,
-		bool prev_empty,
-		u32 *entries_left)
+		u32 **runlist_entry,
+		u32 *entries_left,
+		u32 interleave_level)
 {
-	bool last_level = cur_level == NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH;
-	bool skip_next = false;
+	unsigned long tsgid;
 	u32 count = 0;
-	struct gk20a *g = f->g;
-	unsigned long tsgid;
 
-	nvgpu_log_fn(g, " ");
+	nvgpu_log_fn(f->g, " ");
 
-	/* for each TSG, T, on this level, insert all higher-level channels
-	   and TSGs before inserting T. */
 	for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
 		struct tsg_gk20a *tsg = &f->tsg[tsgid];
-		u32 n;
+		u32 entries;
 
-		if (tsg->interleave_level != cur_level) {
-			continue;
+		if (tsg->interleave_level == interleave_level) {
+			entries = nvgpu_runlist_append_tsg(f->g, runlist,
+					runlist_entry, entries_left, tsg);
+			if (entries == RUNLIST_APPEND_FAILURE) {
+				return RUNLIST_APPEND_FAILURE;
+			}
+			count += entries;
 		}
+	}
+
+	return count;
+}
+
+static u32 nvgpu_runlist_append_hi(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	nvgpu_log_fn(f->g, " ");
+
+	/*
+	 * No higher levels - this is where the "recursion" ends; just add all
+	 * active TSGs at this level.
+	 */
+	return nvgpu_runlist_append_prio(f, runlist, runlist_entry,
+			entries_left,
+			NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH);
+}
+
+static u32 nvgpu_runlist_append_med(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	u32 count = 0;
+	unsigned long tsgid;
+
+	nvgpu_log_fn(f->g, " ");
+
+	for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
+		struct tsg_gk20a *tsg = &f->tsg[tsgid];
+		u32 entries;
+
+		if (tsg->interleave_level !=
+				NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM) {
+			continue;
+		}
 
-		if (!last_level && !skip_next) {
-			runlist_entry = gk20a_runlist_construct_locked(f,
-				runlist,
-				cur_level + 1,
-				runlist_entry,
-				interleave_enabled,
-				false,
-				entries_left);
-			if (!interleave_enabled) {
-				skip_next = true;
-			}
-		}
+		/* LEVEL_MEDIUM list starts with a LEVEL_HIGH, if any */
+		entries = nvgpu_runlist_append_hi(f, runlist,
+				runlist_entry, entries_left);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
 
-		n = nvgpu_runlist_append_tsg(g, runlist, runlist_entry,
-				entries_left, tsg);
-		if (n == RUNLIST_APPEND_FAILURE) {
-			return NULL;
+		entries = nvgpu_runlist_append_tsg(f->g, runlist,
+				runlist_entry, entries_left, tsg);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
 		}
-		count += n;
+		count += entries;
 	}
 
-	/* append entries from higher level if this level is empty */
-	if ((count == 0U) && !last_level) {
-		runlist_entry = gk20a_runlist_construct_locked(f,
-			runlist,
-			cur_level + 1,
-			runlist_entry,
-			interleave_enabled,
-			true,
-			entries_left);
-	}
+	return count;
+}
+
+static u32 nvgpu_runlist_append_low(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	u32 count = 0;
+	unsigned long tsgid;
+
+	nvgpu_log_fn(f->g, " ");
+
+	for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
+		struct tsg_gk20a *tsg = &f->tsg[tsgid];
+		u32 entries;
+
+		if (tsg->interleave_level !=
+				NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW) {
+			continue;
+		}
+
+		/* The medium level starts with the highs, if any. */
+		entries = nvgpu_runlist_append_med(f, runlist,
+				runlist_entry, entries_left);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+
+		entries = nvgpu_runlist_append_hi(f, runlist,
+				runlist_entry, entries_left);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+
+		entries = nvgpu_runlist_append_tsg(f->g, runlist,
+				runlist_entry, entries_left, tsg);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+	}
+
+	if (count == 0U) {
+		/*
+		 * No transitions to fill with higher levels, so add
+		 * the next level once. If that's empty too, we have only
+		 * LEVEL_HIGH jobs.
+		 */
+		count = nvgpu_runlist_append_med(f, runlist,
+				runlist_entry, entries_left);
+		if (count == 0U) {
+			count = nvgpu_runlist_append_hi(f, runlist,
+					runlist_entry, entries_left);
+		}
+	}
+
+	return count;
+}
+
+static u32 nvgpu_runlist_append_flat(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 **runlist_entry,
+		u32 *entries_left)
+{
+	u32 count = 0, entries, i;
+
+	nvgpu_log_fn(f->g, " ");
+
+	/* Group by priority but don't interleave. High comes first. */
+	for (i = 0; i < NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS; i++) {
+		u32 level = NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH - i;
+
+		entries = nvgpu_runlist_append_prio(f, runlist, runlist_entry,
+				entries_left, level);
+		if (entries == RUNLIST_APPEND_FAILURE) {
+			return RUNLIST_APPEND_FAILURE;
+		}
+		count += entries;
+	}
+
+	return count;
+}
+
+u32 nvgpu_runlist_construct_locked(struct fifo_gk20a *f,
+		struct fifo_runlist_info_gk20a *runlist,
+		u32 buf_id,
+		u32 max_entries)
+{
+	u32 *runlist_entry_base = runlist->mem[buf_id].cpu_va;
+
+	nvgpu_log_fn(f->g, " ");
 
 	/*
-	 * if previous and this level have entries, append
-	 * entries from higher level.
-	 *
-	 * ex. dropping from MEDIUM to LOW, need to insert HIGH
+	 * The entry pointer and capacity counter that live on the stack here
+	 * keep track of the current position and the remaining space when tsg
+	 * and channel entries are ultimately appended.
 	 */
-	if (interleave_enabled && (count != 0U) && !prev_empty && !last_level) {
-		runlist_entry = gk20a_runlist_construct_locked(f,
-			runlist,
-			cur_level + 1,
-			runlist_entry,
-			interleave_enabled,
-			false,
-			entries_left);
+	if (f->g->runlist_interleave) {
+		return nvgpu_runlist_append_low(f, runlist,
+				&runlist_entry_base, &max_entries);
+	} else {
+		return nvgpu_runlist_append_flat(f, runlist,
+				&runlist_entry_base, &max_entries);
 	}
-
-	return runlist_entry;
 }
 
 int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
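Since the new helpers return a u32 entry count rather than a pointer, failure needs an in-band value; RUNLIST_APPEND_FAILURE (U32_MAX) cannot collide with a real count because a runlist buffer holds far fewer entries. A toy model of the convention (the append_n() helper and the sizes are invented; the -E2BIG mapping matches the caller in the next hunk):

#include <stdio.h>

#define RUNLIST_APPEND_FAILURE 0xffffffffU

/* Helpers return the number of appended entries, or the in-band
 * sentinel U32_MAX on overflow, which callers propagate unchanged. */
static unsigned int append_n(unsigned int n, unsigned int *entries_left)
{
	if (*entries_left < n) {
		return RUNLIST_APPEND_FAILURE;
	}
	*entries_left -= n;
	return n;
}

int main(void)
{
	unsigned int left = 8U;
	unsigned int total = 0U;
	unsigned int got;

	got = append_n(6U, &left);	/* fits: 6 of 8 slots used */
	if (got == RUNLIST_APPEND_FAILURE) {
		return 1;
	}
	total += got;

	got = append_n(6U, &left);	/* only 2 slots left: overflow */
	if (got == RUNLIST_APPEND_FAILURE) {
		printf("overflow after %u entries; caller maps this to -E2BIG\n",
		       total);
		return 1;
	}
	total += got;
	printf("%u entries\n", total);
	return 0;
}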
@@ -3515,12 +3632,10 @@ int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 	int ret = 0;
 	struct fifo_gk20a *f = &g->fifo;
 	struct fifo_runlist_info_gk20a *runlist = NULL;
-	u32 *runlist_entry_base = NULL;
 	u64 runlist_iova;
 	u32 new_buf;
 	struct channel_gk20a *ch = NULL;
 	struct tsg_gk20a *tsg = NULL;
-	u32 runlist_entry_words = f->runlist_entry_size / (u32)sizeof(u32);
 
 	runlist = &f->runlist_info[runlist_id];
 
@@ -3567,30 +3682,19 @@ int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 		goto clean_up;
 	}
 
-	runlist_entry_base = runlist->mem[new_buf].cpu_va;
-	if (runlist_entry_base == NULL) {
-		ret = -ENOMEM;
-		goto clean_up;
-	}
-
 	if (chid != FIFO_INVAL_CHANNEL_ID || /* add/remove a valid channel */
 	    add /* resume to add all channels back */) {
-		u32 max_entries = f->num_runlist_entries;
-		u32 *runlist_end;
+		u32 num_entries;
 
-		runlist_end = gk20a_runlist_construct_locked(f,
+		num_entries = nvgpu_runlist_construct_locked(f,
 				runlist,
-				0,
-				runlist_entry_base,
-				g->runlist_interleave,
-				true,
-				&max_entries);
-		if (runlist_end == NULL) {
+				new_buf,
+				f->num_runlist_entries);
+		if (num_entries == RUNLIST_APPEND_FAILURE) {
 			ret = -E2BIG;
 			goto clean_up;
 		}
-		runlist->count = (runlist_end - runlist_entry_base) /
-			runlist_entry_words;
+		runlist->count = num_entries;
 		WARN_ON(runlist->count > f->num_runlist_entries);
 	} else {
 		/* suspend to remove all channels */
@@ -33,10 +33,10 @@ struct channel_gk20a;
 struct tsg_gk20a;
 
 enum {
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW = 0,
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM,
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH,
-	NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW = 0U,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM = 1U,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH = 2U,
+	NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS = 3U,
 };
 
 #define MAX_RUNLIST_BUFFERS 2
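Giving the levels explicit 0U..3U values documents the assumption that they are contiguous, which nvgpu_runlist_append_flat above relies on when it walks them in descending order with unsigned arithmetic. A standalone check of that arithmetic (illustrative only, not driver code):

#include <assert.h>
#include <stdio.h>

enum {
	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW = 0U,
	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM = 1U,
	NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH = 2U,
	NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS = 3U,
};

int main(void)
{
	unsigned int i;

	/* same descending walk as nvgpu_runlist_append_flat: 2, 1, 0 */
	for (i = 0U; i < NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS; i++) {
		unsigned int level = NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH - i;

		assert(level < NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS);
		printf("level %u\n", level);
	}
	return 0;
}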
@@ -406,13 +406,11 @@ int gk20a_fifo_setup_userd(struct channel_gk20a *c);
 u32 gk20a_fifo_pbdma_acquire_val(u64 timeout);
 
-
-u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
+#define RUNLIST_APPEND_FAILURE 0xffffffffU
+
+u32 nvgpu_runlist_construct_locked(struct fifo_gk20a *f,
 		struct fifo_runlist_info_gk20a *runlist,
-		u32 cur_level,
-		u32 *runlist_entry,
-		bool interleave_enabled,
-		bool prev_empty,
-		u32 *entries_left);
+		u32 buf_id,
+		u32 max_entries);
 void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
 		u32 count, u32 buffer_index);
 int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id);