gpu: nvgpu: decouple async and immediate cleanup

Split up nvgpu_channel_clean_up_jobs() on the clean_all parameter so
that there's one version for the asynchronous ("deferred") cleanup and
another for the synchronous deterministic cleanup that occurs in the
submit path.

Forking another version like this adds some repetition, but it lets us
look at both versions clearly in order to come up with a coherent plan.
For example, it might be feasible to do the light cleanup of pooled
items in the nondeterministic path as well, and to defer the heavy
cleanup to another, entirely separate job queue.
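
A minimal, self-contained sketch of the resulting shape (toy types and
names for illustration only, not the nvgpu implementation): the single
entry point taking a clean_all flag becomes two entry points, one per
caller.

    #include <stdio.h>

    struct toy_channel {
            int completed_jobs;   /* jobs whose post fence has expired */
    };

    /* Async/worker path: drain every completed job. */
    static void toy_clean_up_jobs(struct toy_channel *c)
    {
            while (c->completed_jobs > 0) {
                    c->completed_jobs--;
                    printf("freed one job (async path)\n");
            }
    }

    /* Deterministic submit path: free at most one job slot. */
    static void toy_clean_up_deterministic_job(struct toy_channel *c)
    {
            if (c->completed_jobs > 0) {
                    c->completed_jobs--;
                    printf("freed one job (submit path)\n");
            }
    }

    int main(void)
    {
            struct toy_channel ch = { .completed_jobs = 2 };

            toy_clean_up_jobs(&ch);               /* was clean_up(ch, true)  */
            toy_clean_up_deterministic_job(&ch);  /* was clean_up(ch, false) */
            return 0;
    }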

Jira NVGPU-5493

Change-Id: I5423fd474e5b8f7b273383f12302126f47076bd3
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2346065
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Konsta Hölttä
Date: 2020-08-05 10:04:25 +03:00
Committed by: Alex Waterman
Parent: 3f81f1952d
Commit: baaf25f8b0
5 changed files with 91 additions and 38 deletions


@@ -202,12 +202,22 @@ void nvgpu_channel_abort_clean_up(struct nvgpu_channel *ch)
nvgpu_mutex_release(&ch->joblist.cleanup_lock);
/* The update to flush the job queue is only needed to process
* nondeterministic resources and ch wdt timeouts. Any others are
* either nonexistent or preallocated from pools that can be killed in
* one go on deterministic channels; take a look at what would happen
* in nvgpu_channel_clean_up_deterministic_job() and what
* nvgpu_submit_deterministic() requires.
*/
if (!nvgpu_channel_is_deterministic(ch)) {
/*
* When closing the channel, this scheduled update holds one ref which
* is waited for before advancing with freeing.
* When closing the channel, this scheduled update holds one
* channel ref which is waited for before advancing with
* freeing.
*/
nvgpu_channel_update(ch);
}
}
static void channel_kernelmode_deinit(struct nvgpu_channel *ch)
{
@@ -529,7 +539,7 @@ static void nvgpu_channel_worker_poll_wakeup_process_item(
nvgpu_log_fn(ch->g, " ");
nvgpu_channel_clean_up_jobs(ch, true);
nvgpu_channel_clean_up_jobs(ch);
/* ref taken when enqueued */
nvgpu_channel_put(ch);
@@ -645,16 +655,14 @@ err_put_buffers:
/**
* Clean up job resources for further jobs to use.
* @clean_all: If true, process as many jobs as possible, otherwise just one.
*
* Loop all jobs from the joblist until a pending job is found, or just one if
* clean_all is not set. Pending jobs are detected from the job's post fence,
* so this is only done for jobs that have job tracking resources. Free all
* per-job memory for completed jobs; in case of preallocated resources, this
* opens up slots for new jobs to be submitted.
* Loop all jobs from the joblist until a pending job is found. Pending jobs
* are detected from the job's post fence, so this is only done for jobs that
* have job tracking resources. Free all per-job memory for completed jobs; in
* case of preallocated resources, this opens up slots for new jobs to be
* submitted.
*/
void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
bool clean_all)
void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c)
{
struct vm_gk20a *vm;
struct nvgpu_channel_job *job;
@@ -669,13 +677,9 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
vm = c->vm;
g = c->g;
/*
* If !clean_all, we're in a condition where watchdog isn't supported
* anyway (this would be a no-op).
*/
if (clean_all) {
nvgpu_assert(!nvgpu_channel_is_deterministic(c));
watchdog_on = nvgpu_channel_wdt_stop(c->wdt);
}
/* Synchronize with abort cleanup that needs the jobs. */
nvgpu_mutex_acquire(&c->joblist.cleanup_lock);
@@ -704,7 +708,7 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
* this - in that case, this is a no-op and the new
* later timeout is still used.
*/
if (clean_all && watchdog_on) {
if (watchdog_on) {
nvgpu_channel_wdt_continue(c->wdt);
}
break;
@@ -738,8 +742,7 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
nvgpu_fence_put(&job->post_fence);
/*
* Free the private command buffers (wait_cmd first and
* then incr_cmd i.e. order of allocation)
* Free the private command buffers (in order of allocation)
*/
if (job->wait_cmd != NULL) {
nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->wait_cmd);
@@ -754,20 +757,10 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
job_finished = true;
/*
* Deterministic channels have a channel-wide power reference;
* for others, there's one per submit.
*/
if (!nvgpu_channel_is_deterministic(c)) {
/* taken in nvgpu_submit_nondeterministic() */
gk20a_idle(g);
}
if (!clean_all) {
/* Timeout isn't supported here so don't touch it. */
break;
}
}
nvgpu_mutex_release(&c->joblist.cleanup_lock);
if ((job_finished) &&
@@ -776,6 +769,61 @@ void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
}
}
/**
* Clean up one job if any to provide space for a new submit.
*
* Deterministic channels do very little in the submit path, so the cleanup
* code does not do much either. This assumes the preconditions that
* deterministic channels are missing features such as timeouts and mapped
* buffers.
*/
void nvgpu_channel_clean_up_deterministic_job(struct nvgpu_channel *c)
{
struct nvgpu_channel_job *job;
nvgpu_assert(nvgpu_channel_is_deterministic(c));
/* Synchronize with abort cleanup that needs the jobs. */
nvgpu_mutex_acquire(&c->joblist.cleanup_lock);
nvgpu_channel_joblist_lock(c);
if (nvgpu_channel_joblist_is_empty(c)) {
nvgpu_channel_joblist_unlock(c);
goto out_unlock;
}
job = channel_joblist_peek(c);
nvgpu_channel_joblist_unlock(c);
nvgpu_assert(job->num_mapped_buffers == 0U);
if (!nvgpu_fence_is_expired(&job->post_fence)) {
goto out_unlock;
}
/*
* This fence is syncpoint-based, so cleanup doesn't do anything. Put
* the ref back for consistency though.
*/
nvgpu_fence_put(&job->post_fence);
/*
* Free the private command buffers (in order of allocation)
*/
if (job->wait_cmd != NULL) {
nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->wait_cmd);
}
nvgpu_priv_cmdbuf_free(c->priv_cmd_q, job->incr_cmd);
nvgpu_channel_free_job(c, job);
nvgpu_channel_joblist_lock(c);
channel_joblist_delete(c, job);
nvgpu_channel_joblist_unlock(c);
out_unlock:
nvgpu_mutex_release(&c->joblist.cleanup_lock);
}
/**
* Schedule a job cleanup work on this channel to free resources and to signal
* about completion.


@@ -584,7 +584,7 @@ static int nvgpu_submit_deterministic(struct nvgpu_channel *c,
* though - the hw GP_GET pointer can be much further away than
* our metadata pointers; gpfifo space is "freed" by the HW.
*/
nvgpu_channel_clean_up_jobs(c, true);
nvgpu_channel_clean_up_deterministic_job(c);
}
/* Grab access to HW to deal with do_idle */


@@ -563,8 +563,8 @@ u32 nvgpu_channel_get_gpfifo_free_count(struct nvgpu_channel *ch);
int nvgpu_channel_add_job(struct nvgpu_channel *c,
struct nvgpu_channel_job *job,
bool skip_buffer_refcounting);
void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c,
bool clean_all);
void nvgpu_channel_clean_up_jobs(struct nvgpu_channel *c);
void nvgpu_channel_clean_up_deterministic_job(struct nvgpu_channel *c);
int nvgpu_submit_channel_gpfifo_user(struct nvgpu_channel *c,
struct nvgpu_gpfifo_userdata userdata,
u32 num_entries,


@@ -1376,6 +1376,10 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
goto err_setup_bind;
}
/*
* Note that this cannot be deterministic because of the job completion
* callbacks that aren't delivered for deterministic channels.
*/
setup_bind_args.num_gpfifo_entries = 1024;
setup_bind_args.num_inflight_jobs = 0;
setup_bind_args.flags = 0;


@@ -98,6 +98,7 @@ u32 nvgpu_submit_gpfifo_user_flags_to_common_flags(u32 user_flags);
int nvgpu_channel_init_support_linux(struct nvgpu_os_linux *l);
void nvgpu_channel_remove_support_linux(struct nvgpu_os_linux *l);
/* Deprecated. Use fences in new code. */
struct nvgpu_channel *gk20a_open_new_channel_with_cb(struct gk20a *g,
void (*update_fn)(struct nvgpu_channel *, void *),
void *update_fn_data,