gpu: nvgpu: make deferred clean-up conditional

This change makes the invocation of the deferred job clean-up
mechanism conditional. For submissions that require job tracking,
deferred clean-up is required only if at least one of the following
conditions holds:

1) The channel's deterministic flag is not set
2) Rail-gating is enabled
3) The channel watchdog (WDT) is enabled
4) Buffer refcounting is enabled
5) The submit depends on the Sync Framework

If deferred clean-up is not needed, we instead clean up a single
job-tracking resource in the submit path. For deterministic channels,
we do not allow deferred clean-up to occur and fail any submit that
would require it (see the sketch following the sign-offs below).

Bug 1795076

Change-Id: I4021dffe8a71aa58f12db6b58518d3f4021f3313
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1220920
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
(cherry picked from commit b09f7589d5ad3c496e7350f1ed583a4fe2db574a)
Reviewed-on: http://git-master/r/1223941
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
6 changed files with 119 additions and 26 deletions
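
For reference, the clean-up decision described above can be modeled in
isolation. The following is a minimal stand-alone sketch, not driver
code: the struct, harness, and example values are hypothetical, with
field names mirroring the channel/platform state consulted in the diff
below.

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for the channel/platform state consulted in
 * gk20a_submit_channel_gpfifo(); field names mirror the diff below.
 */
struct submit_state {
        bool deterministic;             /* ..._EX_FLAGS_DETERMINISTIC set */
        bool need_sync_framework;       /* fences depend on Sync Framework */
        bool wdt_enabled;               /* channel watchdog enabled */
        bool can_railgate;              /* platform may rail-gate the GPU */
        bool skip_buffer_refcounting;   /* ..._SKIP_BUFFER_REFCOUNTING passed */
};

/* Deferred clean-up is needed if any one of the five conditions holds. */
static bool need_deferred_cleanup(const struct submit_state *s)
{
        return !s->deterministic ||
               s->need_sync_framework ||
               s->wdt_enabled ||
               s->can_railgate ||
               !s->skip_buffer_refcounting;
}

int main(void)
{
        struct submit_state s = {
                .deterministic = true,
                .need_sync_framework = false,
                .wdt_enabled = false,
                .can_railgate = false,
                .skip_buffer_refcounting = true,
        };

        /*
         * A deterministic channel whose submit would still need deferred
         * clean-up is rejected with -EINVAL; the configuration above
         * instead cleans up a single job inline in the submit path.
         */
        printf("deferred clean-up needed: %s\n",
               need_deferred_cleanup(&s) ? "yes" : "no");
        return 0;
}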


@@ -76,7 +76,8 @@ static void gk20a_free_error_notifiers(struct channel_gk20a *ch);
 static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
-static void gk20a_channel_clean_up_jobs(struct work_struct *work);
+static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
+                                bool clean_all);
 static void gk20a_channel_cancel_job_clean_up(struct channel_gk20a *c,
                                 bool wait_for_completion);
@@ -1029,6 +1030,7 @@ unbind:
         g->ops.fifo.free_inst(g, ch);
         ch->vpr = false;
+        ch->deterministic = false;
         ch->vm = NULL;
 
         WARN_ON(ch->sync);
@@ -1703,9 +1705,12 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
         gpfifo_size = args->num_entries;
 
-        if (args->flags & NVGPU_ALLOC_GPFIFO_FLAGS_VPR_ENABLED)
+        if (args->flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED)
                 c->vpr = true;
 
+        if (args->flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC)
+                c->deterministic = true;
+
         /* an address space needs to have been bound at this point. */
         if (!gk20a_channel_as_bound(c)) {
                 gk20a_err(d,
@@ -2173,10 +2178,17 @@ err_put_vm:
         return err;
 }
 
-static void gk20a_channel_clean_up_jobs(struct work_struct *work)
+static void gk20a_channel_clean_up_runcb_fn(struct work_struct *work)
+{
+        struct channel_gk20a *c = container_of(to_delayed_work(work),
+                        struct channel_gk20a, clean_up.wq);
+
+        gk20a_channel_clean_up_jobs(c, true);
+}
+
+static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
+                                        bool clean_all)
 {
         struct vm_gk20a *vm;
         struct channel_gk20a_job *job;
         struct gk20a_platform *platform;
@@ -2273,6 +2285,9 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
                 channel_gk20a_free_job(c, job);
                 job_finished = 1;
                 gk20a_idle(g->dev);
+
+                if (!clean_all)
+                        break;
         }
 
         if (job_finished && c->update_fn)
@@ -2419,6 +2434,7 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
                                       struct gk20a_fence **pre_fence,
                                       struct gk20a_fence **post_fence,
                                       bool force_need_sync_fence,
+                                      bool register_irq,
                                       u32 flags)
 {
         struct gk20a *g = c->g;
@@ -2515,10 +2531,12 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
         if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
                 err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
-                                job->post_fence, need_wfi, need_sync_fence);
+                                job->post_fence, need_wfi, need_sync_fence,
+                                register_irq);
         else
                 err = c->sync->incr(c->sync, job->incr_cmd,
-                                job->post_fence, need_sync_fence);
+                                job->post_fence, need_sync_fence,
+                                register_irq);
 
         if (!err) {
                 *incr_cmd = job->incr_cmd;
                 *post_fence = job->post_fence;
@@ -2568,6 +2586,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                         NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
         int err = 0;
         bool need_job_tracking;
+        bool need_deferred_cleanup = false;
         struct nvgpu_gpfifo __user *user_gpfifo = args ?
                         (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
         struct gk20a_platform *platform = gk20a_get_platform(d);
@@ -2626,13 +2645,48 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                         !skip_buffer_refcounting;
 
         if (need_job_tracking) {
+                bool need_sync_framework = false;
+
                 /*
-                 * If the submit is to have deterministic latency and
+                 * If the channel is to have deterministic latency and
                  * job tracking is required, the channel must have
                  * pre-allocated resources. Otherwise, we fail the submit here
                  */
-                if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_DETERMINISTIC) &&
-                                !channel_gk20a_is_prealloc_enabled(c))
+                if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
                         return -EINVAL;
 
+                need_sync_framework = force_need_sync_fence ||
+                        gk20a_channel_sync_needs_sync_framework(c) ||
+                        (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
+                         (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
+                          flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
+
+                /*
+                 * Deferred clean-up is necessary for any of the following
+                 * conditions:
+                 * - channel's deterministic flag is not set
+                 * - dependency on sync framework, which could make the
+                 *   behavior of the clean-up operation non-deterministic
+                 *   (should not be performed in the submit path)
+                 * - channel wdt
+                 * - GPU rail-gating
+                 * - buffer refcounting
+                 *
+                 * If none of the conditions are met, then deferred clean-up
+                 * is not required, and we clean-up one job-tracking
+                 * resource in the submit path.
+                 */
+                need_deferred_cleanup = !c->deterministic ||
+                                        need_sync_framework ||
+                                        c->wdt_enabled ||
+                                        platform->can_railgate ||
+                                        !skip_buffer_refcounting;
+
+                /*
+                 * For deterministic channels, we don't allow deferred clean_up
+                 * processing to occur. In cases we hit this, we fail the submit
+                 */
+                if (c->deterministic && need_deferred_cleanup)
+                        return -EINVAL;
+
                 /* gk20a_channel_update releases this ref. */
@@ -2641,6 +2695,11 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                         gk20a_err(d, "failed to host gk20a to submit gpfifo");
                         return err;
                 }
+
+                if (!need_deferred_cleanup) {
+                        /* clean up a single job */
+                        gk20a_channel_clean_up_jobs(c, false);
+                }
         }
 
         trace_gk20a_channel_submit_gpfifo(dev_name(c->g->dev),
@@ -2678,7 +2737,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
                 err = gk20a_submit_prepare_syncs(c, fence, job,
                                                  &wait_cmd, &incr_cmd,
                                                  &pre_fence, &post_fence,
-                                                 force_need_sync_fence, flags);
+                                                 force_need_sync_fence,
+                                                 need_deferred_cleanup,
+                                                 flags);
                 if (err)
                         goto clean_up_job;
         }
@@ -2727,7 +2788,7 @@ clean_up:
         gk20a_dbg_fn("fail");
         gk20a_fence_put(pre_fence);
         gk20a_fence_put(post_fence);
-        if (need_job_tracking)
+        if (need_deferred_cleanup)
                 gk20a_idle(g->dev);
         return err;
 }
@@ -2749,7 +2810,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
         raw_spin_lock_init(&c->timeout.lock);
         mutex_init(&c->sync_lock);
         INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
-        INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_jobs);
+        INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_runcb_fn);
         mutex_init(&c->clean_up.lock);
         INIT_LIST_HEAD(&c->joblist.dynamic.jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
@@ -3416,10 +3477,10 @@ long gk20a_channel_ioctl(struct file *filp,
                 if (!is_power_of_2(alloc_gpfifo_ex_args->num_entries)) {
                         err = -EINVAL;
                         gk20a_idle(dev);
                         break;
                 }
-                err = gk20a_alloc_channel_gpfifo(ch,
-                                (struct nvgpu_alloc_gpfifo_ex_args *)buf);
+                err = gk20a_alloc_channel_gpfifo(ch, alloc_gpfifo_ex_args);
                 gk20a_idle(dev);
                 break;
         }


@@ -131,7 +131,7 @@ struct channel_gk20a {
         bool bound;
         bool first_init;
         bool vpr;
-        bool no_block;
+        bool deterministic;
         bool cde;
         pid_t pid;
         pid_t tgid;


@@ -285,13 +285,14 @@ static int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
 static int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
                                      struct priv_cmd_entry *entry,
                                      struct gk20a_fence *fence,
-                                     bool need_sync_fence)
+                                     bool need_sync_fence,
+                                     bool register_irq)
 {
         /* Don't put wfi cmd to this one since we're not returning
          * a fence to user space. */
         return __gk20a_channel_syncpt_incr(s,
                         false /* no wfi */,
-                        true /* register irq */,
+                        register_irq /* register irq */,
                         entry, fence, need_sync_fence);
 }
@@ -300,13 +301,14 @@ static int gk20a_channel_syncpt_incr_user(struct gk20a_channel_sync *s,
                                           struct priv_cmd_entry *entry,
                                           struct gk20a_fence *fence,
                                           bool wfi,
-                                          bool need_sync_fence)
+                                          bool need_sync_fence,
+                                          bool register_irq)
 {
         /* Need to do 'wfi + host incr' since we return the fence
          * to user space. */
         return __gk20a_channel_syncpt_incr(s,
                         wfi,
-                        true /* register irq */,
+                        register_irq /* register irq */,
                         entry, fence, need_sync_fence);
 }
@@ -756,7 +758,8 @@ static int gk20a_channel_semaphore_incr(
                 struct gk20a_channel_sync *s,
                 struct priv_cmd_entry *entry,
                 struct gk20a_fence *fence,
-                bool need_sync_fence)
+                bool need_sync_fence,
+                bool register_irq)
 {
         /* Don't put wfi cmd to this one since we're not returning
          * a fence to user space. */
@@ -772,7 +775,8 @@ static int gk20a_channel_semaphore_incr_user(
                 struct priv_cmd_entry *entry,
                 struct gk20a_fence *fence,
                 bool wfi,
-                bool need_sync_fence)
+                bool need_sync_fence,
+                bool register_irq)
 {
 #ifdef CONFIG_SYNC
         struct sync_fence *dependency = NULL;
@@ -889,3 +893,12 @@ struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c)
 #endif
         return gk20a_channel_semaphore_create(c);
 }
+
+bool gk20a_channel_sync_needs_sync_framework(struct channel_gk20a *c)
+{
+#ifdef CONFIG_TEGRA_GK20A
+        if (gk20a_platform_has_syncpoints(c->g->dev))
+                return false;
+#endif
+        return true;
+}


@@ -57,7 +57,8 @@ struct gk20a_channel_sync {
         int (*incr)(struct gk20a_channel_sync *s,
                     struct priv_cmd_entry *entry,
                     struct gk20a_fence *fence,
-                    bool need_sync_fence);
+                    bool need_sync_fence,
+                    bool register_irq);
 
         /* Increment syncpoint/semaphore, preceded by a wfi.
          * Returns
@@ -80,7 +81,8 @@ struct gk20a_channel_sync {
                          struct priv_cmd_entry *entry,
                          struct gk20a_fence *fence,
                          bool wfi,
-                         bool need_sync_fence);
+                         bool need_sync_fence,
+                         bool register_irq);
 
         /* Reset the channel syncpoint/semaphore. */
         void (*set_min_eq_max)(struct gk20a_channel_sync *s);
@@ -99,5 +101,6 @@ struct gk20a_channel_sync {
 void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync);
 struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c);
+bool gk20a_channel_sync_needs_sync_framework(struct channel_gk20a *c);
 
 #endif


@@ -272,8 +272,24 @@ static int gk20a_syncpt_fence_wait(struct gk20a_fence *f, long timeout)
 static bool gk20a_syncpt_fence_is_expired(struct gk20a_fence *f)
 {
-        return nvhost_syncpt_is_expired_ext(f->host1x_pdev, f->syncpt_id,
-                                            f->syncpt_value);
+
+        /*
+         * In cases we don't register a notifier, we can't expect the
+         * syncpt value to be updated. For this case, we force a read
+         * of the value from HW, and then check for expiration.
+         */
+        if (!nvhost_syncpt_is_expired_ext(f->host1x_pdev, f->syncpt_id,
+                                f->syncpt_value)) {
+                u32 val;
+
+                if (!nvhost_syncpt_read_ext_check(f->host1x_pdev,
+                                f->syncpt_id, &val)) {
+                        return nvhost_syncpt_is_expired_ext(f->host1x_pdev,
+                                        f->syncpt_id, f->syncpt_value);
+                }
+        }
+
+        return true;
 }
 
 static const struct gk20a_fence_ops gk20a_syncpt_fence_ops = {


@@ -968,6 +968,8 @@ struct nvgpu_alloc_gpfifo_args {
 struct nvgpu_alloc_gpfifo_ex_args {
         __u32 num_entries;
         __u32 num_inflight_jobs;
+#define NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED         (1 << 0) /* set owner channel of this gpfifo as a vpr channel */
+#define NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC       (1 << 1) /* channel shall exhibit deterministic behavior in the submit path */
         __u32 flags;
         __u32 reserved[5];
 };
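
For context, user space opts a channel into deterministic mode at
gpfifo allocation time using the new flag above. The following is a
sketch only: it assumes an already-open channel fd, assumes the
NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX ioctl (defined elsewhere in this
header, not shown in the diff) takes this struct, and uses arbitrary
example entry counts.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

/* Sketch: request a deterministic channel on an open channel fd. */
static int alloc_deterministic_gpfifo(int ch_fd)
{
        struct nvgpu_alloc_gpfifo_ex_args args;

        memset(&args, 0, sizeof(args));
        args.num_entries = 128;         /* must be a power of 2 */
        args.num_inflight_jobs = 16;    /* pre-allocates the job-tracking
                                         * resources that deterministic
                                         * submits require */
        args.flags = NVGPU_ALLOC_GPFIFO_EX_FLAGS_DETERMINISTIC;

        return ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX, &args);
}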
@@ -994,8 +996,6 @@ struct nvgpu_fence {
 #define NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI          (1 << 4)
 /* skip buffer refcounting during submit */
 #define NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING       (1 << 5)
-/* submit should exhibit deterministic latency */
-#define NVGPU_SUBMIT_GPFIFO_FLAGS_DETERMINISTIC (1 << 6)
 
 struct nvgpu_submit_gpfifo_args {
         __u64 gpfifo;