gpu: nvgpu: create sync_fence only if needed

Currently, we create a sync_fence (from nvhost_sync_create_fence()) for every submit, but not all submits request a sync_fence. Also, the nvhost_sync_create_fence() API takes about 1/3rd of the total submit path time. Hence, to optimize, we allocate a sync_fence only when the user explicitly asks for it using (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET && NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE). Also, in the CDE path from gk20a_prepare_compressible_read(), we reuse the existing fence stored in "state", and that can result in not returning a sync_fence_fd when the user asked for it. Hence, force allocation of a sync_fence when the job submission comes from the CDE path. Bug 200141116 Change-Id: Ia921701bf0e2432d6b8a5e8b7d91160e7f52db1e Signed-off-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-on: http://git-master/r/812845 (cherry picked from commit 5fd47015eeed00352cc8473eff969a66c94fee98) Reviewed-on: http://git-master/r/837662 Reviewed-by: Automatic_Commit_Validation_User GVS: Gerrit_Virtual_Submit Reviewed-by: Sachin Nikam <snikam@nvidia.com>
2025-12-24 02:22:34 +03:00 · 2015-10-07 16:20:07 +05:30
parent 937de14907
commit 52753b51f1
7 changed files with 54 additions and 29 deletions
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1830,7 +1830,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 num_entries,
 				u32 flags,
 				struct nvgpu_fence *fence,
-				struct gk20a_fence **fence_out)
+				struct gk20a_fence **fence_out,
+				bool force_need_sync_fence)
 {
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
@@ -1848,6 +1849,14 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	struct nvgpu_gpfifo *gpfifo_mem = c->gpfifo.mem.cpu_va;
 	bool skip_buffer_refcounting = (flags &
 			NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
+	bool need_sync_fence = false;
+
+	/*
+	 * If user wants to allocate sync_fence_fd always, then respect that;
+	 * otherwise, allocate sync_fence_fd based on user flags only
+	 */
+	if (force_need_sync_fence)
+		need_sync_fence = true;

 	if (c->has_timedout)
 		return -ETIMEDOUT;
@@ -1970,15 +1979,18 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		goto clean_up;
 	}

+	if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
+			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
+		need_sync_fence = true;

 	/* always insert syncpt increment at end of gpfifo submission
 	   to keep track of method completion for idle railgating */
 	if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
 		err = c->sync->incr_user(c->sync, wait_fence_fd, &incr_cmd,
-					 &post_fence, need_wfi);
+				 &post_fence, need_wfi, need_sync_fence);
 	else
 		err = c->sync->incr(c->sync, &incr_cmd,
-				    &post_fence);
+				    &post_fence, need_sync_fence);
 	if (err) {
 		mutex_unlock(&c->submit_lock);
 		goto clean_up;
@@ -2578,7 +2590,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(

 	ret = gk20a_submit_channel_gpfifo(ch, NULL, args, args->num_entries,
 					  args->flags, &args->fence,
-					  &fence_out);
+					  &fence_out, false);

 	if (ret)
 		goto clean_up;