gpu: nvgpu: acquire tsg ctx_init_lock when changing ctx state

The GR context associated with a channel is updated in various driver paths. The sequence for doing so is: disable the TSG, preempt the TSG, update the GR context or instance block, and then re-enable the TSG. These operations, as well as runlist updates for the channel, have to be done under the TSG-specific ctx_init_lock to avoid races. suspend_contexts and resume_contexts need special handling, which is not covered in this patch. Bug 3677982 Change-Id: I837257fe9d9ef3eb6f69f5d7e0707e0bb6d4ea72 Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2720222 Reviewed-by: Scott Long <scottl@nvidia.com> Reviewed-by: Ankur Kishore <ankkishore@nvidia.com> GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
2025-12-24 10:34:43 +03:00 · 2022-05-30 11:17:17 +05:30
parent ef99d9f010
commit f1896e0a64
5 changed files with 66 additions and 14 deletions
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -1963,27 +1963,33 @@ static int nvgpu_dbg_gpu_ioctl_get_gr_context(struct dbg_session_gk20a *dbg_s,
 		return -EINVAL;
 	}

+	nvgpu_mutex_acquire(&tsg->ctx_init_lock);
+
 	ctx_mem = nvgpu_gr_ctx_get_ctx_mem(tsg->gr_ctx, NVGPU_GR_CTX_CTX);
 	if (ctx_mem == NULL || !nvgpu_mem_is_valid(ctx_mem)) {
 		nvgpu_err(g, "invalid context mem");
-		return -EINVAL;
+		err = -EINVAL;
+		goto out;
 	}

 	if (ctx_mem->size > (u64)UINT_MAX) {
 		nvgpu_err(ch->g, "ctx size is larger than expected");
-		return -EINVAL;
+		err = -EINVAL;
+		goto out;
 	}

 	/* Check if the input buffer size equals the gr context size */
 	size = (u32)ctx_mem->size;
 	if (args->size != size) {
 		nvgpu_err(g, "size mismatch: %d != %d", args->size, size);
-		return -EINVAL;
+		err = -EINVAL;
+		goto out;
 	}

 	if (nvgpu_channel_disable_tsg(g, ch) != 0) {
 		nvgpu_err(g, "failed to disable channel/TSG");
-		return -EINVAL;
+		err = -EINVAL;
+		goto out;
 	}

 	err = nvgpu_preempt_channel(g, ch);
@@ -1998,9 +2004,13 @@ done:
 	enable_err = nvgpu_channel_enable_tsg(g, ch);
 	if (enable_err != 0) {
 		nvgpu_err(g, "failed to re-enable channel/TSG");
+		nvgpu_mutex_release(&tsg->ctx_init_lock);
 		return (err != 0) ? err : enable_err;
 	}

+out:
+	nvgpu_mutex_release(&tsg->ctx_init_lock);
+
 	return err;
 }