gpu: nvgpu: free VEID if the channel is closed

In case of a process crash or forceful closure of its channels, userspace
may never get a chance to release the VEID. Creating further subcontexts
may then become impossible.

Hence, when a channel is closed forcibly (Linux), release the VEID on
closure of the last channel in the subcontext.
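
Conceptually, the forced-unbind path now looks like the sketch below
(condensed from the nvgpu_tsg_subctx_unbind_channel() hunk in the diff
further down; channel_subctx_of() is a hypothetical accessor used only for
illustration, and locking/HW teardown are omitted):

/*
 * Illustrative sketch only, not the exact driver code.
 * channel_subctx_of() is hypothetical; locking and HW teardown are omitted.
 */
static void unbind_channel_sketch(struct nvgpu_tsg *tsg,
				  struct nvgpu_channel *ch, bool force)
{
	struct nvgpu_tsg_subctx *subctx = channel_subctx_of(ch);

	/* Drop the channel from its subcontext's channel list. */
	nvgpu_list_del(&ch->subctx_entry);

	if (nvgpu_list_empty(&subctx->ch_list) && force) {
		/*
		 * The last channel of the subcontext went away on a forced
		 * close, so the kernel frees the VEID instead of waiting for
		 * a userspace delete call that may never come.
		 */
		nvgpu_tsg_delete_subcontext(tsg->g, tsg, ch->subctx_id);
	}
}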

With this change, a normal channel close on Linux will not release the VEID,
whereas on QNX it will. The delete-subcontext devctl call on QNX therefore
becomes a no-op in the normal case, so the error print is downgraded to an
informational log and the error return is changed to success.
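
As a result, nvgpu_tsg_delete_subcontext() now treats a VEID that is already
free as a no-op. A minimal sketch of the VEID 0 (sync) case, condensed from
the first hunk in the diff below:

/* Sketch of the sync-VEID case only; the async-VEID case is analogous. */
static void delete_sync_veid_sketch(struct gk20a *g, struct nvgpu_tsg *tsg)
{
	nvgpu_mutex_acquire(&tsg->veid_alloc_lock);
	if (!tsg->sync_veid) {
		/*
		 * The forced channel close already released this VEID:
		 * just log it and return, instead of returning -EINVAL.
		 */
		nvgpu_log_info(g, "VEID 0 not allocated");
	} else {
		tsg->sync_veid = false;
		tsg->subctx_vms[0] = NULL;
	}
	nvgpu_mutex_release(&tsg->veid_alloc_lock);
}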

Also added a check in the subcontext delete ioctl function that all channels
are unbound before the subcontext is deleted. This ensures that channels do
not refer to a dangling subcontext pointer.
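
The ioctl path thus reduces to a validation wrapper, roughly as sketched
below (mirroring the new nvgpu_tsg_user_delete_subcontext() added in the
diff; error prints are omitted for brevity):

/* Sketch of the ioctl-side wrapper, not the exact driver code. */
static int user_delete_subcontext_sketch(struct gk20a *g, struct nvgpu_tsg *tsg,
					 u32 max_subctx_count, u32 veid)
{
	if (veid >= max_subctx_count) {
		/* VEID out of range for this GPU instance. */
		return -EINVAL;
	}
	if (nvgpu_tsg_subctx_has_channels_bound(tsg, veid)) {
		/*
		 * Refuse to free the VEID while channels still reference the
		 * subcontext; otherwise they would hold a dangling pointer.
		 */
		return -EINVAL;
	}
	nvgpu_tsg_delete_subcontext(g, tsg, veid);
	return 0;
}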

Bug 3979886

Change-Id: I434944b01740720011abce3664394ae8cb0d4e2e
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2858060
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
Author: Sagar Kamble
Date: 2023-02-14 18:43:01 +05:30
Committed by: mobile promotions
Parent: 53dc53a8b4
Commit: a5640d61bd
6 changed files with 107 additions and 30 deletions


@@ -317,30 +317,25 @@ int nvgpu_tsg_create_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
 	return 0;
 }
-int nvgpu_tsg_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
-			u32 max_subctx_count, u32 veid)
+void nvgpu_tsg_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
+			u32 veid)
 {
-	if (veid >= max_subctx_count) {
-		nvgpu_err(g, "Invalid VEID specified %u", veid);
-		return -EINVAL;
-	}
 	nvgpu_mutex_acquire(&tsg->veid_alloc_lock);
 	if (veid == 0U) {
 		if (!tsg->sync_veid) {
-			nvgpu_err(g, "VEID 0 not allocated");
+			nvgpu_log_info(g, "VEID 0 not allocated");
 			nvgpu_mutex_release(&tsg->veid_alloc_lock);
-			return -EINVAL;
+			return;
 		}
 		tsg->sync_veid = false;
 		tsg->subctx_vms[veid] = NULL;
 	} else {
 		if (!nvgpu_test_bit(veid - MAX_SYNC_SUBCONTEXTS, tsg->async_veids)) {
-			nvgpu_err(g, "VEID %u not allocated", veid);
+			nvgpu_log_info(g, "VEID %u not allocated", veid);
 			nvgpu_mutex_release(&tsg->veid_alloc_lock);
-			return -EINVAL;
+			return;
 		}
 		nvgpu_clear_bit(veid - MAX_SYNC_SUBCONTEXTS, tsg->async_veids);
 		tsg->subctx_vms[veid - MAX_SYNC_SUBCONTEXTS] = NULL;
@@ -349,6 +344,23 @@ int nvgpu_tsg_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
 	nvgpu_mutex_release(&tsg->veid_alloc_lock);
 	nvgpu_log_info(g, "Freed VEID %u", veid);
 }
+int nvgpu_tsg_user_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
+			u32 max_subctx_count, u32 veid)
+{
+	if (veid >= max_subctx_count) {
+		nvgpu_err(g, "Invalid VEID specified %u", veid);
+		return -EINVAL;
+	}
+	if (nvgpu_tsg_subctx_has_channels_bound(tsg, veid)) {
+		nvgpu_err(g, "Channels should be unbound before freeing the"
+			" subcontext");
+		return -EINVAL;
+	}
+	nvgpu_tsg_delete_subcontext(g, tsg, veid);
+	return 0;
+}
@@ -564,7 +576,7 @@ static bool nvgpu_tsg_is_multi_channel(struct nvgpu_tsg *tsg)
 }
 static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
-				struct nvgpu_channel *ch)
+				struct nvgpu_channel *ch, bool force)
 {
 	struct gk20a *g = ch->g;
 	int err;
@@ -635,7 +647,7 @@ static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
 	nvgpu_rwsem_down_write(&tsg->ch_list_lock);
 	if (!nvgpu_engine_is_multimedia_runlist_id(g, ch->runlist->id)) {
-		nvgpu_tsg_subctx_unbind_channel(tsg, ch);
+		nvgpu_tsg_subctx_unbind_channel(tsg, ch, force);
 	}
 	nvgpu_list_del(&ch->ch_entry);
 	tsg->ch_count = nvgpu_safe_sub_u32(tsg->ch_count, 1U);
@@ -688,7 +700,7 @@ int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch,
 	nvgpu_mutex_acquire(&tsg->ctx_init_lock);
 	if (!force) {
-		err = nvgpu_tsg_unbind_channel_common(tsg, ch);
+		err = nvgpu_tsg_unbind_channel_common(tsg, ch, force);
 		/* Let userspace retry the unbind if HW is busy. */
 		if (err == -EAGAIN) {
@@ -697,7 +709,7 @@ int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch,
 		}
 	} else {
 		do {
-			err = nvgpu_tsg_unbind_channel_common(tsg, ch);
+			err = nvgpu_tsg_unbind_channel_common(tsg, ch, force);
 			/*
 			 * Retry for few iterations if the HW is busy before failing unbind
 			 * if the channel is getting killed, otherwise it can lead to faults.
@@ -778,7 +790,7 @@ fail_common:
 	g->ops.channel.unbind(ch);
 	nvgpu_rwsem_down_write(&tsg->ch_list_lock);
-	nvgpu_tsg_subctx_unbind_channel(tsg, ch);
+	nvgpu_tsg_subctx_unbind_channel(tsg, ch, force);
 	nvgpu_list_del(&ch->ch_entry);
 	ch->tsgid = NVGPU_INVALID_TSG_ID;
 	tsg->ch_count = nvgpu_safe_sub_u32(tsg->ch_count, 1U);


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -55,6 +55,30 @@ static struct nvgpu_tsg_subctx *nvgpu_tsg_subctx_from_id(struct nvgpu_tsg *tsg,
 	return NULL;
 }
+bool nvgpu_tsg_subctx_has_channels_bound(struct nvgpu_tsg *tsg, u32 subctx_id)
+{
+	struct nvgpu_tsg_subctx *subctx = NULL;
+	struct gk20a *g = tsg->g;
+	bool chs_bound;
+	nvgpu_log(g, gpu_dbg_gr, " ");
+	nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+	subctx = nvgpu_tsg_subctx_from_id(tsg, subctx_id);
+	if (subctx == NULL) {
+		nvgpu_log_info(g, "Subctx %u not allocated", subctx_id);
+		nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+		return false;
+	}
+	chs_bound = !nvgpu_list_empty(&subctx->ch_list);
+	nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+	return chs_bound;
+}
 int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg,
 		struct nvgpu_channel *ch)
 {
@@ -103,7 +127,7 @@ add_ch_subctx:
 }
 void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg,
-			struct nvgpu_channel *ch)
+			struct nvgpu_channel *ch, bool force)
 {
 	struct nvgpu_tsg_subctx *subctx;
 	struct gk20a *g = tsg->g;
@@ -120,6 +144,10 @@ void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg,
 	nvgpu_list_del(&ch->subctx_entry);
 	if (nvgpu_list_empty(&subctx->ch_list)) {
+		if (force) {
+			nvgpu_tsg_delete_subcontext(g, tsg, ch->subctx_id);
+		}
 		if (g->ops.tsg.remove_subctx_channel_hw != NULL) {
 			g->ops.tsg.remove_subctx_channel_hw(ch);
 		}


@@ -353,19 +353,32 @@ int nvgpu_tsg_create_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
  *
  * @param g [in]		The GPU driver struct.
  * @param tsg [in]		Pointer to TSG struct.
- * @param max_subctx_count [in]	Maximum subcontexts supported for the
- *				gpu instance.
  * @param veid [in]		VEID to be freed.
  *
- * - Validate #veid. If invalid, return -EINVAL.
- * - Else free the VEID by resetting either #sync_veid or bit from #async_veids
-  *   if allocated. If not allocated, return -EINVAL.
+ * - Free the VEID by resetting either #sync_veid or bit from #async_veids
+ *   if allocated.
+ */
+void nvgpu_tsg_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
+		u32 veid);
+/**
+ * @brief Free subcontext VEID from a TSG from an ioctl.
+ *
+ * @param g [in]		The GPU driver struct.
+ * @param tsg [in]		Pointer to TSG struct.
+ * @param max_subctx_count [in]	Maximum supported subcontexts.
+ * @param veid [in]		VEID to be freed.
+ *
+ * - Validate the veid. If it is not less than max_subctx_count return -EINVAL.
+ * - If the TSG subcontext corresponding to veid has channels bound then return
+ *   error.
+ * - Call nvgpu_tsg_delete_subcontext.
+ *
  * @return 0 in case of success, < 0 in case of failure.
  * @retval -EINVAL if veid is invalid.
  */
-int nvgpu_tsg_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
-		u32 max_subctx_count, u32 veid);
+int nvgpu_tsg_user_delete_subcontext(struct gk20a *g, struct nvgpu_tsg *tsg,
+		u32 max_subctx_count, u32 veid);
 /**
  * @brief Mark sync subctx created if channel is opened with implicit subctx.


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -28,6 +28,27 @@ struct gk20a;
 struct nvgpu_tsg;
 struct nvgpu_tsg_subctx;
 struct nvgpu_channel;
+/**
+ * @brief Check if the TSG subcontext has channels bound to it.
+ *
+ * @param tsg [in]		Pointer to TSG struct.
+ * @param subctx_id [in]	Subcontext ID.
+ *
+ * - Loop through the #subctx_list in #tsg to check if the subctx
+ *   exists for the provided subctx_id.
+ * - If it exists, check if the channels list is empty or not and set
+ *   chs_bound accordingly.
+ * - Else return false.
+ *
+ * @return false if subcontext does not exist for supplied subctx_id.
+ * @return false if subcontext exists for supplied subctx_id and ch list
+ *         is empty.
+ * @return true if subcontext exists for supplied subctx_id and ch list
+ *         is not empty.
+ */
+bool nvgpu_tsg_subctx_has_channels_bound(struct nvgpu_tsg *tsg, u32 subctx_id);
 /**
  * @brief Bind a channel to the TSG subcontext.
  *
@@ -56,10 +77,12 @@ int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg,
  *
  * @param tsg [in]		Pointer to TSG struct.
  * @param ch [in]		Pointer to Channel struct.
+ * @param force [in]		Free the VEID if force is true.
  *
  * - Validate that #subctx is allocated for the channel #ch.
  * - Remove the channel from the subctx #ch_list.
  * - If the subctx #ch_list is empty
+ *   - Free the VEID corresponding to the channel if force is true.
  *   - Update the instance blocks of all channels to remove the
  *     subctx pdb.
  *   - Invoke g->ops.gr.setup.free_subctx to free the GR subcontext
@@ -70,7 +93,7 @@ int nvgpu_tsg_subctx_bind_channel(struct nvgpu_tsg *tsg,
  *   sequence: mappings -> gr_subctx -> tsg_subctx
  */
 void nvgpu_tsg_subctx_unbind_channel(struct nvgpu_tsg *tsg,
-		struct nvgpu_channel *ch);
+		struct nvgpu_channel *ch, bool force);
 /**
  * @brief Allocate GR subcontext for a TSG subcontext.


@@ -1079,7 +1079,8 @@ static int nvgpu_tsg_ioctl_delete_subcontext(struct gk20a *g,
 	max_subctx_count = nvgpu_grmgr_get_gpu_instance_max_veid_count(g, gpu_instance_id);
-	err = nvgpu_tsg_delete_subcontext(g, tsg, max_subctx_count, args->veid);
+	err = nvgpu_tsg_user_delete_subcontext(g, tsg, max_subctx_count,
+			args->veid);
 	if (err != 0) {
 		nvgpu_err(g, "Delete subcontext failed %d", err);
 	}


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -415,7 +415,7 @@ int test_gr_obj_ctx_error_injection(struct unit_module *m,
 	}
 	/* Cleanup */
-	nvgpu_tsg_subctx_unbind_channel(tsg, channel);
+	nvgpu_tsg_subctx_unbind_channel(tsg, channel, false);
 	nvgpu_gr_ctx_free(g, gr_ctx, global_desc);
 	nvgpu_free_gr_ctx_struct(g, gr_ctx);
 	nvgpu_gr_ctx_desc_free(g, desc);