gpu: nvgpu: non-abortable TSG for vidmem-clear

When an engine faults due to an unbound instance block, all active TSGs are currently aborted. This includes the TSG used by the vidmem-clear task to clear vidmem buffers. From this point on, nvgpu_vidmem_clear cannot submit jobs anymore. Define the TSG in the MM CE context as non-abortable, and skip it when aborting active TSGs. Bug 2486146 Change-Id: I221259aec468e8ee3a24e80fab8d8fb7ee8607b0 Signed-off-by: Thomas Fleury <tfleury@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2008954 (cherry picked from commit 6f2444dc5e128aa2b870796bd1e9dee7853f90af) Reviewed-on: https://git-master.nvidia.com/r/2008942 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2019-01-31 10:36:05 -08:00
parent bccb49d8fb
commit 13afcc24c3
5 changed files with 17 additions and 1 deletions
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -228,6 +228,7 @@ int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid)

 	tsg->in_use = false;
 	tsg->tsgid = tsgid;
+	tsg->abortable = true;

 	nvgpu_init_list_node(&tsg->ch_list);
 	nvgpu_rwsem_init(&tsg->ch_list_lock);
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -128,6 +128,7 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
 	struct nvgpu_list_node *list = &ce_ctx->list;

 	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
+	ce_ctx->tsg->abortable = true;

 	nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);

@@ -478,6 +479,9 @@ u32 gk20a_ce_create_context(struct gk20a *g,
 		goto end;
 	}

+	/* this TSG should never be aborted */
+	ce_ctx->tsg->abortable = false;
+
 	/* always kernel client needs privileged channel */
 	ce_ctx->ch = gk20a_open_new_channel(g, runlist_id, true,
 				nvgpu_current_pid(g), nvgpu_current_tid(g));
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1232,6 +1232,8 @@ void gk20a_fifo_abort_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool preempt)

 	nvgpu_log_fn(g, " ");

+	WARN_ON(tsg->abortable == false);
+
 	g->ops.fifo.disable_tsg(tsg);

 	if (preempt) {
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -944,8 +944,16 @@ static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,

 		for_each_set_bit(tsgid, runlist->active_tsgs,
 			g->fifo.num_channels) {
-			nvgpu_log(g, gpu_dbg_info, "abort tsg id %lu", tsgid);
 			tsg = &g->fifo.tsg[tsgid];
+
+			if (!tsg->abortable) {
+				nvgpu_log(g, gpu_dbg_info,
+					  "tsg %lu is not abortable, skipping",
+					  tsgid);
+				continue;
+			}
+			nvgpu_log(g, gpu_dbg_info, "abort tsg id %lu", tsgid);
+
 			gk20a_disable_tsg(tsg);

 			/* assume all pbdma and eng faulted are set */
--- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
@@ -71,6 +71,7 @@ struct tsg_gk20a {
 	u8   tpc_pg_enabled;
 	bool tpc_num_initialized;
 	bool in_use;
+	bool abortable;

 	struct nvgpu_tsg_sm_error_state *sm_error_states;