Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: add new tsg functions for ctxsw timeout re-org
Add nvgpu_tsg_set_error_notifier function for setting error_notifier
for all channels of a tsg.

Add nvgpu_tsg_timeout_debug_dump_state function for finding if
timeout_debug_dump is set for any of the channels of a tsg.

Add nvgpu_tsg_set_timeout_accumulated_ms to set timeout_accumulated_ms
for all the channels of a tsg.

JIRA NVGPU-1312

Change-Id: Ib2daf2d462c2cf767f5a6e6fd3436abf6860091d
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2077626
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
commit 27e3546175 (parent 540241e47c), committed by mobile promotions
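For context, the effect of this re-org is that callers stop open-coding the per-channel walk under tsg->ch_list_lock and instead use the TSG-level helpers added below. The following is a minimal illustrative sketch, not part of the patch: the wrapper function name and its parameters are hypothetical, and only the three nvgpu_tsg_* helpers (plus g->fifo_eng_timeout_us, which appears in the diff) are taken from this change.

/* Illustrative sketch only: condenses how the nvgpu_tsg_check_ctxsw_timeout()
 * path uses the new TSG-level helpers after this change. The function name
 * and parameters here are hypothetical.
 */
static bool handle_tsg_ctxsw_timeout(struct gk20a *g, struct tsg_gk20a *tsg,
                bool recover, bool progress, u32 *ms)
{
        bool verbose = false;

        if (recover) {
                /* Set the idle-timeout notifier on every channel of the TSG. */
                nvgpu_tsg_set_error_notifier(g, tsg,
                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
                /* Dump debug state if any channel in the TSG requested it. */
                verbose = nvgpu_tsg_timeout_debug_dump_state(tsg);
        } else if (progress) {
                /* Some channel made progress: restart the accumulated
                 * timeout on all channels of the TSG.
                 */
                *ms = g->fifo_eng_timeout_us / 1000U;
                nvgpu_tsg_set_timeout_accumulated_ms(tsg, *ms);
        }

        return verbose;
}

The same helpers replace the hand-rolled channel-list loops in gk20a_gr_set_error_notifier() and nvgpu_tsg_set_ctx_mmu_error(), so the ch_list_lock handling for each operation lives in exactly one place.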
@@ -1397,12 +1397,18 @@ bool nvgpu_channel_mark_error(struct gk20a *g, struct channel_gk20a *ch)
         return verbose;
 }
 
+void nvgpu_channel_set_error_notifier(struct gk20a *g, struct channel_gk20a *ch,
+                u32 error_notifier)
+{
+        g->ops.fifo.set_error_notifier(ch, error_notifier);
+}
+
 void nvgpu_channel_set_ctx_mmu_error(struct gk20a *g,
                 struct channel_gk20a *ch)
 {
         nvgpu_err(g,
                 "channel %d generated a mmu fault", ch->chid);
-        g->ops.fifo.set_error_notifier(ch,
+        nvgpu_channel_set_error_notifier(g, ch,
                 NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
 }
 
@@ -1441,7 +1447,7 @@ bool nvgpu_channel_check_ctxsw_timeout(struct channel_gk20a *ch,
         *verbose = ch->timeout_debug_dump;
         *ms = ch->timeout_accumulated_ms;
         if (recover) {
-                g->ops.fifo.set_error_notifier(ch,
+                nvgpu_channel_set_error_notifier(g, ch,
                         NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
         }
 
@@ -354,22 +354,60 @@ bool nvgpu_tsg_mark_error(struct gk20a *g,
 
 }
 
-void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g,
-                struct tsg_gk20a *tsg)
+void nvgpu_tsg_set_timeout_accumulated_ms(struct tsg_gk20a *tsg, u32 ms)
 {
         struct channel_gk20a *ch = NULL;
 
-        nvgpu_err(g, "TSG %d generated a mmu fault", tsg->tsgid);
-
         nvgpu_rwsem_down_read(&tsg->ch_list_lock);
         nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
                 if (gk20a_channel_get(ch) != NULL) {
-                        nvgpu_channel_set_ctx_mmu_error(g, ch);
+                        ch->timeout_accumulated_ms = ms;
                         gk20a_channel_put(ch);
                 }
         }
         nvgpu_rwsem_up_read(&tsg->ch_list_lock);
 }
 
+bool nvgpu_tsg_timeout_debug_dump_state(struct tsg_gk20a *tsg)
+{
+        struct channel_gk20a *ch = NULL;
+        bool verbose = false;
+
+        nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+        nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
+                if (gk20a_channel_get(ch) != NULL) {
+                        if (ch->timeout_debug_dump) {
+                                verbose = true;
+                        }
+                        gk20a_channel_put(ch);
+                }
+        }
+        nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+
+        return verbose;
+}
+
+void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct tsg_gk20a *tsg,
+                u32 error_notifier)
+{
+        struct channel_gk20a *ch = NULL;
+
+        nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+        nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
+                if (gk20a_channel_get(ch) != NULL) {
+                        nvgpu_channel_set_error_notifier(g, ch, error_notifier);
+                        gk20a_channel_put(ch);
+                }
+        }
+        nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+}
+
+void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g, struct tsg_gk20a *tsg)
+{
+        nvgpu_err(g, "TSG %d generated a mmu fault", tsg->tsgid);
+
+        nvgpu_tsg_set_error_notifier(g, tsg,
+                NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
+}
+
 bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg,
@@ -411,17 +449,9 @@ bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg,
                         tsg->tsgid, ch->chid);
                 *ms = ch->timeout_accumulated_ms;
                 gk20a_channel_put(ch);
-                nvgpu_list_for_each_entry(ch, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch) != NULL) {
-                                ch->g->ops.fifo.set_error_notifier(ch,
-                                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
-                                if (ch->timeout_debug_dump) {
-                                        *verbose = true;
-                                }
-                                gk20a_channel_put(ch);
-                        }
-                }
+                nvgpu_tsg_set_error_notifier(g, tsg,
+                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
+                *verbose = nvgpu_tsg_timeout_debug_dump_state(tsg);
         } else if (progress) {
                 /*
                  * if at least one channel in the TSG made some progress, reset
@@ -433,13 +463,7 @@ bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg,
                         tsg->tsgid, ch->chid);
                 gk20a_channel_put(ch);
                 *ms = g->fifo_eng_timeout_us / 1000U;
-                nvgpu_list_for_each_entry(ch, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch) != NULL) {
-                                ch->timeout_accumulated_ms = *ms;
-                                gk20a_channel_put(ch);
-                        }
-                }
+                nvgpu_tsg_set_timeout_accumulated_ms(tsg, *ms);
         }
 
         /* if we could not detect progress on any of the channel, but none
@@ -2938,7 +2938,6 @@ static void gk20a_gr_set_error_notifier(struct gk20a *g,
 {
         struct channel_gk20a *ch;
         struct tsg_gk20a *tsg;
-        struct channel_gk20a *ch_tsg;
 
         ch = isr_data->ch;
 
@@ -2948,16 +2947,7 @@ static void gk20a_gr_set_error_notifier(struct gk20a *g,
 
         tsg = tsg_gk20a_from_ch(ch);
         if (tsg != NULL) {
-                nvgpu_rwsem_down_read(&tsg->ch_list_lock);
-                nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch_tsg) != NULL) {
-                                g->ops.fifo.set_error_notifier(ch_tsg,
-                                                error_notifier);
-                                gk20a_channel_put(ch_tsg);
-                        }
-                }
-                nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+                nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
         } else {
                 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
         }
 
@@ -496,4 +496,7 @@ static inline u64 gk20a_channel_userd_gpu_va(struct channel_gk20a *c)
         struct nvgpu_mem *mem = c->userd_mem;
         return (mem->gpu_va != 0ULL) ? mem->gpu_va + c->userd_offset : 0ULL;
 }
+
+void nvgpu_channel_set_error_notifier(struct gk20a *g, struct channel_gk20a *ch,
+                u32 error_notifier);
 #endif
@@ -145,4 +145,8 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
                 ((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
 };
 
+void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct tsg_gk20a *tsg,
+                u32 error_notifier);
+bool nvgpu_tsg_timeout_debug_dump_state(struct tsg_gk20a *tsg);
+void nvgpu_tsg_set_timeout_accumulated_ms(struct tsg_gk20a *tsg, u32 ms);
 #endif /* TSG_GK20A_H */