gpu: nvgpu: handle MMU fault for TSG

- add support for handling MMU faults on a channel that is part of a TSG
- first, read the ID and type of the context that the engine is running
- if it is a TSG, abort every channel bound to it
- if it is a regular channel, abort only that channel

- also, add two versions of the set_ctx_mmu_error() API: one for a
  regular channel and another for a TSG
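
The new control flow is easiest to see in isolation. Below is a minimal user-space C sketch of the dispatch this patch introduces: the faulting context is classified as either a bare channel or a TSG, and a TSG fault fans out to every channel on the TSG's list before each channel is aborted. All types and helpers in the sketch (struct chan, struct tsg, set_mmu_error_ch(), handle_mmu_fault(), etc.) are simplified stand-ins for illustration only, not the actual nvgpu structures, error notifiers, or locking; the real gk20a_fifo_set_ctx_mmu_error_ch()/_tsg() implementations are in the diff below.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the driver objects (hypothetical, not nvgpu types). */
struct chan {
	int chid;
	struct chan *next;	/* next channel in the same TSG, if any */
};

struct tsg {
	int tsgid;
	struct chan *ch_list;	/* list of channels bound to this TSG */
};

/* Per-channel path: report the fault and (in the driver) abort the channel. */
static bool set_mmu_error_ch(struct chan *ch)
{
	printf("channel %d generated a mmu fault\n", ch->chid);
	/* real driver: write the error notifier, then gk20a_channel_abort(ch) */
	return true;	/* "verbose": caller may dump debug state */
}

/* TSG path: apply the per-channel handling to every channel in the group. */
static bool set_mmu_error_tsg(struct tsg *tsg)
{
	bool verbose = true;
	struct chan *ch;

	printf("TSG %d generated a mmu fault\n", tsg->tsgid);
	for (ch = tsg->ch_list; ch; ch = ch->next)
		verbose = set_mmu_error_ch(ch);
	return verbose;
}

/* Fault dispatch: a TSG fault fans out to all of its channels. */
static void handle_mmu_fault(struct chan *ch, struct tsg *tsg)
{
	if (tsg)
		set_mmu_error_tsg(tsg);
	else if (ch)
		set_mmu_error_ch(ch);
}

int main(void)
{
	struct chan c1 = { .chid = 1, .next = NULL };
	struct chan c0 = { .chid = 0, .next = &c1 };
	struct tsg t = { .tsgid = 7, .ch_list = &c0 };

	handle_mmu_fault(NULL, &t);	/* reports and aborts channels 0 and 1 */
	return 0;
}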

Bug 1470692

Change-Id: Ia7b01b81739598459702ed172180adb00e345eba
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/497874
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Author: Deepak Nibade <dnibade@nvidia.com>
Date: 2014-09-17 12:38:34 +05:30
Committed by: Dan Willemsen
Commit: 2f232348e6 (parent e4a7bc1602)
2 changed files with 68 additions and 33 deletions


@@ -585,7 +585,7 @@ void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error)
 		ch->error_notifier->info32 = error;
 		ch->error_notifier->status = 0xffff;
 		gk20a_err(dev_from_gk20a(ch->g),
-			"error notifier set to %d\n", error);
+			"error notifier set to %d for ch %d\n", error, ch->hw_chid);
 	}
 }


@@ -911,14 +911,12 @@ void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
 }
 
 static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
-		struct channel_gk20a *ch) {
+		struct channel_gk20a *ch)
+{
 	bool verbose = true;
 	if (!ch)
 		return verbose;
 
-	gk20a_err(dev_from_gk20a(g),
-		"channel %d generated a mmu fault",
-		ch->hw_chid);
 	if (ch->error_notifier) {
 		u32 err = ch->error_notifier->info32;
 		if (ch->error_notifier->status == 0xffff) {
@@ -944,6 +942,31 @@ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
 	return verbose;
 }
 
+static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+		struct channel_gk20a *ch)
+{
+	gk20a_err(dev_from_gk20a(g),
+		"channel %d generated a mmu fault", ch->hw_chid);
+
+	return gk20a_fifo_set_ctx_mmu_error(g, ch);
+}
+
+static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+		struct tsg_gk20a *tsg)
+{
+	bool ret = true;
+	struct channel_gk20a *ch = NULL;
+
+	gk20a_err(dev_from_gk20a(g),
+		"TSG %d generated a mmu fault", tsg->tsgid);
+
+	mutex_lock(&tsg->ch_list_lock);
+	list_for_each_entry(ch, &tsg->ch_list, ch_entry)
+		ret = gk20a_fifo_set_ctx_mmu_error(g, ch);
+	mutex_unlock(&tsg->ch_list_lock);
+
+	return ret;
+}
+
 static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 {
@@ -987,6 +1010,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id);
 		struct fifo_mmu_fault_info_gk20a f;
 		struct channel_gk20a *ch = NULL;
+		struct tsg_gk20a *tsg = NULL;
 
 		get_exception_mmu_fault_info(g, engine_mmu_id, &f);
 		trace_gk20a_mmu_fault(f.fault_hi_v,
@@ -1008,49 +1032,61 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 			f.fault_type_v, f.fault_type_desc,
 			f.fault_info_v, f.inst_ptr);
 
-		/* get the channel */
+		/* get the channel/TSG */
 		if (fake_fault) {
 			/* read and parse engine status */
 			u32 status = gk20a_readl(g,
 				fifo_engine_status_r(engine_id));
 			u32 ctx_status =
 				fifo_engine_status_ctx_status_v(status);
-			bool type_ch = fifo_pbdma_status_id_type_v(status) ==
-				fifo_pbdma_status_id_type_chid_v();
 
 			/* use next_id if context load is failing */
 			u32 id = (ctx_status ==
 				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
 				fifo_engine_status_next_id_v(status) :
 				fifo_engine_status_id_v(status);
+			u32 type = (ctx_status ==
+				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+				fifo_engine_status_next_id_type_v(status) :
+				fifo_engine_status_id_type_v(status);
 
-			if (type_ch) {
-				ch = g->fifo.channel + id;
-			} else {
-				gk20a_err(dev_from_gk20a(g), "non-chid type not supported");
-				WARN_ON(1);
-			}
+			if (type == fifo_engine_status_id_type_tsgid_v())
+				tsg = &g->fifo.tsg[id];
+			else if (type == fifo_engine_status_id_type_chid_v())
+				ch = &g->fifo.channel[id];
 		} else {
 			/* read channel based on instruction pointer */
 			ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr);
 		}
 
-		if (ch) {
+		if (ch && gk20a_is_channel_marked_as_tsg(ch))
+			tsg = &g->fifo.tsg[ch->tsgid];
+
 		/* check if engine reset should be deferred */
-		if (gk20a_fifo_should_defer_engine_reset(g, engine_id, &f, fake_fault)) {
+		if ((ch || tsg) && gk20a_fifo_should_defer_engine_reset(g,
+				engine_id, &f, fake_fault)) {
 			g->fifo.mmu_fault_engines = fault_id;
 
 			/* handled during channel free */
 			g->fifo.deferred_reset_pending = true;
-		} else
-			verbose = gk20a_fifo_set_ctx_mmu_error(g, ch);
-
-		if (ch->in_use) {
-			/* disable the channel from hw and increment
-			 * syncpoints */
-			gk20a_channel_abort(ch);
 		}
 
+		/* disable the channel/TSG from hw and increment
+		 * syncpoints */
+		if (tsg) {
+			struct channel_gk20a *ch = NULL;
+
+			if (!g->fifo.deferred_reset_pending)
+				verbose =
+					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+
+			mutex_lock(&tsg->ch_list_lock);
+			list_for_each_entry(ch, &tsg->ch_list, ch_entry)
+				gk20a_channel_abort(ch);
+			mutex_unlock(&tsg->ch_list_lock);
+		} else if (ch) {
+			if (!g->fifo.deferred_reset_pending)
+				verbose =
+					gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+
+			gk20a_channel_abort(ch);
 		} else if (f.inst_ptr ==
 				g->mm.bar1.inst_block.cpu_pa) {
 			gk20a_err(dev_from_gk20a(g), "mmu fault from bar1");
@@ -1192,7 +1228,7 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose)
 		gk20a_channel_abort(ch);
 
-		if (gk20a_fifo_set_ctx_mmu_error(g, ch))
+		if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch))
 			gk20a_debug_dump(g->dev);
 	}
 }
@@ -1206,13 +1242,12 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 		struct channel_gk20a *ch;
 
-		mutex_lock(&tsg->ch_list_lock);
-		list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
-			if (gk20a_fifo_set_ctx_mmu_error(g, ch))
+		if (gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg))
 			gk20a_debug_dump(g->dev);
+
+		mutex_lock(&tsg->ch_list_lock);
+		list_for_each_entry(ch, &tsg->ch_list, ch_entry)
 			gk20a_channel_abort(ch);
-		}
 		mutex_unlock(&tsg->ch_list_lock);
 	}
 }