gpu: nvgpu: recover pbdma errors before ack

When a pbdma fault needs a channel teardown, do the recovery/teardown process before acking the pbdma interrupt status back. Acking it causes the hardware to proceed, which could release fences too early, before the involved channel(s) have been found to be broken.

With these host copyengine interrupts, the teardown sequence is light and proceeds even with the pbdma intr flag still set; there are no engines to reset when these pbdma launch check interrupts happen. The bad tsg is just disabled and the channels in it aborted.

A few unit tests are so heavily affected by this refactor that they would need to be rewritten. They're not strictly needed at the moment, so do only half of the rewrite: just delete them.

Bug 200611198

Change-Id: Id126fb158b6d05e46ba124cd426389046eedc053
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2392669
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 02:22:34 +03:00 · 2020-08-14 15:20:31 +03:00
parent 370ac6cc98
commit dfd9feace6
9 changed files with 34 additions and 313 deletions
--- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c
+++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c
@@ -30,8 +30,6 @@
 #include <nvgpu/tsg.h>
 #include <nvgpu/rc.h>
 #include <nvgpu/nvgpu_err.h>
-#include <nvgpu/error_notifier.h>
-#include <nvgpu/pbdma_status.h>
 #include <nvgpu/engines.h>

 #include <hal/fifo/fifo_intr_gk20a.h>
@@ -94,20 +92,12 @@ u32 gk20a_fifo_pbdma_isr(struct gk20a *g)
 	u32 pbdma_id;
 	u32 num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
 	u32 pbdma_pending_bitmask = nvgpu_readl(g, fifo_intr_pbdma_id_r());
-	u32 error_notifier;
-	bool recover;
-	struct nvgpu_pbdma_status_info pbdma_status;

 	for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) {
 		if (fifo_intr_pbdma_id_status_v(pbdma_pending_bitmask, pbdma_id) != 0U) {
 			nvgpu_log(g, gpu_dbg_intr, "pbdma id %d intr pending",
 				pbdma_id);
-			recover = g->ops.pbdma.handle_intr(g, pbdma_id,
-				&error_notifier, &pbdma_status);
-			if (recover) {
-				nvgpu_rc_pbdma_fault(g, pbdma_id,
-					error_notifier, &pbdma_status);
-			}
+			g->ops.pbdma.handle_intr(g, pbdma_id, true);
 		}
 	}
 	return fifo_intr_0_pbdma_intr_pending_f();
--- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h
+++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h
@@ -33,9 +33,7 @@ struct nvgpu_pbdma_status_info;

 bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id,
 			u32 pbdma_intr_0, u32 *error_notifier);
-bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-			u32 *error_notifier,
-			struct nvgpu_pbdma_status_info *pbdma_status);
+void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover);

 u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id);
 void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id);
--- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c
@@ -34,6 +34,7 @@
 #include <nvgpu/gk20a.h>
 #include <nvgpu/pbdma_status.h>
 #include <nvgpu/static_analysis.h>
+#include <nvgpu/rc.h>

 #include <nvgpu/hw/gm20b/hw_pbdma_gm20b.h>

@@ -318,17 +319,14 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void)
 	return restartable_0_intr_descs;
 }

-bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-			u32 *error_notifier,
-			struct nvgpu_pbdma_status_info *pbdma_status)
+void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
 {
+	struct nvgpu_pbdma_status_info pbdma_status;
 	u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;

 	u32 pbdma_intr_0 = nvgpu_readl(g, pbdma_intr_0_r(pbdma_id));
 	u32 pbdma_intr_1 = nvgpu_readl(g, pbdma_intr_1_r(pbdma_id));

-	bool recover = false;
-
 	if (pbdma_intr_0 != 0U) {
 		nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
 			"pbdma id %d intr_0 0x%08x pending",
@@ -337,8 +335,12 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 		if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
 			&intr_error_notifier)) {
 			g->ops.pbdma_status.read_pbdma_status_info(g,
-				pbdma_id, pbdma_status);
-			recover = true;
+				pbdma_id, &pbdma_status);
+			if (recover) {
+				nvgpu_rc_pbdma_fault(g, pbdma_id,
+						intr_error_notifier,
+						&pbdma_status);
+			}
 		}
 		nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
 	}
@@ -351,17 +353,16 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 		if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
 			&intr_error_notifier)) {
 			g->ops.pbdma_status.read_pbdma_status_info(g,
-				pbdma_id, pbdma_status);
-			recover = true;
+				pbdma_id, &pbdma_status);
+			if (recover) {
+				nvgpu_rc_pbdma_fault(g, pbdma_id,
+						intr_error_notifier,
+						&pbdma_status);
+			}
 		}
+
 		nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
 	}
-
-	if (error_notifier != NULL) {
-		*error_notifier = intr_error_notifier;
-	}
-
-	return recover;
 }

 u32 gm20b_pbdma_get_gp_base(u64 gpfifo_base)
--- a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c
@@ -127,9 +127,6 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
 			loop_count++;
 		}

-		g->ops.pbdma_status.read_pbdma_status_info(g,
-			pbdma_id, &pbdma_status);
-
 		/*
 		 * If the PBDMA has a stalling interrupt and receives a NACK,
 		 * the PBDMA won't save out until the STALLING interrupt is
@@ -142,8 +139,10 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
 		 * reported to SW.
 		 */

-		/* Ignore un-needed return value "recover" */
-		(void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL, &pbdma_status);
+		g->ops.pbdma.handle_intr(g, pbdma_id, false);
+
+		g->ops.pbdma_status.read_pbdma_status_info(g,
+			pbdma_id, &pbdma_status);

 		ret = fifo_preempt_check_tsg_on_pbdma(tsgid, &pbdma_status);
 		if (ret == 0) {
--- a/drivers/gpu/nvgpu/include/nvgpu/gops_pbdma.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops_pbdma.h
@@ -50,9 +50,7 @@ struct gops_pbdma {
 	bool (*handle_intr_1)(struct gk20a *g,
 			u32 pbdma_id, u32 pbdma_intr_1,
 			u32 *error_notifier);
-	bool (*handle_intr)(struct gk20a *g, u32 pbdma_id,
-			u32 *error_notifier,
-			struct nvgpu_pbdma_status_info *pbdma_status);
+	void (*handle_intr)(struct gk20a *g, u32 pbdma_id, bool recover);
 	u32 (*set_clear_intr_offsets) (struct gk20a *g,
 			u32 set_clear_size);
 	u32 (*get_signature)(struct gk20a *g);