gpu: nvgpu: correct handling of pbdma rc

nvgpu_rc_pbdma_fault reads id and id_type directly from struct
nvgpu_pbdma_status_info. These fields hold invalid values while the PBDMA
is in the chsw_load or chsw_switch state, where the context being switched
in is described by next_id and next_id_type instead. Fix this by checking
the channel-switch (chsw) status first and loading id and id_type from the
appropriate fields.
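
As a condensed sketch, the new selection logic looks roughly as follows
(mirroring the rc.c hunk below; id/id_type start out as
PBDMA_STATUS_ID_TYPE_INVALID):

    if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) ||
            nvgpu_pbdma_status_is_chsw_save(pbdma_status)) {
        id = pbdma_status->id;              /* resident context is valid */
        id_type = pbdma_status->id_type;
    } else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) ||
            nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) {
        id = pbdma_status->next_id;         /* incoming context is valid */
        id_type = pbdma_status->next_id_type;
    } else {
        return;                             /* no valid context to recover */
    }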

The current code also reads the pbdma_status info only after clearing the
interrupt. Servicing of other interrupts can delay execution long enough
for the PBDMA to switch channels in between, leaving an invalid channel/TSG
ID. Fix this by reading the pbdma_status info register before clearing the
PBDMA interrupt, so the context information is captured before the PBDMA
can switch out the context.
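
Condensed from the gm20b hunk below, the interrupt handler now snapshots
the PBDMA status while the fault is still pending and only then clears the
interrupt:

    if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
            &intr_error_notifier)) {
        /* capture channel/TSG info before the context can switch out */
        g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id, pbdma_status);
        recover = true;
    }
    /* clear the pending PBDMA interrupt only after the status is captured */
    nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);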

Bug 2648298

Change-Id: Ic2f0682526e00d14ad58f0411472f34388183f2b
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2165047
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Debarshi Dutta authored 2019-07-31 16:53:40 +05:30; committed by mobile promotions
commit 0ef96e4b1a (parent 8a691fcf6c)
7 changed files with 38 additions and 18 deletions

@@ -77,25 +77,36 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 }
 
 void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f,
-                u32 pbdma_id, u32 error_notifier)
+                u32 pbdma_id, u32 error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status)
 {
         u32 id;
-        struct nvgpu_pbdma_status_info pbdma_status;
+        u32 id_type = PBDMA_STATUS_ID_TYPE_INVALID;
 
         nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d",
                         pbdma_id, error_notifier);
 
-        g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
-                &pbdma_status);
-        /* Remove channel from runlist */
-        id = pbdma_status.id;
-        if (pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
+        if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) ||
+                        nvgpu_pbdma_status_is_chsw_save(pbdma_status)) {
+                id = pbdma_status->id;
+                id_type = pbdma_status->id_type;
+        } else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) ||
+                        nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) {
+                id = pbdma_status->next_id;
+                id_type = pbdma_status->next_id_type;
+        } else {
+                /* Nothing to do here */
+                nvgpu_err(g, "Invalid pbdma_status.id");
+                return;
+        }
+
+        if (id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
                 struct nvgpu_tsg *tsg = nvgpu_tsg_get_from_id(g, id);
 
                 nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
                 nvgpu_rc_tsg_and_related_engines(g, tsg, true,
                                 RC_TYPE_PBDMA_FAULT);
-        } else if(pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) {
+        } else if(id_type == PBDMA_STATUS_ID_TYPE_CHID) {
                 struct nvgpu_channel *ch = nvgpu_channel_from_id(g, id);
                 struct nvgpu_tsg *tsg;
 
                 if (ch == NULL) {

@@ -96,16 +96,17 @@ u32 gk20a_fifo_pbdma_isr(struct gk20a *g)
         u32 pbdma_pending_bitmask = nvgpu_readl(g, fifo_intr_pbdma_id_r());
         u32 error_notifier;
         bool recover;
+        struct nvgpu_pbdma_status_info pbdma_status;
 
         for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) {
                 if (fifo_intr_pbdma_id_status_v(pbdma_pending_bitmask, pbdma_id) != 0U) {
                         nvgpu_log(g, gpu_dbg_intr, "pbdma id %d intr pending",
                                 pbdma_id);
                         recover = g->ops.pbdma.handle_intr(g, pbdma_id,
-                                        &error_notifier);
+                                        &error_notifier, &pbdma_status);
                         if (recover) {
                                 nvgpu_rc_pbdma_fault(g, f, pbdma_id,
-                                                error_notifier);
+                                                error_notifier, &pbdma_status);
                         }
                 }
         }

@@ -29,11 +29,13 @@ struct gk20a;
 struct nvgpu_debug_context;
 struct nvgpu_channel_dump_info;
 struct nvgpu_gpfifo_entry;
+struct nvgpu_pbdma_status_info;
 
 bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id,
                 u32 pbdma_intr_0, u32 *error_notifier);
 
 bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-                u32 *error_notifier);
+                u32 *error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status);
 u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id);
 void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id);

@@ -356,7 +356,8 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void)
 }
 
 bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-                u32 *error_notifier)
+                u32 *error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status)
 {
         u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
 
@@ -372,6 +373,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 
                 if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
                         &intr_error_notifier)) {
+                        g->ops.pbdma_status.read_pbdma_status_info(g,
+                                pbdma_id, pbdma_status);
                         recover = true;
                 }
                 nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
@@ -384,6 +387,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 
                 if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
                         &intr_error_notifier)) {
+                        g->ops.pbdma_status.read_pbdma_status_info(g,
+                                pbdma_id, pbdma_status);
                         recover = true;
                 }
                 nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);

@@ -173,10 +173,7 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
                  */
 
                 /* Ignore un-needed return value "recover" */
-                (void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL);
-
-                g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
-                        &pbdma_status);
+                (void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL, &pbdma_status);
 
                 if (nvgpu_pbdma_status_is_chsw_valid(&pbdma_status) ||
                                 nvgpu_pbdma_status_is_chsw_save(&pbdma_status)) {

@@ -116,6 +116,7 @@ struct _resmgr_context;
 struct nvgpu_gpfifo_entry;
 struct vm_gk20a_mapping_batch;
 struct pmu_pg_stats_data;
+struct nvgpu_pbdma_status_info;
 
 enum nvgpu_flush_op;
 enum gk20a_mem_rw_flag;
@@ -1172,7 +1173,8 @@ struct gpu_ops {
                         u32 *error_notifier);
                 /* error_notifier can be NULL */
                 bool (*handle_intr)(struct gk20a *g, u32 pbdma_id,
-                                u32 *error_notifier);
+                                u32 *error_notifier,
+                                struct nvgpu_pbdma_status_info *pbdma_status);
                 u32 (*get_signature)(struct gk20a *g);
                 void (*dump_status)(struct gk20a *g,
                                 struct nvgpu_debug_context *o);

@@ -41,12 +41,14 @@ struct gk20a;
 struct nvgpu_fifo;
 struct nvgpu_tsg;
 struct nvgpu_channel;
+struct nvgpu_pbdma_status_info;
 
 void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
                 struct nvgpu_tsg *tsg, bool debug_dump);
 
 void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f,
-                u32 pbdma_id, u32 error_notifier);
+                u32 pbdma_id, u32 error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status);
 
 void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id);