From 0ef96e4b1a7979d2bae0e52924e976515cb87400 Mon Sep 17 00:00:00 2001 From: Debarshi Dutta Date: Wed, 31 Jul 2019 16:53:40 +0530 Subject: [PATCH] gpu: nvgpu: correct handling of pbdma rc nvgpu_rc_pbdma_fault only reads id and id_type from struct nvgpu_pbdma_status_info. These fields contain invalid values while the chsw status is chsw_load or chsw_switch. This patch fixes that bug by checking the chsw status first and then loading id and id_type (chsw_valid/chsw_save) or next_id and next_id_type (chsw_load/chsw_switch) accordingly. The current code also reads the pbdma_status info only after clearing the interrupt. Other interrupts can delay that read long enough for the pbdma to switch the channel after the interrupt is cleared, leading to an invalid channel/TSG ID being read. Correct that by reading the pbdma_status info register before clearing the pbdma interrupt, so that the context information is captured before the pbdma can switch out the context. Bug 2648298 Change-Id: Ic2f0682526e00d14ad58f0411472f34388183f2b Signed-off-by: Debarshi Dutta Reviewed-on: https://git-master.nvidia.com/r/2165047 Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-misra Reviewed-by: Deepak Nibade GVS: Gerrit_Virtual_Submit Reviewed-by: Vijayakumar Subbu Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/rc/rc.c | 27 +++++++++++++------ .../gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c | 5 ++-- drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h | 4 ++- drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c | 7 ++++- .../gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c | 5 +--- drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 4 ++- drivers/gpu/nvgpu/include/nvgpu/rc.h | 4 ++- 7 files changed, 38 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c index 8c53b4e33..3be41a507 100644 --- a/drivers/gpu/nvgpu/common/rc/rc.c +++ b/drivers/gpu/nvgpu/common/rc/rc.c @@ -77,25 +77,36 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, } void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f, - u32 pbdma_id, u32 
error_notifier) + u32 pbdma_id, u32 error_notifier, + struct nvgpu_pbdma_status_info *pbdma_status) { u32 id; - struct nvgpu_pbdma_status_info pbdma_status; + u32 id_type = PBDMA_STATUS_ID_TYPE_INVALID; nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d", pbdma_id, error_notifier); - g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id, - &pbdma_status); - /* Remove channel from runlist */ - id = pbdma_status.id; - if (pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_TSGID) { + if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) || + nvgpu_pbdma_status_is_chsw_save(pbdma_status)) { + id = pbdma_status->id; + id_type = pbdma_status->id_type; + } else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) || + nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) { + id = pbdma_status->next_id; + id_type = pbdma_status->next_id_type; + } else { + /* Nothing to do here */ + nvgpu_err(g, "Invalid pbdma_status.id"); + return; + } + + if (id_type == PBDMA_STATUS_ID_TYPE_TSGID) { struct nvgpu_tsg *tsg = nvgpu_tsg_get_from_id(g, id); nvgpu_tsg_set_error_notifier(g, tsg, error_notifier); nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PBDMA_FAULT); - } else if(pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) { + } else if(id_type == PBDMA_STATUS_ID_TYPE_CHID) { struct nvgpu_channel *ch = nvgpu_channel_from_id(g, id); struct nvgpu_tsg *tsg; if (ch == NULL) { diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c index 7805beb85..c01a47932 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c @@ -96,16 +96,17 @@ u32 gk20a_fifo_pbdma_isr(struct gk20a *g) u32 pbdma_pending_bitmask = nvgpu_readl(g, fifo_intr_pbdma_id_r()); u32 error_notifier; bool recover; + struct nvgpu_pbdma_status_info pbdma_status; for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) { if (fifo_intr_pbdma_id_status_v(pbdma_pending_bitmask, pbdma_id) != 0U) { nvgpu_log(g, 
gpu_dbg_intr, "pbdma id %d intr pending", pbdma_id); recover = g->ops.pbdma.handle_intr(g, pbdma_id, - &error_notifier); + &error_notifier, &pbdma_status); if (recover) { nvgpu_rc_pbdma_fault(g, f, pbdma_id, - error_notifier); + error_notifier, &pbdma_status); } } } diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h index d4cab047f..b9f6f8190 100644 --- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h +++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h @@ -29,11 +29,13 @@ struct gk20a; struct nvgpu_debug_context; struct nvgpu_channel_dump_info; struct nvgpu_gpfifo_entry; +struct nvgpu_pbdma_status_info; bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_0, u32 *error_notifier); bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, - u32 *error_notifier); + u32 *error_notifier, + struct nvgpu_pbdma_status_info *pbdma_status); u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id); void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id); diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c index 0fbbcf294..e5fb1ce86 100644 --- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c @@ -356,7 +356,8 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void) } bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, - u32 *error_notifier) + u32 *error_notifier, + struct nvgpu_pbdma_status_info *pbdma_status) { u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR; @@ -372,6 +373,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0, &intr_error_notifier)) { + g->ops.pbdma_status.read_pbdma_status_info(g, + pbdma_id, pbdma_status); recover = true; } nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0); @@ -384,6 +387,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, if (g->ops.pbdma.handle_intr_1(g, pbdma_id, 
pbdma_intr_1, &intr_error_notifier)) { + g->ops.pbdma_status.read_pbdma_status_info(g, + pbdma_id, pbdma_status); recover = true; } nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1); diff --git a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c index 5167a3bdf..2fec5704d 100644 --- a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c @@ -173,10 +173,7 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid, */ /* Ignore un-needed return value "recover" */ - (void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL); - - g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id, - &pbdma_status); + (void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL, &pbdma_status); if (nvgpu_pbdma_status_is_chsw_valid(&pbdma_status) || nvgpu_pbdma_status_is_chsw_save(&pbdma_status)) { diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index f821cf4da..64a8e9bf8 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -116,6 +116,7 @@ struct _resmgr_context; struct nvgpu_gpfifo_entry; struct vm_gk20a_mapping_batch; struct pmu_pg_stats_data; +struct nvgpu_pbdma_status_info; enum nvgpu_flush_op; enum gk20a_mem_rw_flag; @@ -1172,7 +1173,8 @@ struct gpu_ops { u32 *error_notifier); /* error_notifier can be NULL */ bool (*handle_intr)(struct gk20a *g, u32 pbdma_id, - u32 *error_notifier); + u32 *error_notifier, + struct nvgpu_pbdma_status_info *pbdma_status); u32 (*get_signature)(struct gk20a *g); void (*dump_status)(struct gk20a *g, struct nvgpu_debug_context *o); diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h index 51c176d80..df7236444 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/rc.h +++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h @@ -41,12 +41,14 @@ struct gk20a; struct nvgpu_fifo; struct nvgpu_tsg; struct nvgpu_channel; +struct nvgpu_pbdma_status_info; void 
nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, struct nvgpu_tsg *tsg, bool debug_dump); void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f, - u32 pbdma_id, u32 error_notifier); + u32 pbdma_id, u32 error_notifier, + struct nvgpu_pbdma_status_info *pbdma_status); void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id);