From 881a6f35bed16e30519ad07f99bec4e76cab5b13 Mon Sep 17 00:00:00 2001 From: Tejal Kudav Date: Mon, 13 Jul 2020 10:55:47 +0000 Subject: [PATCH] gpu: nvgpu: Trigger quiesce on PBDMA preempt fail During recovery, we preempt the faulty TSG from PBDMA and engines. If the TSG preempt on PBDMA times out (timeout = 100ms), the PBDMA might be in a hung state. We do not reset the HOST during recovery, so stuck PBDMAs are unrecoverable. Abort the recovery and trigger the GPU to quiesce, as there is no way back. Triggering Quiesce from the recovery sequence should be fine, as the only redundant operation will be the write to the FIFO_RUNLIST_PREEMPT register. The error notifiers will eventually be set by the Quiesce thread. Bug 2768005 JIRA NVGPU-4631 Change-Id: I914b9379aa8e48014e6ddace9abe47180a072863 Signed-off-by: Tejal Kudav Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2368187 Reviewed-by: Automatic_Commit_Validation_User Reviewed-by: automaticguardword Reviewed-by: svc-mobile-coverity Reviewed-by: Deepak Nibade Reviewed-by: mobile promotions GVS: Gerrit_Virtual_Submit Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/fifo/preempt.c | 13 ++++--------- drivers/gpu/nvgpu/hal/rc/rc_gv11b.c | 9 +++++++-- drivers/gpu/nvgpu/include/nvgpu/preempt.h | 2 +- userspace/units/fifo/preempt/nvgpu-preempt.c | 19 +++++++------------ 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/nvgpu/common/fifo/preempt.c b/drivers/gpu/nvgpu/common/fifo/preempt.c index 6f3433b42..c6f8a0e8c 100644 --- a/drivers/gpu/nvgpu/common/fifo/preempt.c +++ b/drivers/gpu/nvgpu/common/fifo/preempt.c @@ -112,7 +112,7 @@ int nvgpu_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch) } /* called from rc */ -void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, +int nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, struct nvgpu_tsg *tsg) { struct nvgpu_fifo *f = &g->fifo; @@ -122,11 +122,7 @@ void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, u32 tsgid, pbdma_id; if 
(g->ops.fifo.preempt_poll_pbdma == NULL) { - return; - } - - if (tsg == NULL) { - return; + return 0; } tsgid = tsg->tsgid; @@ -142,12 +138,11 @@ void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, * memory system would be blocked. */ if (g->ops.fifo.preempt_poll_pbdma(g, tsgid, pbdma_id) != 0) { - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - pbdma_id, - GPU_HOST_PBDMA_PREEMPT_ERROR, 0); nvgpu_err(g, "PBDMA preempt failed"); + return -EBUSY; } } + return 0; } /* diff --git a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c index 2800c54f5..e19d94ede 100644 --- a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c @@ -41,6 +41,7 @@ #ifdef CONFIG_NVGPU_LS_PMU #include #endif +#include #include "rc_gv11b.h" @@ -223,8 +224,12 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask, * For each PBDMA which serves the runlist, poll to verify the TSG is no * longer on the PBDMA and the engine phase of the preempt has started. */ - if (tsg != NULL) { - nvgpu_preempt_poll_tsg_on_pbdma(g, tsg); + if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) { + nvgpu_err(g, "TSG preemption on PBDMA failed; " + "PBDMA seems stuck; cannot recover stuck PBDMA."); + /* Trigger Quiesce as recovery failed on hung PBDMA. */ + nvgpu_sw_quiesce(g); + return; } #ifdef CONFIG_NVGPU_DEBUGGER diff --git a/drivers/gpu/nvgpu/include/nvgpu/preempt.h b/drivers/gpu/nvgpu/include/nvgpu/preempt.h index f848a6f4f..b1511a827 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/preempt.h +++ b/drivers/gpu/nvgpu/include/nvgpu/preempt.h @@ -70,7 +70,7 @@ int nvgpu_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch); * Called from recovery handling for volta onwards. This will * not be part of safety build after recovery is not supported in safety build. */ -void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, +int nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, struct nvgpu_tsg *tsg); /** * @brief Preempt a set of runlists. 
diff --git a/userspace/units/fifo/preempt/nvgpu-preempt.c b/userspace/units/fifo/preempt/nvgpu-preempt.c index d2662b98a..1fae89ad7 100644 --- a/userspace/units/fifo/preempt/nvgpu-preempt.c +++ b/userspace/units/fifo/preempt/nvgpu-preempt.c @@ -162,13 +162,11 @@ done: } #define F_PREEMPT_POLL_PBDMA_NULL BIT(0) -#define F_PREEMPT_POLL_TSG_NULL BIT(1) -#define F_PREEMPT_POLL_PBDMA_BUSY BIT(2) -#define F_PREEMPT_POLL_LAST BIT(3) +#define F_PREEMPT_POLL_PBDMA_BUSY BIT(1) +#define F_PREEMPT_POLL_LAST BIT(2) static const char *f_preempt_poll[] = { "preempt_poll_pbdma_null", - "tsg_null", "preempt_poll_pbdma_busy", }; @@ -197,7 +195,7 @@ int test_preempt_poll_tsg_on_pbdma(struct unit_module *m, struct gk20a *g, u32 branches = 0U; int ret = UNIT_FAIL; - u32 prune = F_PREEMPT_POLL_PBDMA_NULL | F_PREEMPT_POLL_TSG_NULL; + u32 prune = F_PREEMPT_POLL_PBDMA_NULL; tsg = nvgpu_tsg_open(g, getpid()); unit_assert(tsg != NULL, goto done); @@ -221,14 +219,11 @@ int test_preempt_poll_tsg_on_pbdma(struct unit_module *m, struct gk20a *g, stub_fifo_preempt_poll_pbdma_busy : stub_fifo_preempt_poll_pbdma); - if (branches & F_PREEMPT_POLL_TSG_NULL) { - nvgpu_preempt_poll_tsg_on_pbdma(g, NULL); - } else { - nvgpu_preempt_poll_tsg_on_pbdma(g, tsg); - } + nvgpu_preempt_poll_tsg_on_pbdma(g, tsg); - if (branches & F_PREEMPT_POLL_TSG_NULL) { - unit_assert(stub[0].tsgid == NVGPU_INVALID_TSG_ID, + if (branches & F_PREEMPT_POLL_PBDMA_BUSY) { + unit_assert(stub[0].pbdma_id != + nvgpu_ffs(f->runlist_info[0]->pbdma_bitmask), goto done); } else if (!(branches & F_PREEMPT_POLL_PBDMA_NULL)) { unit_assert(stub[0].tsgid == 0, goto done);