gpu: nvgpu: Trigger quiesce on PBDMA preempt fail

During recovery, we preempt the faulty TSG from PBDMA and engines.
If the TSG preempt on PBDMA times out (timeout = 100ms), the PBDMA
might be in a hung state. We do not reset the HOST during recovery, so
stuck PBDMAs are unrecoverable.
Abort the recovery and trigger GPU to quiesce as there is no way
back.

Triggering Quiesce from recovery sequence should be fine as the only
redundant operation will be write to FIFO_RUNLIST_PREEMPT register.
The error notifiers will eventually be set by Quiesce thread.

Bug 2768005
JIRA NVGPU-4631

Change-Id: I914b9379aa8e48014e6ddace9abe47180a072863
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2368187
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Tejal Kudav
2020-07-13 10:55:47 +00:00
committed by Alex Waterman
parent f7a73f4ab8
commit 881a6f35be
4 changed files with 19 additions and 24 deletions

View File

@@ -112,7 +112,7 @@ int nvgpu_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch)
} }
/* called from rc */ /* called from rc */
void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, int nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
struct nvgpu_tsg *tsg) struct nvgpu_tsg *tsg)
{ {
struct nvgpu_fifo *f = &g->fifo; struct nvgpu_fifo *f = &g->fifo;
@@ -122,11 +122,7 @@ void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
u32 tsgid, pbdma_id; u32 tsgid, pbdma_id;
if (g->ops.fifo.preempt_poll_pbdma == NULL) { if (g->ops.fifo.preempt_poll_pbdma == NULL) {
return; return 0;
}
if (tsg == NULL) {
return;
} }
tsgid = tsg->tsgid; tsgid = tsg->tsgid;
@@ -142,12 +138,11 @@ void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
* memory system would be blocked. * memory system would be blocked.
*/ */
if (g->ops.fifo.preempt_poll_pbdma(g, tsgid, pbdma_id) != 0) { if (g->ops.fifo.preempt_poll_pbdma(g, tsgid, pbdma_id) != 0) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
pbdma_id,
GPU_HOST_PBDMA_PREEMPT_ERROR, 0);
nvgpu_err(g, "PBDMA preempt failed"); nvgpu_err(g, "PBDMA preempt failed");
return -EBUSY;
} }
} }
return 0;
} }
/* /*

View File

@@ -41,6 +41,7 @@
#ifdef CONFIG_NVGPU_LS_PMU #ifdef CONFIG_NVGPU_LS_PMU
#include <nvgpu/pmu/mutex.h> #include <nvgpu/pmu/mutex.h>
#endif #endif
#include <nvgpu/nvgpu_init.h>
#include "rc_gv11b.h" #include "rc_gv11b.h"
@@ -223,8 +224,12 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
* For each PBDMA which serves the runlist, poll to verify the TSG is no * For each PBDMA which serves the runlist, poll to verify the TSG is no
* longer on the PBDMA and the engine phase of the preempt has started. * longer on the PBDMA and the engine phase of the preempt has started.
*/ */
if (tsg != NULL) { if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) {
nvgpu_preempt_poll_tsg_on_pbdma(g, tsg); nvgpu_err(g, "TSG preemption on PBDMA failed; "
"PBDMA seems stuck; cannot recover stuck PBDMA.");
/* Trigger Quiesce as recovery failed on hung PBDMA. */
nvgpu_sw_quiesce(g);
return;
} }
#ifdef CONFIG_NVGPU_DEBUGGER #ifdef CONFIG_NVGPU_DEBUGGER

View File

@@ -70,7 +70,7 @@ int nvgpu_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch);
* Called from recovery handling for volta onwards. This will * Called from recovery handling for volta onwards. This will
* not be part of safety build after recovery is not supported in safety build. * not be part of safety build after recovery is not supported in safety build.
*/ */
void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g, int nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
struct nvgpu_tsg *tsg); struct nvgpu_tsg *tsg);
/** /**
* @brief Preempt a set of runlists. * @brief Preempt a set of runlists.

View File

@@ -162,13 +162,11 @@ done:
} }
#define F_PREEMPT_POLL_PBDMA_NULL BIT(0) #define F_PREEMPT_POLL_PBDMA_NULL BIT(0)
#define F_PREEMPT_POLL_TSG_NULL BIT(1) #define F_PREEMPT_POLL_PBDMA_BUSY BIT(1)
#define F_PREEMPT_POLL_PBDMA_BUSY BIT(2) #define F_PREEMPT_POLL_LAST BIT(2)
#define F_PREEMPT_POLL_LAST BIT(3)
static const char *f_preempt_poll[] = { static const char *f_preempt_poll[] = {
"preempt_poll_pbdma_null", "preempt_poll_pbdma_null",
"tsg_null",
"preempt_poll_pbdma_busy", "preempt_poll_pbdma_busy",
}; };
@@ -197,7 +195,7 @@ int test_preempt_poll_tsg_on_pbdma(struct unit_module *m, struct gk20a *g,
u32 branches = 0U; u32 branches = 0U;
int ret = UNIT_FAIL; int ret = UNIT_FAIL;
u32 prune = F_PREEMPT_POLL_PBDMA_NULL | F_PREEMPT_POLL_TSG_NULL; u32 prune = F_PREEMPT_POLL_PBDMA_NULL;
tsg = nvgpu_tsg_open(g, getpid()); tsg = nvgpu_tsg_open(g, getpid());
unit_assert(tsg != NULL, goto done); unit_assert(tsg != NULL, goto done);
@@ -221,14 +219,11 @@ int test_preempt_poll_tsg_on_pbdma(struct unit_module *m, struct gk20a *g,
stub_fifo_preempt_poll_pbdma_busy : stub_fifo_preempt_poll_pbdma_busy :
stub_fifo_preempt_poll_pbdma); stub_fifo_preempt_poll_pbdma);
if (branches & F_PREEMPT_POLL_TSG_NULL) {
nvgpu_preempt_poll_tsg_on_pbdma(g, NULL);
} else {
nvgpu_preempt_poll_tsg_on_pbdma(g, tsg); nvgpu_preempt_poll_tsg_on_pbdma(g, tsg);
}
if (branches & F_PREEMPT_POLL_TSG_NULL) { if (branches & F_PREEMPT_POLL_PBDMA_BUSY) {
unit_assert(stub[0].tsgid == NVGPU_INVALID_TSG_ID, unit_assert(stub[0].pbdma_id !=
nvgpu_ffs(f->runlist_info[0]->pbdma_bitmask),
goto done); goto done);
} else if (!(branches & F_PREEMPT_POLL_PBDMA_NULL)) { } else if (!(branches & F_PREEMPT_POLL_PBDMA_NULL)) {
unit_assert(stub[0].tsgid == 0, goto done); unit_assert(stub[0].tsgid == 0, goto done);