mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: Trigger quiesce on PBDMA preempt fail
During recovery, we preempt the faulty TSG from the PBDMAs and engines. If the TSG preempt on a PBDMA times out (timeout = 100 ms), the PBDMA might be in a hung state. We do not reset the HOST during recovery, so stuck PBDMAs are unrecoverable. Abort the recovery and trigger the GPU to quiesce, as there is no way back.

Triggering Quiesce from the recovery sequence should be fine, as the only redundant operation will be a write to the FIFO_RUNLIST_PREEMPT register. The error notifiers will eventually be set by the Quiesce thread.

Bug 2768005
JIRA NVGPU-4631

Change-Id: I914b9379aa8e48014e6ddace9abe47180a072863
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2368187
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
committed by Alex Waterman
parent f7a73f4ab8
commit 881a6f35be
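As a rough illustration of the control flow this change introduces, here is a minimal standalone C sketch. It is not nvgpu code: only the behavior of nvgpu_preempt_poll_tsg_on_pbdma() and nvgpu_sw_quiesce() in the diff below is being mirrored; all types, stub names, and the mask-walking loop are stand-ins invented for the sketch.

/*
 * Standalone sketch (assumptions, not nvgpu code): the PBDMA preempt
 * poll now reports failure, and the caller aborts recovery and
 * quiesces instead of continuing with a hung PBDMA.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-PBDMA preempt poll HAL; pretend the 100 ms
 * preempt timeout expired when simulate_hang is set. */
static int poll_pbdma_preempt(unsigned int tsgid, unsigned int pbdma_id,
                bool simulate_hang)
{
        (void)tsgid;
        (void)pbdma_id;
        return simulate_hang ? -EBUSY : 0;
}

/* Mirrors the shape of the reworked nvgpu_preempt_poll_tsg_on_pbdma():
 * walk the PBDMAs serving the runlist, return -EBUSY on the first one
 * that fails to preempt, 0 otherwise. */
static int preempt_poll_tsg_on_pbdma(unsigned int tsgid,
                unsigned int pbdma_mask, bool simulate_hang)
{
        for (unsigned int id = 0; pbdma_mask != 0U; id++, pbdma_mask >>= 1) {
                if ((pbdma_mask & 1U) == 0U) {
                        continue;
                }
                if (poll_pbdma_preempt(tsgid, id, simulate_hang) != 0) {
                        printf("PBDMA %u preempt failed\n", id);
                        return -EBUSY;
                }
        }
        return 0;
}

/* Stand-in for nvgpu_sw_quiesce(): a stuck PBDMA cannot be recovered
 * without a HOST reset, so recovery is abandoned here. */
static void sw_quiesce(void)
{
        printf("PBDMA stuck: aborting recovery, quiescing GPU\n");
}

/* Call-site pattern corresponding to the gv11b_fifo_recover() change. */
static void recover(unsigned int tsgid, unsigned int pbdma_mask,
                bool simulate_hang)
{
        if (preempt_poll_tsg_on_pbdma(tsgid, pbdma_mask, simulate_hang) != 0) {
                sw_quiesce();
                return;
        }
        printf("PBDMA phase of preempt done, recovery continues\n");
}

int main(void)
{
        recover(1U, 0x3U, false);       /* healthy path */
        recover(1U, 0x3U, true);        /* hung PBDMA -> quiesce */
        return 0;
}

The point carried over from the commit: the poll helper now propagates the timeout instead of returning void, and the recovery path treats that as fatal rather than pressing on with a hung PBDMA.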
@@ -112,7 +112,7 @@ int nvgpu_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch)
 }

 /* called from rc */
-void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
+int nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
                 struct nvgpu_tsg *tsg)
 {
         struct nvgpu_fifo *f = &g->fifo;
@@ -122,11 +122,7 @@ void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
         u32 tsgid, pbdma_id;

         if (g->ops.fifo.preempt_poll_pbdma == NULL) {
-                return;
-        }
-
-        if (tsg == NULL) {
-                return;
+                return 0;
         }

         tsgid = tsg->tsgid;
@@ -142,12 +138,11 @@ void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
                  * memory system would be blocked.
                  */
                 if (g->ops.fifo.preempt_poll_pbdma(g, tsgid, pbdma_id) != 0) {
                         nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
                                         pbdma_id,
                                         GPU_HOST_PBDMA_PREEMPT_ERROR, 0);
                         nvgpu_err(g, "PBDMA preempt failed");
+                        return -EBUSY;
                 }
         }
+        return 0;
 }

 /*
@@ -41,6 +41,7 @@
 #ifdef CONFIG_NVGPU_LS_PMU
 #include <nvgpu/pmu/mutex.h>
 #endif
+#include <nvgpu/nvgpu_init.h>

 #include "rc_gv11b.h"

@@ -223,8 +224,12 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
          * For each PBDMA which serves the runlist, poll to verify the TSG is no
          * longer on the PBDMA and the engine phase of the preempt has started.
          */
-        if (tsg != NULL) {
-                nvgpu_preempt_poll_tsg_on_pbdma(g, tsg);
+        if (tsg != NULL && (nvgpu_preempt_poll_tsg_on_pbdma(g, tsg) != 0)) {
+                nvgpu_err(g, "TSG preemption on PBDMA failed; "
+                                "PBDMA seems stuck; cannot recover stuck PBDMA.");
+                /* Trigger Quiesce as recovery failed on hung PBDMA. */
+                nvgpu_sw_quiesce(g);
+                return;
         }

 #ifdef CONFIG_NVGPU_DEBUGGER
@@ -70,7 +70,7 @@ int nvgpu_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch);
  * Called from recovery handling for volta onwards. This will
  * not be part of safety build after recovery is not supported in safety build.
  */
-void nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
+int nvgpu_preempt_poll_tsg_on_pbdma(struct gk20a *g,
                 struct nvgpu_tsg *tsg);
 /**
  * @brief Preempt a set of runlists.
@@ -162,13 +162,11 @@ done:
 }

 #define F_PREEMPT_POLL_PBDMA_NULL BIT(0)
-#define F_PREEMPT_POLL_TSG_NULL BIT(1)
-#define F_PREEMPT_POLL_PBDMA_BUSY BIT(2)
-#define F_PREEMPT_POLL_LAST BIT(3)
+#define F_PREEMPT_POLL_PBDMA_BUSY BIT(1)
+#define F_PREEMPT_POLL_LAST BIT(2)

 static const char *f_preempt_poll[] = {
         "preempt_poll_pbdma_null",
-        "tsg_null",
         "preempt_poll_pbdma_busy",
 };

@@ -197,7 +195,7 @@ int test_preempt_poll_tsg_on_pbdma(struct unit_module *m, struct gk20a *g,

         u32 branches = 0U;
         int ret = UNIT_FAIL;
-        u32 prune = F_PREEMPT_POLL_PBDMA_NULL | F_PREEMPT_POLL_TSG_NULL;
+        u32 prune = F_PREEMPT_POLL_PBDMA_NULL;

         tsg = nvgpu_tsg_open(g, getpid());
         unit_assert(tsg != NULL, goto done);
@@ -221,14 +219,11 @@ int test_preempt_poll_tsg_on_pbdma(struct unit_module *m, struct gk20a *g,
                                 stub_fifo_preempt_poll_pbdma_busy :
                                 stub_fifo_preempt_poll_pbdma);

-                if (branches & F_PREEMPT_POLL_TSG_NULL) {
-                        nvgpu_preempt_poll_tsg_on_pbdma(g, NULL);
-                } else {
-                        nvgpu_preempt_poll_tsg_on_pbdma(g, tsg);
-                }
+                nvgpu_preempt_poll_tsg_on_pbdma(g, tsg);

-                if (branches & F_PREEMPT_POLL_TSG_NULL) {
-                        unit_assert(stub[0].tsgid == NVGPU_INVALID_TSG_ID,
+                if (branches & F_PREEMPT_POLL_PBDMA_BUSY) {
+                        unit_assert(stub[0].pbdma_id !=
+                                nvgpu_ffs(f->runlist_info[0]->pbdma_bitmask),
                                 goto done);
                 } else if (!(branches & F_PREEMPT_POLL_PBDMA_NULL)) {
                         unit_assert(stub[0].tsgid == 0, goto done);