gpu: nvgpu: move nvgpu_tsg_recover to common/rc

Moved nvgpu_tsg_recover() from common/tsg to common/rc and renamed it to
nvgpu_rc_tsg_and_related_engines().

JIRA NVGPU-1314

Change-Id: I887d5fcdb15def13cc74e2993312b3b36119c97c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2095622
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:    Seema Khowala <seemaj@nvidia.com>
Date:      2019-04-11 13:54:24 -07:00
Committer: mobile promotions
Commit:    03b521d9d7
Parent:    c570ba99ed

4 changed files with 76 additions and 73 deletions
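
For callers, this is a pure move and rename: the recovery entry point keeps the
same parameters (the bool verbose flag becomes debug_dump) but is now declared
in nvgpu/rc.h instead of nvgpu/tsg.h. A minimal sketch of an updated call site
follows; the helper name example_force_reset is hypothetical and only
illustrates the rename shown in the hunks below, and the include list simply
mirrors what common/rc code uses here (it is assumed to provide the RC_TYPE_*
recovery-type defines).

#include <nvgpu/gk20a.h>
#include <nvgpu/fifo.h>
#include <nvgpu/tsg.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/rc.h>

/* Hypothetical caller, for illustration only. */
static void example_force_reset(struct gk20a *g, struct tsg_gk20a *tsg,
        u32 err_code, bool verbose)
{
    nvgpu_tsg_set_error_notifier(g, tsg, err_code);

    /* was: nvgpu_tsg_recover(g, tsg, verbose, RC_TYPE_FORCE_RESET); */
    nvgpu_rc_tsg_and_related_engines(g, tsg, verbose,
            RC_TYPE_FORCE_RESET);
}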


@@ -264,71 +264,6 @@ void nvgpu_tsg_unbind_channel_check_ctx_reload(struct tsg_gk20a *tsg,
     }
 }
 
-void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
-        bool verbose, u32 rc_type)
-{
-    u32 engines_mask = 0U;
-    int err;
-
-    nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-
-    /* disable tsg so that it does not get scheduled again */
-    g->ops.tsg.disable(tsg);
-
-    /*
-     * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
-     * fifo_engine_status register. Also while the engine is held in reset
-     * h/w passes busy/idle straight through. fifo_engine_status registers
-     * are correct in that there is no context switch outstanding
-     * as the CTXSW is aborted when reset is asserted.
-     */
-    nvgpu_log_info(g, "acquire engines_reset_mutex");
-    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
-
-    /*
-     * stop context switching to prevent engine assignments from
-     * changing until engine status is checked to make sure tsg
-     * being recovered is not loaded on the engines
-     */
-    err = g->ops.gr.falcon.disable_ctxsw(g, g->gr.falcon);
-    if (err != 0) {
-        /* if failed to disable ctxsw, just abort tsg */
-        nvgpu_err(g, "failed to disable ctxsw");
-    } else {
-        /* recover engines if tsg is loaded on the engines */
-        engines_mask = g->ops.engine.get_mask_on_id(g,
-                tsg->tsgid, true);
-
-        /*
-         * it is ok to enable ctxsw before tsg is recovered. If engines
-         * is 0, no engine recovery is needed and if it is non zero,
-         * gk20a_fifo_recover will call get_mask_on_id again.
-         * By that time if tsg is not on the engine, engine need not
-         * be reset.
-         */
-        err = g->ops.gr.falcon.enable_ctxsw(g, g->gr.falcon);
-        if (err != 0) {
-            nvgpu_err(g, "failed to enable ctxsw");
-        }
-    }
-
-    nvgpu_log_info(g, "release engines_reset_mutex");
-    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
-
-    if (engines_mask != 0U) {
-        gk20a_fifo_recover(g, engines_mask, tsg->tsgid, true, true,
-                verbose, rc_type);
-    } else {
-        if (nvgpu_tsg_mark_error(g, tsg) && verbose) {
-            gk20a_debug_dump(g);
-        }
-        nvgpu_tsg_abort(g, tsg, false);
-    }
-
-    nvgpu_mutex_release(&g->dbg_sessions_lock);
-}
-
 static void nvgpu_tsg_destroy(struct gk20a *g, struct tsg_gk20a *tsg)
 {
     nvgpu_mutex_destroy(&tsg->event_id_list_lock);
@@ -344,7 +279,8 @@ int nvgpu_tsg_force_reset_ch(struct channel_gk20a *ch,
     if (tsg != NULL) {
         nvgpu_tsg_set_error_notifier(g, tsg, err_code);
-        nvgpu_tsg_recover(g, tsg, verbose, RC_TYPE_FORCE_RESET);
+        nvgpu_rc_tsg_and_related_engines(g, tsg, verbose,
+                RC_TYPE_FORCE_RESET);
     } else {
         nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
     }


@@ -24,6 +24,7 @@
 #include <nvgpu/gk20a.h>
 #include <nvgpu/fifo.h>
 #include <nvgpu/engines.h>
+#include <nvgpu/debug.h>
 #include <nvgpu/channel.h>
 #include <nvgpu/tsg.h>
 #include <nvgpu/error_notifier.h>
@@ -64,7 +65,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, struct fifo_gk20a *f,
         struct tsg_gk20a *tsg = &f->tsg[id];
 
         nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
-        nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PBDMA_FAULT);
+        nvgpu_rc_tsg_and_related_engines(g, tsg, true,
+                RC_TYPE_PBDMA_FAULT);
     } else if(pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) {
         struct channel_gk20a *ch = gk20a_channel_from_id(g, id);
         struct tsg_gk20a *tsg;
@@ -76,7 +78,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, struct fifo_gk20a *f,
         tsg = tsg_gk20a_from_ch(ch);
         if (tsg != NULL) {
             nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
-            nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PBDMA_FAULT);
+            nvgpu_rc_tsg_and_related_engines(g, tsg, true,
+                    RC_TYPE_PBDMA_FAULT);
         } else {
             nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
         }
@@ -102,7 +105,7 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct tsg_gk20a *tsg)
     nvgpu_tsg_set_error_notifier(g, tsg,
             NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
-    nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
+    nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
 }
 
 void nvgpu_rc_gr_fault(struct gk20a *g, struct tsg_gk20a *tsg,
@@ -135,5 +138,70 @@ void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g)
 {
     /* id is unknown, preempt all runlists and do recovery */
     gk20a_fifo_recover(g, 0, INVAL_ID, false, false, false,
             RC_TYPE_SCHED_ERR);
 }
+
+void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct tsg_gk20a *tsg,
+        bool debug_dump, u32 rc_type)
+{
+    u32 eng_bitmask = 0U;
+    int err;
+
+    nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+
+    /* disable tsg so that it does not get scheduled again */
+    g->ops.tsg.disable(tsg);
+
+    /*
+     * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+     * fifo_engine_status register. Also while the engine is held in reset
+     * h/w passes busy/idle straight through. fifo_engine_status registers
+     * are correct in that there is no context switch outstanding
+     * as the CTXSW is aborted when reset is asserted.
+     */
+    nvgpu_log_info(g, "acquire engines_reset_mutex");
+    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
+    /*
+     * stop context switching to prevent engine assignments from
+     * changing until engine status is checked to make sure tsg
+     * being recovered is not loaded on the engines
+     */
+    err = g->ops.gr.falcon.disable_ctxsw(g, g->gr.falcon);
+    if (err != 0) {
+        /* if failed to disable ctxsw, just abort tsg */
+        nvgpu_err(g, "failed to disable ctxsw");
+    } else {
+        /* recover engines if tsg is loaded on the engines */
+        eng_bitmask = g->ops.engine.get_mask_on_id(g,
+                tsg->tsgid, true);
+
+        /*
+         * it is ok to enable ctxsw before tsg is recovered. If engines
+         * is 0, no engine recovery is needed and if it is non zero,
+         * gk20a_fifo_recover will call get_mask_on_id again.
+         * By that time if tsg is not on the engine, engine need not
+         * be reset.
+         */
+        err = g->ops.gr.falcon.enable_ctxsw(g, g->gr.falcon);
+        if (err != 0) {
+            nvgpu_err(g, "failed to enable ctxsw");
+        }
+    }
+
+    nvgpu_log_info(g, "release engines_reset_mutex");
+    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
+    if (eng_bitmask != 0U) {
+        gk20a_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true,
+                debug_dump, rc_type);
+    } else {
+        if (nvgpu_tsg_mark_error(g, tsg) && debug_dump) {
+            gk20a_debug_dump(g);
+        }
+        nvgpu_tsg_abort(g, tsg, false);
+    }
+
+    nvgpu_mutex_release(&g->dbg_sessions_lock);
+}


@@ -54,5 +54,7 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct tsg_gk20a *tsg);
 void nvgpu_rc_gr_fault(struct gk20a *g,
         struct tsg_gk20a *tsg, struct channel_gk20a *ch);
 void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g);
+void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct tsg_gk20a *tsg,
+        bool debug_dump, u32 rc_type);
 
 #endif /* NVGPU_RC_H */


@@ -106,9 +106,6 @@ void nvgpu_tsg_unbind_channel_check_ctx_reload(struct tsg_gk20a *tsg,
         struct nvgpu_channel_hw_state *hw_state);
 int nvgpu_tsg_force_reset_ch(struct channel_gk20a *ch,
         u32 err_code, bool verbose);
-void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
-        bool verbose, u32 rc_type);
 void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g,
         struct tsg_gk20a *tsg);
 bool nvgpu_tsg_mark_error(struct gk20a *g, struct tsg_gk20a *tsg);