diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c
index 625095bd9..8946b314a 100644
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -264,71 +264,6 @@ void nvgpu_tsg_unbind_channel_check_ctx_reload(struct tsg_gk20a *tsg,
 	}
 }
 
-void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
-		bool verbose, u32 rc_type)
-{
-	u32 engines_mask = 0U;
-	int err;
-
-	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-
-	/* disable tsg so that it does not get scheduled again */
-	g->ops.tsg.disable(tsg);
-
-	/*
-	 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
-	 * fifo_engine_status register. Also while the engine is held in reset
-	 * h/w passes busy/idle straight through. fifo_engine_status registers
-	 * are correct in that there is no context switch outstanding
-	 * as the CTXSW is aborted when reset is asserted.
-	 */
-	nvgpu_log_info(g, "acquire engines_reset_mutex");
-	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
-
-	/*
-	 * stop context switching to prevent engine assignments from
-	 * changing until engine status is checked to make sure tsg
-	 * being recovered is not loaded on the engines
-	 */
-	err = g->ops.gr.falcon.disable_ctxsw(g, g->gr.falcon);
-
-	if (err != 0) {
-		/* if failed to disable ctxsw, just abort tsg */
-		nvgpu_err(g, "failed to disable ctxsw");
-	} else {
-		/* recover engines if tsg is loaded on the engines */
-		engines_mask = g->ops.engine.get_mask_on_id(g,
-				tsg->tsgid, true);
-
-		/*
-		 * it is ok to enable ctxsw before tsg is recovered. If engines
-		 * is 0, no engine recovery is needed and if it is non zero,
-		 * gk20a_fifo_recover will call get_mask_on_id again.
-		 * By that time if tsg is not on the engine, engine need not
-		 * be reset.
-		 */
-		err = g->ops.gr.falcon.enable_ctxsw(g, g->gr.falcon);
-		if (err != 0) {
-			nvgpu_err(g, "failed to enable ctxsw");
-		}
-	}
-	nvgpu_log_info(g, "release engines_reset_mutex");
-	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
-
-	if (engines_mask != 0U) {
-		gk20a_fifo_recover(g, engines_mask, tsg->tsgid, true, true,
-				verbose, rc_type);
-	} else {
-		if (nvgpu_tsg_mark_error(g, tsg) && verbose) {
-			gk20a_debug_dump(g);
-		}
-
-		nvgpu_tsg_abort(g, tsg, false);
-	}
-
-	nvgpu_mutex_release(&g->dbg_sessions_lock);
-}
-
 static void nvgpu_tsg_destroy(struct gk20a *g, struct tsg_gk20a *tsg)
 {
 	nvgpu_mutex_destroy(&tsg->event_id_list_lock);
@@ -344,7 +279,8 @@ int nvgpu_tsg_force_reset_ch(struct channel_gk20a *ch,
 
 	if (tsg != NULL) {
 		nvgpu_tsg_set_error_notifier(g, tsg, err_code);
-		nvgpu_tsg_recover(g, tsg, verbose, RC_TYPE_FORCE_RESET);
+		nvgpu_rc_tsg_and_related_engines(g, tsg, verbose,
+			RC_TYPE_FORCE_RESET);
 	} else {
 		nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
 	}
diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c
index daeaaaae5..16e102606 100644
--- a/drivers/gpu/nvgpu/common/rc/rc.c
+++ b/drivers/gpu/nvgpu/common/rc/rc.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -64,7 +65,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, struct fifo_gk20a *f,
 		struct tsg_gk20a *tsg = &f->tsg[id];
 
 		nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
-		nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PBDMA_FAULT);
+		nvgpu_rc_tsg_and_related_engines(g, tsg, true,
+				RC_TYPE_PBDMA_FAULT);
 	} else if(pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) {
 		struct channel_gk20a *ch = gk20a_channel_from_id(g, id);
 		struct tsg_gk20a *tsg;
@@ -76,7 +78,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, struct fifo_gk20a *f,
 		tsg = tsg_gk20a_from_ch(ch);
 		if (tsg != NULL) {
 			nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
-			nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PBDMA_FAULT);
+			nvgpu_rc_tsg_and_related_engines(g, tsg, true,
+					RC_TYPE_PBDMA_FAULT);
 		} else {
 			nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
 		}
@@ -102,7 +105,7 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct tsg_gk20a *tsg)
 	nvgpu_tsg_set_error_notifier(g, tsg,
 			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 
-	nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
+	nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
 }
 
 void nvgpu_rc_gr_fault(struct gk20a *g, struct tsg_gk20a *tsg,
@@ -135,5 +138,70 @@ void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g)
 {
 	/* id is unknown, preempt all runlists and do recovery */
 	gk20a_fifo_recover(g, 0, INVAL_ID, false, false, false,
-			RC_TYPE_SCHED_ERR);
+		RC_TYPE_SCHED_ERR);
+}
+
+void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct tsg_gk20a *tsg,
+		bool debug_dump, u32 rc_type)
+{
+	u32 eng_bitmask = 0U;
+	int err;
+
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+
+	/* disable tsg so that it does not get scheduled again */
+	g->ops.tsg.disable(tsg);
+
+	/*
+	 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+	 * fifo_engine_status register. Also while the engine is held in reset
+	 * h/w passes busy/idle straight through. fifo_engine_status registers
+	 * are correct in that there is no context switch outstanding
+	 * as the CTXSW is aborted when reset is asserted.
+	 */
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
+	/*
+	 * stop context switching to prevent engine assignments from
+	 * changing until engine status is checked to make sure tsg
+	 * being recovered is not loaded on the engines
+	 */
+	err = g->ops.gr.falcon.disable_ctxsw(g, g->gr.falcon);
+
+	if (err != 0) {
+		/* if failed to disable ctxsw, just abort tsg */
+		nvgpu_err(g, "failed to disable ctxsw");
+	} else {
+		/* recover engines if tsg is loaded on the engines */
+		eng_bitmask = g->ops.engine.get_mask_on_id(g,
+				tsg->tsgid, true);
+
+		/*
+		 * it is ok to enable ctxsw before tsg is recovered. If engines
+		 * is 0, no engine recovery is needed and if it is non zero,
+		 * gk20a_fifo_recover will call get_mask_on_id again.
+		 * By that time if tsg is not on the engine, engine need not
+		 * be reset.
+		 */
+		err = g->ops.gr.falcon.enable_ctxsw(g, g->gr.falcon);
+		if (err != 0) {
+			nvgpu_err(g, "failed to enable ctxsw");
+		}
+	}
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
+	if (eng_bitmask != 0U) {
+		gk20a_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true,
+				debug_dump, rc_type);
+	} else {
+		if (nvgpu_tsg_mark_error(g, tsg) && debug_dump) {
+			gk20a_debug_dump(g);
+		}
+
+		nvgpu_tsg_abort(g, tsg, false);
+	}
+
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h
index fff3ecf14..69096065c 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/rc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h
@@ -54,5 +54,7 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct tsg_gk20a *tsg);
 void nvgpu_rc_gr_fault(struct gk20a *g, struct tsg_gk20a *tsg,
 		struct channel_gk20a *ch);
 void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g);
+void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct tsg_gk20a *tsg,
+		bool debug_dump, u32 rc_type);
 
 #endif /* NVGPU_RC_H */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
index 67a37eb9c..6a436ac01 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
@@ -106,9 +106,6 @@ void nvgpu_tsg_unbind_channel_check_ctx_reload(struct tsg_gk20a *tsg,
 		struct nvgpu_channel_hw_state *hw_state);
 int nvgpu_tsg_force_reset_ch(struct channel_gk20a *ch,
 		u32 err_code, bool verbose);
-void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
-		bool verbose, u32 rc_type);
-
 void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g,
 		struct tsg_gk20a *tsg);
 bool nvgpu_tsg_mark_error(struct gk20a *g, struct tsg_gk20a *tsg);
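
For reference only (not part of the patch itself): a minimal sketch of how an out-of-tree caller would migrate from the removed nvgpu_tsg_recover() to the relocated helper in common/rc/rc.c. The wrapper function name example_recover_tsg() and the choice of RC_TYPE_FORCE_RESET are illustrative assumptions; the nvgpu symbols used are the ones touched by the diff above.

/*
 * Illustrative sketch only, assuming valid g/tsg pointers and the usual
 * nvgpu headers (rc.h, tsg.h) on the include path.
 */
static void example_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
		u32 err_code)
{
	/* post the error notifier first, as the existing call sites do */
	nvgpu_tsg_set_error_notifier(g, tsg, err_code);

	/* old: nvgpu_tsg_recover(g, tsg, true, RC_TYPE_FORCE_RESET); */
	nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_FORCE_RESET);
}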