diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c
index 984a7d23f..793bf94d3 100644
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -185,18 +185,45 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch)
 void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
                        bool verbose, u32 rc_type)
 {
-       u32 engines;
+       u32 engines_mask = 0U;
+       int err;
 
-       /* stop context switching to prevent engine assignments from
-          changing until TSG is recovered */
        nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-       gr_gk20a_disable_ctxsw(g);
 
-       engines = g->ops.fifo.get_engines_mask_on_id(g, tsg->tsgid, true);
+       /* disable tsg so that it does not get scheduled again */
+       g->ops.fifo.disable_tsg(tsg);
 
-       if (engines != 0U) {
-               gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
-                                       rc_type);
+       /*
+        * stop context switching to prevent engine assignments from
+        * changing until engine status is checked to make sure tsg
+        * being recovered is not loaded on the engines
+        */
+       err = gr_gk20a_disable_ctxsw(g);
+
+       if (err != 0) {
+               /* if failed to disable ctxsw, just abort tsg */
+               nvgpu_err(g, "failed to disable ctxsw");
+       } else {
+               /* recover engines if tsg is loaded on the engines */
+               engines_mask = g->ops.fifo.get_engines_mask_on_id(g,
+                               tsg->tsgid, true);
+
+               /*
+                * it is ok to enable ctxsw before tsg is recovered. If engines
+                * is 0, no engine recovery is needed and if it is non zero,
+                * gk20a_fifo_recover will call get_engines_mask_on_id again.
+                * By that time if tsg is not on the engine, engine need not
+                * be reset.
+                */
+               err = gr_gk20a_enable_ctxsw(g);
+               if (err != 0) {
+                       nvgpu_err(g, "failed to enable ctxsw");
+               }
+       }
+
+       if (engines_mask != 0U) {
+               gk20a_fifo_recover(g, engines_mask, tsg->tsgid, true, true,
+                       verbose, rc_type);
        } else {
                if (nvgpu_tsg_mark_error(g, tsg) && verbose) {
                        gk20a_debug_dump(g);
@@ -205,7 +232,6 @@ void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
                gk20a_fifo_abort_tsg(g, tsg, false);
        }
 
-       gr_gk20a_enable_ctxsw(g);
        nvgpu_mutex_release(&g->dbg_sessions_lock);
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 108155222..cc00c5fc7 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -528,7 +528,14 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
                      .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
 }
 
-/* Stop processing (stall) context switches at FECS. */
+/**
+ * Stop processing (stall) context switches at FECS:-
+ * If fecs is sent stop_ctxsw method, elpg entry/exit cannot happen
+ * and may timeout. It could manifest as different error signatures
+ * depending on when stop_ctxsw fecs method gets sent with respect
+ * to pmu elpg sequence. It could come as pmu halt or abort or
+ * maybe ext error too.
+ */
 int gr_gk20a_disable_ctxsw(struct gk20a *g)
 {
        int err = 0;
@@ -538,8 +545,24 @@ int gr_gk20a_disable_ctxsw(struct gk20a *g)
        nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
        g->ctxsw_disable_count++;
        if (g->ctxsw_disable_count == 1) {
-               err = gr_gk20a_ctrl_ctxsw(g,
+               err = nvgpu_pg_elpg_disable(g);
+               if (err != 0) {
+                       nvgpu_err(g, "failed to disable elpg. not safe to "
+                                       "stop_ctxsw");
+                       /* stop ctxsw command is not sent */
+                       g->ctxsw_disable_count--;
+               } else {
+                       err = gr_gk20a_ctrl_ctxsw(g,
                                gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
+                       if (err != 0) {
+                               nvgpu_err(g, "failed to stop fecs ctxsw");
+                               /* stop ctxsw failed */
+                               g->ctxsw_disable_count--;
+                       }
+               }
+       } else {
+               nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
+                       g->ctxsw_disable_count);
        }
 
        nvgpu_mutex_release(&g->ctxsw_disable_lock);
@@ -554,12 +577,27 @@ int gr_gk20a_enable_ctxsw(struct gk20a *g)
        nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
 
        nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
+       if (g->ctxsw_disable_count == 0) {
+               goto ctxsw_already_enabled;
+       }
        g->ctxsw_disable_count--;
        WARN_ON(g->ctxsw_disable_count < 0);
        if (g->ctxsw_disable_count == 0) {
                err = gr_gk20a_ctrl_ctxsw(g,
                        gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
+               if (err != 0) {
+                       nvgpu_err(g, "failed to start fecs ctxsw");
+               } else {
+                       if (nvgpu_pg_elpg_enable(g) != 0) {
+                               nvgpu_err(g, "failed to enable elpg "
+                                       "after start_ctxsw");
+                       }
+               }
+       } else {
+               nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
+                       g->ctxsw_disable_count);
        }
+ctxsw_already_enabled:
        nvgpu_mutex_release(&g->ctxsw_disable_lock);
 
        return err;
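
Note: after this change gr_gk20a_disable_ctxsw() can fail (either nvgpu_pg_elpg_disable() or the stop_ctxsw FECS method may return an error), and on failure it rolls ctxsw_disable_count back, while gr_gk20a_enable_ctxsw() now returns early when the count is already zero. The sketch below is a minimal, standalone model of that refcount behaviour for illustration only; it is not nvgpu code, and the names mock_gpu, elpg_disable_fails and fecs_stop_fails are hypothetical stand-ins for the real driver state and failure paths.

/*
 * Standalone sketch (not nvgpu code) of the disable/enable refcount
 * behaviour introduced by the patch: a failed disable rolls the count
 * back, and enable becomes a no-op when the count is already zero.
 */
#include <stdio.h>
#include <stdbool.h>

struct mock_gpu {
        int ctxsw_disable_count;
        bool elpg_disable_fails;        /* stands in for nvgpu_pg_elpg_disable() failing */
        bool fecs_stop_fails;           /* stands in for the stop_ctxsw FECS method failing */
};

static int mock_disable_ctxsw(struct mock_gpu *g)
{
        int err = 0;

        g->ctxsw_disable_count++;
        if (g->ctxsw_disable_count == 1) {
                if (g->elpg_disable_fails) {
                        err = -1;
                        g->ctxsw_disable_count--;       /* ELPG still on: stop_ctxsw not sent */
                } else if (g->fecs_stop_fails) {
                        err = -1;
                        g->ctxsw_disable_count--;       /* stop_ctxsw failed: roll back */
                }
        }
        return err;
}

static int mock_enable_ctxsw(struct mock_gpu *g)
{
        if (g->ctxsw_disable_count == 0) {
                return 0;       /* already enabled: nothing to undo */
        }
        g->ctxsw_disable_count--;
        /* the real code restarts FECS ctxsw and re-enables ELPG when the count hits 0 */
        return 0;
}

int main(void)
{
        struct mock_gpu g = { .elpg_disable_fails = true };

        if (mock_disable_ctxsw(&g) != 0) {
                printf("disable failed, count=%d\n", g.ctxsw_disable_count);
        }
        /* an unmatched enable from an error path is now harmless */
        mock_enable_ctxsw(&g);
        printf("after enable, count=%d\n", g.ctxsw_disable_count);
        return 0;
}

Rolling the count back on failure keeps disable/enable calls balanced, and the early return in enable means a caller that never managed to stop ctxsw (such as the error leg in nvgpu_tsg_recover above) cannot drive ctxsw_disable_count negative.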