/* * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include void nvgpu_rc_fifo_recover(struct gk20a *g, u32 eng_bitmask, u32 hw_id, bool id_is_tsg, bool id_is_known, bool debug_dump, u32 rc_type) { #ifdef CONFIG_NVGPU_RECOVERY unsigned int id_type; if (debug_dump) { gk20a_debug_dump(g); } if (g->ops.ltc.flush != NULL) { g->ops.ltc.flush(g); } if (id_is_known) { id_type = id_is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL; } else { id_type = ID_TYPE_UNKNOWN; } g->ops.fifo.recover(g, eng_bitmask, hw_id, id_type, rc_type, NULL); #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, struct nvgpu_tsg *tsg, bool debug_dump) { #ifdef CONFIG_NVGPU_RECOVERY nvgpu_tsg_set_error_notifier(g, tsg, NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); #ifdef CONFIG_NVGPU_CHANNEL_WDT /* * Cancel all channels' wdt since ctxsw timeout might * trigger multiple watchdogs at a time */ nvgpu_channel_wdt_restart_all_channels(g); #endif nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump, RC_TYPE_CTXSW_TIMEOUT); #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier, struct nvgpu_pbdma_status_info *pbdma_status) { #ifdef CONFIG_NVGPU_RECOVERY u32 id; u32 id_type = PBDMA_STATUS_ID_TYPE_INVALID; nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d", pbdma_id, error_notifier); if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) || nvgpu_pbdma_status_is_chsw_save(pbdma_status)) { id = pbdma_status->id; id_type = pbdma_status->id_type; } else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) || nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) { id = pbdma_status->next_id; id_type = pbdma_status->next_id_type; } else { /* Nothing to do here */ nvgpu_err(g, "Invalid pbdma_status.id"); return; } if (id_type == PBDMA_STATUS_ID_TYPE_TSGID) { struct nvgpu_tsg *tsg = nvgpu_tsg_get_from_id(g, id); nvgpu_tsg_set_error_notifier(g, tsg, error_notifier); nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PBDMA_FAULT); } else if(id_type == PBDMA_STATUS_ID_TYPE_CHID) { struct nvgpu_channel *ch = nvgpu_channel_from_id(g, id); struct nvgpu_tsg *tsg; if (ch == NULL) { nvgpu_err(g, "channel is not referenceable"); return; } tsg = nvgpu_tsg_from_ch(ch); if (tsg != NULL) { nvgpu_tsg_set_error_notifier(g, tsg, error_notifier); nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PBDMA_FAULT); } else { nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid); } nvgpu_channel_put(ch); } else { nvgpu_err(g, "Invalid pbdma_status.id_type"); } #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id) { #ifdef CONFIG_NVGPU_RECOVERY u32 eng_bitmask = nvgpu_engine_get_runlist_busy_engines(g, runlist_id); if (eng_bitmask != 0U) { nvgpu_rc_fifo_recover(g, eng_bitmask, INVAL_ID, false, false, true, RC_TYPE_RUNLIST_UPDATE_TIMEOUT); } #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_preempt_timeout(struct gk20a *g, struct nvgpu_tsg *tsg) { #ifdef CONFIG_NVGPU_RECOVERY nvgpu_tsg_set_error_notifier(g, tsg, NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT); #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_gr_fault(struct gk20a *g, struct nvgpu_tsg *tsg, struct nvgpu_channel *ch) { #ifdef CONFIG_NVGPU_RECOVERY u32 gr_engine_id; u32 gr_eng_bitmask = 0U; gr_engine_id = nvgpu_engine_get_gr_id(g); if (gr_engine_id != NVGPU_INVALID_ENG_ID) { gr_eng_bitmask = BIT32(gr_engine_id); } else { nvgpu_warn(g, "gr_engine_id is invalid"); } if (tsg != NULL) { nvgpu_rc_fifo_recover(g, gr_eng_bitmask, tsg->tsgid, true, true, true, RC_TYPE_GR_FAULT); } else { if (ch != NULL) { nvgpu_err(g, "chid: %d referenceable but not " "bound to tsg", ch->chid); } nvgpu_rc_fifo_recover(g, gr_eng_bitmask, INVAL_ID, false, false, true, RC_TYPE_GR_FAULT); } #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g) { #ifdef CONFIG_NVGPU_RECOVERY /* id is unknown, preempt all runlists and do recovery */ nvgpu_rc_fifo_recover(g, 0, INVAL_ID, false, false, false, RC_TYPE_SCHED_ERR); #else WARN_ON(!g->sw_quiesce_pending); #endif } void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct nvgpu_tsg *tsg, bool debug_dump, u32 rc_type) { #ifdef CONFIG_NVGPU_RECOVERY u32 eng_bitmask = 0U; int err = 0; #ifdef CONFIG_NVGPU_DEBUGGER nvgpu_mutex_acquire(&g->dbg_sessions_lock); #endif /* disable tsg so that it does not get scheduled again */ g->ops.tsg.disable(tsg); /* * On hitting engine reset, h/w drops the ctxsw_status to INVALID in * fifo_engine_status register. Also while the engine is held in reset * h/w passes busy/idle straight through. fifo_engine_status registers * are correct in that there is no context switch outstanding * as the CTXSW is aborted when reset is asserted. */ nvgpu_log_info(g, "acquire engines_reset_mutex"); nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex); /* * stop context switching to prevent engine assignments from * changing until engine status is checked to make sure tsg * being recovered is not loaded on the engines */ err = nvgpu_gr_disable_ctxsw(g); if (err != 0) { /* if failed to disable ctxsw, just abort tsg */ nvgpu_err(g, "failed to disable ctxsw"); } else { /* recover engines if tsg is loaded on the engines */ eng_bitmask = g->ops.engine.get_mask_on_id(g, tsg->tsgid, true); /* * it is ok to enable ctxsw before tsg is recovered. If engines * is 0, no engine recovery is needed and if it is non zero, * gk20a_fifo_recover will call get_mask_on_id again. * By that time if tsg is not on the engine, engine need not * be reset. */ err = nvgpu_gr_enable_ctxsw(g); if (err != 0) { nvgpu_err(g, "failed to enable ctxsw"); } } nvgpu_log_info(g, "release engines_reset_mutex"); nvgpu_mutex_release(&g->fifo.engines_reset_mutex); if (eng_bitmask != 0U) { nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump, rc_type); } else { if (nvgpu_tsg_mark_error(g, tsg) && debug_dump) { gk20a_debug_dump(g); } nvgpu_tsg_abort(g, tsg, false); } #ifdef CONFIG_NVGPU_DEBUGGER nvgpu_mutex_release(&g->dbg_sessions_lock); #endif #else WARN_ON(!g->sw_quiesce_pending); #endif }