Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git (synced 2025-12-23 09:57:08 +03:00)

gpu: nvgpu: move nvgpu_tsg_recover to common/rc

Moved from common/tsg to common/rc and renamed
nvgpu_tsg_recover -> nvgpu_rc_tsg_and_related_engines

JIRA NVGPU-1314

Change-Id: I887d5fcdb15def13cc74e2993312b3b36119c97c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2095622
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Committed by: mobile promotions
Parent: c570ba99ed
Commit: 03b521d9d7
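
For reference, a minimal, hypothetical call-site sketch of the rename (not part of the commit): the wrapper example_force_reset() is invented for illustration, and g, tsg and verbose are assumed to come from the caller, as in nvgpu_tsg_force_reset_ch() in the diff below. Only the header and the renamed call reflect this change.

/*
 * Hypothetical migration sketch -- not part of this commit.
 * nvgpu_rc_tsg_and_related_engines() is declared in <nvgpu/rc.h>;
 * the old nvgpu_tsg_recover() declaration is removed from <nvgpu/tsg.h>.
 */
#include <nvgpu/gk20a.h>
#include <nvgpu/tsg.h>
#include <nvgpu/rc.h>

static void example_force_reset(struct gk20a *g, struct tsg_gk20a *tsg,
                bool verbose)
{
        /* before: nvgpu_tsg_recover(g, tsg, verbose, RC_TYPE_FORCE_RESET); */
        nvgpu_rc_tsg_and_related_engines(g, tsg, verbose,
                        RC_TYPE_FORCE_RESET);
}
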
@@ -264,71 +264,6 @@ void nvgpu_tsg_unbind_channel_check_ctx_reload(struct tsg_gk20a *tsg,
         }
 }
 
-void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
-                bool verbose, u32 rc_type)
-{
-        u32 engines_mask = 0U;
-        int err;
-
-        nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-
-        /* disable tsg so that it does not get scheduled again */
-        g->ops.tsg.disable(tsg);
-
-        /*
-         * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
-         * fifo_engine_status register. Also while the engine is held in reset
-         * h/w passes busy/idle straight through. fifo_engine_status registers
-         * are correct in that there is no context switch outstanding
-         * as the CTXSW is aborted when reset is asserted.
-         */
-        nvgpu_log_info(g, "acquire engines_reset_mutex");
-        nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
-
-        /*
-         * stop context switching to prevent engine assignments from
-         * changing until engine status is checked to make sure tsg
-         * being recovered is not loaded on the engines
-         */
-        err = g->ops.gr.falcon.disable_ctxsw(g, g->gr.falcon);
-
-        if (err != 0) {
-                /* if failed to disable ctxsw, just abort tsg */
-                nvgpu_err(g, "failed to disable ctxsw");
-        } else {
-                /* recover engines if tsg is loaded on the engines */
-                engines_mask = g->ops.engine.get_mask_on_id(g,
-                                tsg->tsgid, true);
-
-                /*
-                 * it is ok to enable ctxsw before tsg is recovered. If engines
-                 * is 0, no engine recovery is needed and if it is non zero,
-                 * gk20a_fifo_recover will call get_mask_on_id again.
-                 * By that time if tsg is not on the engine, engine need not
-                 * be reset.
-                 */
-                err = g->ops.gr.falcon.enable_ctxsw(g, g->gr.falcon);
-                if (err != 0) {
-                        nvgpu_err(g, "failed to enable ctxsw");
-                }
-        }
-        nvgpu_log_info(g, "release engines_reset_mutex");
-        nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
-
-        if (engines_mask != 0U) {
-                gk20a_fifo_recover(g, engines_mask, tsg->tsgid, true, true,
-                                verbose, rc_type);
-        } else {
-                if (nvgpu_tsg_mark_error(g, tsg) && verbose) {
-                        gk20a_debug_dump(g);
-                }
-
-                nvgpu_tsg_abort(g, tsg, false);
-        }
-
-        nvgpu_mutex_release(&g->dbg_sessions_lock);
-}
-
 static void nvgpu_tsg_destroy(struct gk20a *g, struct tsg_gk20a *tsg)
 {
         nvgpu_mutex_destroy(&tsg->event_id_list_lock);

@@ -344,7 +279,8 @@ int nvgpu_tsg_force_reset_ch(struct channel_gk20a *ch,
 
         if (tsg != NULL) {
                 nvgpu_tsg_set_error_notifier(g, tsg, err_code);
-                nvgpu_tsg_recover(g, tsg, verbose, RC_TYPE_FORCE_RESET);
+                nvgpu_rc_tsg_and_related_engines(g, tsg, verbose,
+                                RC_TYPE_FORCE_RESET);
         } else {
                 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
         }

@@ -24,6 +24,7 @@
 #include <nvgpu/gk20a.h>
 #include <nvgpu/fifo.h>
 #include <nvgpu/engines.h>
+#include <nvgpu/debug.h>
 #include <nvgpu/channel.h>
 #include <nvgpu/tsg.h>
 #include <nvgpu/error_notifier.h>

@@ -64,7 +65,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, struct fifo_gk20a *f,
                 struct tsg_gk20a *tsg = &f->tsg[id];
 
                 nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
-                nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PBDMA_FAULT);
+                nvgpu_rc_tsg_and_related_engines(g, tsg, true,
+                                RC_TYPE_PBDMA_FAULT);
         } else if(pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) {
                 struct channel_gk20a *ch = gk20a_channel_from_id(g, id);
                 struct tsg_gk20a *tsg;

@@ -76,7 +78,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, struct fifo_gk20a *f,
                 tsg = tsg_gk20a_from_ch(ch);
                 if (tsg != NULL) {
                         nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
-                        nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PBDMA_FAULT);
+                        nvgpu_rc_tsg_and_related_engines(g, tsg, true,
+                                        RC_TYPE_PBDMA_FAULT);
                 } else {
                         nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
                 }

@@ -102,7 +105,7 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct tsg_gk20a *tsg)
         nvgpu_tsg_set_error_notifier(g, tsg,
                         NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 
-        nvgpu_tsg_recover(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
+        nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
 }
 
 void nvgpu_rc_gr_fault(struct gk20a *g, struct tsg_gk20a *tsg,

@@ -135,5 +138,70 @@ void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g)
 {
         /* id is unknown, preempt all runlists and do recovery */
         gk20a_fifo_recover(g, 0, INVAL_ID, false, false, false,
-                        RC_TYPE_SCHED_ERR);
+                        RC_TYPE_SCHED_ERR);
 }
+
+void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct tsg_gk20a *tsg,
+                bool debug_dump, u32 rc_type)
+{
+        u32 eng_bitmask = 0U;
+        int err;
+
+        nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+
+        /* disable tsg so that it does not get scheduled again */
+        g->ops.tsg.disable(tsg);
+
+        /*
+         * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+         * fifo_engine_status register. Also while the engine is held in reset
+         * h/w passes busy/idle straight through. fifo_engine_status registers
+         * are correct in that there is no context switch outstanding
+         * as the CTXSW is aborted when reset is asserted.
+         */
+        nvgpu_log_info(g, "acquire engines_reset_mutex");
+        nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
+        /*
+         * stop context switching to prevent engine assignments from
+         * changing until engine status is checked to make sure tsg
+         * being recovered is not loaded on the engines
+         */
+        err = g->ops.gr.falcon.disable_ctxsw(g, g->gr.falcon);
+
+        if (err != 0) {
+                /* if failed to disable ctxsw, just abort tsg */
+                nvgpu_err(g, "failed to disable ctxsw");
+        } else {
+                /* recover engines if tsg is loaded on the engines */
+                eng_bitmask = g->ops.engine.get_mask_on_id(g,
+                                tsg->tsgid, true);
+
+                /*
+                 * it is ok to enable ctxsw before tsg is recovered. If engines
+                 * is 0, no engine recovery is needed and if it is non zero,
+                 * gk20a_fifo_recover will call get_mask_on_id again.
+                 * By that time if tsg is not on the engine, engine need not
+                 * be reset.
+                 */
+                err = g->ops.gr.falcon.enable_ctxsw(g, g->gr.falcon);
+                if (err != 0) {
+                        nvgpu_err(g, "failed to enable ctxsw");
+                }
+        }
+        nvgpu_log_info(g, "release engines_reset_mutex");
+        nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
+        if (eng_bitmask != 0U) {
+                gk20a_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true,
+                                debug_dump, rc_type);
+        } else {
+                if (nvgpu_tsg_mark_error(g, tsg) && debug_dump) {
+                        gk20a_debug_dump(g);
+                }
+
+                nvgpu_tsg_abort(g, tsg, false);
+        }
+
+        nvgpu_mutex_release(&g->dbg_sessions_lock);
+}

@@ -54,5 +54,7 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct tsg_gk20a *tsg);
 void nvgpu_rc_gr_fault(struct gk20a *g,
                 struct tsg_gk20a *tsg, struct channel_gk20a *ch);
 void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g);
+void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct tsg_gk20a *tsg,
+                bool debug_dump, u32 rc_type);
 
 #endif /* NVGPU_RC_H */

@@ -106,9 +106,6 @@ void nvgpu_tsg_unbind_channel_check_ctx_reload(struct tsg_gk20a *tsg,
                 struct nvgpu_channel_hw_state *hw_state);
 int nvgpu_tsg_force_reset_ch(struct channel_gk20a *ch,
                 u32 err_code, bool verbose);
-void nvgpu_tsg_recover(struct gk20a *g, struct tsg_gk20a *tsg,
-                bool verbose, u32 rc_type);
-
 void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g,
                 struct tsg_gk20a *tsg);
 bool nvgpu_tsg_mark_error(struct gk20a *g, struct tsg_gk20a *tsg);