gpu: nvgpu: protect recovery with engines_reset_mutex

Rename gr_reset_mutex to engines_reset_mutex and acquire it
before initiating recovery. Recovery running in parallel with
engine reset is not recommended.
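
As an illustration (not part of the patch text), the recovery-side locking
order introduced here is roughly the following simplified sketch, using the
identifiers from the diff below; error handling and the actual recovery work
are omitted:

    /* Recovery/teardown paths: take engines_reset_mutex first, then the
     * per-runlist locks, so an engine reset cannot run concurrently.
     */
    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);

    for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
        nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
    }

    /* ... identify faulted engines, reset them, tear down the channel/TSG ... */

    for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
        nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
    }

    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);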

On hitting engine reset, h/w drops the ctxsw_status to INVALID in
fifo_engine_status register. Also while the engine is held in reset
h/w passes busy/idle straight through. fifo_engine_status registers
are correct in that there is no context switch outstanding
as the CTXSW is aborted when reset is asserted.

Use deferred_reset_mutex to protect the deferred_reset_pending variable.
If deferred_reset_pending is true, acquire engines_reset_mutex and call
gk20a_fifo_deferred_reset. gk20a_fifo_deferred_reset also re-checks
deferred_reset_pending before initiating the reset process.
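
For reference, the channel-free path now follows roughly this pattern
(a simplified sketch based on the diff below; unrelated teardown omitted):

    bool deferred_reset_pending;

    /* The flag is only read/written under deferred_reset_mutex. */
    nvgpu_mutex_acquire(&f->deferred_reset_mutex);
    deferred_reset_pending = g->fifo.deferred_reset_pending;
    nvgpu_mutex_release(&f->deferred_reset_mutex);

    if (deferred_reset_pending) {
        /* Serialize with any recovery/engine reset in progress. */
        nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
        /* gk20a_fifo_deferred_reset() re-checks the flag itself. */
        gk20a_fifo_deferred_reset(g, ch);
        nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
    }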

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I47de669a6203e0b2e9a8237ec4e4747339b9837c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2022373
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry-picked from cb91bf1e13 in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/2024901
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Debarshi Dutta
Date: 2019-04-30 15:11:31 +05:30
Committer: mobile promotions
Parent: 4d8ad643d6
Commit: 6509bb49da
4 changed files with 91 additions and 47 deletions

@@ -308,6 +308,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
     struct dbg_session_data *session_data, *tmp_s;
     struct dbg_session_channel_data *ch_data, *tmp;
     int err;
+    bool deferred_reset_pending;

     nvgpu_log_fn(g, " ");

@@ -381,18 +382,18 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
     /* if engine reset was deferred, perform it now */
     nvgpu_mutex_acquire(&f->deferred_reset_mutex);
-    if (g->fifo.deferred_reset_pending) {
-        nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
-            " deferred, running now");
-        /* if lock is already taken, a reset is taking place
-        so no need to repeat */
-        if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-            gk20a_fifo_deferred_reset(g, ch);
-            nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-        }
-    }
+    deferred_reset_pending = g->fifo.deferred_reset_pending;
     nvgpu_mutex_release(&f->deferred_reset_mutex);

+    if (deferred_reset_pending) {
+        nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
+            " deferred, running now");
+        nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+        gk20a_fifo_deferred_reset(g, ch);
+        nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+    }
+
     if (!gk20a_channel_as_bound(ch)) {
         goto unbind;
     }

@@ -910,9 +910,9 @@ int gk20a_init_fifo_setup_sw_common(struct gk20a *g)
         return err;
     }

-    err = nvgpu_mutex_init(&f->gr_reset_mutex);
+    err = nvgpu_mutex_init(&f->engines_reset_mutex);
     if (err) {
-        nvgpu_err(g, "failed to init gr_reset_mutex");
+        nvgpu_err(g, "failed to init engines_reset_mutex");
         return err;
     }

@@ -1581,14 +1581,22 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 {
     unsigned long engine_id, engines = 0U;
     struct tsg_gk20a *tsg;
+    bool deferred_reset_pending;
+    struct fifo_gk20a *f = &g->fifo;

     nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-    gr_gk20a_disable_ctxsw(g);

-    if (!g->fifo.deferred_reset_pending) {
-        goto clean_up;
+    nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+    deferred_reset_pending = g->fifo.deferred_reset_pending;
+    nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+    if (!deferred_reset_pending) {
+        nvgpu_mutex_release(&g->dbg_sessions_lock);
+        return 0;
     }
+
+    gr_gk20a_disable_ctxsw(g);

     tsg = tsg_gk20a_from_ch(ch);
     if (tsg != NULL) {
         engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);

@@ -1610,8 +1618,10 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
         }
     }

+    nvgpu_mutex_acquire(&f->deferred_reset_mutex);
     g->fifo.deferred_fault_engines = 0;
     g->fifo.deferred_reset_pending = false;
+    nvgpu_mutex_release(&f->deferred_reset_mutex);

 clean_up:
     gr_gk20a_enable_ctxsw(g);

@@ -1632,9 +1642,10 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
     bool verbose = true;
     u32 grfifo_ctl;
+    bool deferred_reset_pending = false;
+    struct fifo_gk20a *f = &g->fifo;

-    nvgpu_log_fn(g, " ");
-
-    g->fifo.deferred_reset_pending = false;
+    nvgpu_log_fn(g, " ");

     /* Disable power management */
     if (g->support_pmu) {

@@ -1661,6 +1672,9 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
         gk20a_debug_dump(g);
     }

+    nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+    g->fifo.deferred_reset_pending = false;
+    nvgpu_mutex_release(&f->deferred_reset_mutex);

     /* go through all faulted engines */
     for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) {

@@ -1761,17 +1775,17 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
                 g->fifo.deferred_fault_engines |= BIT(engine_id);
                 /* handled during channel free */
+                nvgpu_mutex_acquire(&f->deferred_reset_mutex);
                 g->fifo.deferred_reset_pending = true;
+                nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+                deferred_reset_pending = true;
+
                 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
                     "sm debugger attached,"
                     " deferring channel recovery to channel free");
             } else {
-                /* if lock is already taken, a reset is taking place
-                so no need to repeat */
-                if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-                    gk20a_fifo_reset_engine(g, engine_id);
-                    nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-                }
+                gk20a_fifo_reset_engine(g, engine_id);
             }
         }

@@ -1784,7 +1798,7 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
      * Disable the channel/TSG from hw and increment syncpoints.
      */
     if (tsg) {
-        if (g->fifo.deferred_reset_pending) {
+        if (deferred_reset_pending) {
             gk20a_disable_tsg(tsg);
         } else {
             if (!fake_fault) {

@@ -1847,6 +1861,9 @@ static bool gk20a_fifo_handle_mmu_fault(
     nvgpu_log_fn(g, " ");

+    nvgpu_log_info(g, "acquire engines_reset_mutex");
+    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
     nvgpu_log_info(g, "acquire runlist_lock for all runlists");
     for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
         nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);

@@ -1859,6 +1876,10 @@ static bool gk20a_fifo_handle_mmu_fault(
     for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
         nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
     }
+
+    nvgpu_log_info(g, "release engines_reset_mutex");
+    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
     return verbose;
 }

@@ -1953,6 +1974,16 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
     /* disable tsg so that it does not get scheduled again */
     g->ops.fifo.disable_tsg(tsg);

+    /*
+     * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+     * fifo_engine_status register. Also while the engine is held in reset
+     * h/w passes busy/idle straight through. fifo_engine_status registers
+     * are correct in that there is no context switch outstanding
+     * as the CTXSW is aborted when reset is asserted.
+     */
+    nvgpu_log_info(g, "acquire engines_reset_mutex");
+    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
     /*
      * stop context switching to prevent engine assignments from
      * changing until engine status is checked to make sure tsg

@@ -1980,6 +2011,9 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
         }
     }

+    nvgpu_log_info(g, "release engines_reset_mutex");
+    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
     if (engines) {
         gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
                 rc_type);

@@ -2030,6 +2064,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
     bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false;
     u32 rlid;

+    nvgpu_log_info(g, "acquire engines_reset_mutex");
+    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
     nvgpu_log_info(g, "acquire runlist_lock for all runlists");
     for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
         nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);

@@ -2094,6 +2131,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
     for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
         nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
     }
+
+    nvgpu_log_info(g, "release engines_reset_mutex");
+    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }

 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,

@@ -184,7 +184,7 @@ struct fifo_gk20a {
     /* zero-kref'd channels here */
     struct nvgpu_list_node free_chs;
     struct nvgpu_mutex free_chs_mutex;
-    struct nvgpu_mutex gr_reset_mutex;
+    struct nvgpu_mutex engines_reset_mutex;

     struct tsg_gk20a *tsg;
     struct nvgpu_mutex tsg_inuse_mutex;

@@ -1024,6 +1024,11 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
     u32 num_runlists = 0;
     unsigned long runlist_served_pbdmas;
+    bool deferred_reset_pending = false;
+
+    nvgpu_log_info(g, "acquire engines_reset_mutex");
+    nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);

     nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
     for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
         nvgpu_mutex_acquire(&f->runlist_info[rlid].

@@ -1094,8 +1099,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
     /* Disable runlist scheduler */
     gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);

-    g->fifo.deferred_reset_pending = false;
-
     /* Disable power management */
     if (g->support_pmu) {
         if (nvgpu_cg_pg_disable(g) != 0) {

@@ -1143,6 +1146,10 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
         }
     }

+    nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+    g->fifo.deferred_reset_pending = false;
+    nvgpu_mutex_release(&f->deferred_reset_mutex);
+
     /* check if engine reset should be deferred */
     for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {

@@ -1159,28 +1166,21 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
                     gk20a_fifo_should_defer_engine_reset(g,
                         engine_id, client_type, false)) {
                         g->fifo.deferred_fault_engines |=
                             BIT(engine_id);
                         /* handled during channel free */
-                        g->fifo.deferred_reset_pending = true;
-                        nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-                            "sm debugger attached,"
-                            " deferring channel recovery to channel free");
+                        nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+                        g->fifo.deferred_reset_pending = true;
+                        nvgpu_mutex_release(&f->deferred_reset_mutex);
+                        deferred_reset_pending = true;
+                        nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+                            "sm debugger attached,"
+                            " deferring channel recovery to channel free");
                 } else {
-                    /*
-                     * if lock is already taken, a reset is
-                     * taking place so no need to repeat
-                     */
-                    if (nvgpu_mutex_tryacquire(
-                        &g->fifo.gr_reset_mutex)) {
-                        gk20a_fifo_reset_engine(g,
-                            engine_id);
-                        nvgpu_mutex_release(
-                            &g->fifo.gr_reset_mutex);
-                    }
+                    gk20a_fifo_reset_engine(g, engine_id);
                 }
             }
         }

@@ -1191,7 +1191,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
     gk20a_ctxsw_trace_tsg_reset(g, tsg);
 #endif
     if (tsg) {
-        if (g->fifo.deferred_reset_pending) {
+        if (deferred_reset_pending) {
             gk20a_disable_tsg(tsg);
         } else {
             if (rc_type == RC_TYPE_MMU_FAULT) {

@@ -1228,6 +1228,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
                 runlist_lock);
         }
     }

+    nvgpu_log_info(g, "release engines_reset_mutex");
+    nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }

 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)