gpu: nvgpu: gv11b: channel/tsg recovery reorged

Context TSG teardown procedure (a simplified sketch in C follows this list):
1. Disable scheduling for the engine's runlist via PFIFO_SCHED_DISABLE.
   This lets SW determine, later in the procedure, whether a context has
   hung: otherwise, ongoing work on the runlist may keep ENG_STATUS from
   reaching a steady state.

2. Disable all channels in the TSG being torn down or submit a new runlist
   that does not contain the TSG.  This is to prevent the TSG from being
   rescheduled once scheduling is reenabled in step 6.

3.
 a) If the TSG ID is known, initiate a preempt of the TSG by writing
    NV_PFIFO_PREEMPT with the TSG's ID and the TYPE set to TSG;
    otherwise do 3b.

 b) Initiate a preempt of the engine by writing the bit associated with its
    runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt
    process prior to doing the slow register reads needed to determine
    whether the context has hit any interrupts or is hung.  Do not poll
    NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete.

4. Check whether the preempt has completed.
5. If a reset is needed as determined by step 4:
  a. Halt the memory interface for the engine (as per the relevant engine
     procedure).
  b. Reset the engine via PMC_ENABLE.
  c. Take the engine out of reset and reinit the engine (as per the relevant
     engine procedure).
6. Re-enable scheduling for the engine's runlist via PFIFO_SCHED_ENABLE.
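
For illustration, a minimal C sketch of the six steps above. This is not the
nvgpu implementation: every type and helper below (eng_ctx, sched_disable(),
preempt_tsg(), preempt_done(), and so on) is a hypothetical stand-in for the
PFIFO register writes and status checks named in steps 1-6.

  #include <stdbool.h>
  #include <stdint.h>

  struct eng_ctx {
          uint32_t runlist_id;
          uint32_t tsg_id;
          bool tsg_id_known;
  };

  /* Hypothetical stubs standing in for the PFIFO register accesses. */
  static void sched_disable(uint32_t runlist_id) { (void)runlist_id; }        /* step 1 */
  static void disable_tsg_channels(uint32_t tsg_id) { (void)tsg_id; }         /* step 2 */
  static void preempt_tsg(uint32_t tsg_id) { (void)tsg_id; }                  /* step 3a */
  static void preempt_runlist(uint32_t runlist_id) { (void)runlist_id; }      /* step 3b */
  static bool preempt_done(const struct eng_ctx *c) { (void)c; return true; } /* step 4 */
  static void halt_mem_interface(void) { }                                    /* step 5a */
  static void reset_and_reinit_engine(void) { }                               /* steps 5b, 5c */
  static void sched_enable(uint32_t runlist_id) { (void)runlist_id; }         /* step 6 */

  static void teardown_tsg(const struct eng_ctx *c)
  {
          sched_disable(c->runlist_id);           /* 1. stop runlist scheduling */
          disable_tsg_channels(c->tsg_id);        /* 2. keep the TSG off the runlist */

          if (c->tsg_id_known)
                  preempt_tsg(c->tsg_id);         /* 3a. preempt by TSG id */
          else
                  preempt_runlist(c->runlist_id); /* 3b. preempt the whole runlist */

          if (!preempt_done(c)) {                 /* 4. reset needed if preempt did not complete */
                  halt_mem_interface();           /* 5a. halt the engine's memory interface */
                  reset_and_reinit_engine();      /* 5b/5c. reset via PMC_ENABLE, then re-init */
          }

          sched_enable(c->runlist_id);            /* 6. resume scheduling */
  }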

Bug 200277163

Change-Id: I1e945a2c6b9845f365d6952109f6803309aa2270
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1599841
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

@@ -660,7 +660,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
             }
         }
     }
-    gk20a_dbg_info("runlists_mask = %08x", runlists_mask);
+    nvgpu_log(g, gpu_dbg_info, "runlists_mask = 0x%08x", runlists_mask);
     return runlists_mask;
 }
@@ -873,9 +873,14 @@ static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask)
     nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
     for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
-        if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id))
+        if (runlists_mask &
+                fifo_runlist_preempt_runlist_m(runlist_id)) {
+            /* during recovery reset engs served by this runlist */
+            g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
+                g->fifo.runlist_info[runlist_id].eng_bitmask;
             nvgpu_mutex_release(&g->fifo.
                 runlist_info[runlist_id].mutex);
+        }
     }
     return ret;
@@ -955,11 +960,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
     struct fifo_runlist_info_gk20a *runlist = NULL;
     u32 engine_id, client_type = ~0;
-    gk20a_dbg_info("active engine ids bitmask =0x%x", act_eng_bitmask);
-    gk20a_dbg_info("hw id =%d", id);
-    gk20a_dbg_info("id_type =%d", id_type);
-    gk20a_dbg_info("rc_type =%d", rc_type);
-    gk20a_dbg_info("mmu_fault =0x%p", mmfault);
+    nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
+            "act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
+            id, id_type, rc_type, act_eng_bitmask, mmfault);
     runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
             id_type, rc_type, mmfault);
@@ -986,25 +989,29 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
         gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
-    if (rc_type == RC_TYPE_MMU_FAULT)
+    /* Get tsg/ch */
+    if (rc_type == RC_TYPE_MMU_FAULT) {
         gk20a_debug_dump(g);
-    /* get the channel/TSG */
-    if (rc_type == RC_TYPE_MMU_FAULT && mmfault && mmfault->refch) {
         refch = mmfault->refch;
         client_type = mmfault->client_type;
         if (gk20a_is_channel_marked_as_tsg(refch))
             tsg = &g->fifo.tsg[refch->tsgid];
         gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
                 mmfault->faulted_pbdma,
                 mmfault->faulted_engine);
-    } else {
-        if (id_type == ID_TYPE_TSG)
-            tsg = &g->fifo.tsg[id];
-        else if (id_type == ID_TYPE_CHANNEL)
-            refch = gk20a_channel_get(&g->fifo.channel[id]);
     }
+    if (id_type == ID_TYPE_TSG) {
+        tsg = &g->fifo.tsg[id];
+    } else if (id_type == ID_TYPE_CHANNEL) {
+        if (refch == NULL)
+            refch = gk20a_channel_get(&g->fifo.channel[id]);
+    }
     /* Disable tsg/ch */
     if (tsg)
         gk20a_disable_tsg(tsg);
     else if (refch)
         g->ops.fifo.disable_channel(refch);
     /* Preempt tsg/ch */
     if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
         g->ops.fifo.preempt_ch_tsg(g, id, id_type,
                 PREEMPT_TIMEOUT_NORC);
@@ -1012,6 +1019,57 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
         gv11b_fifo_preempt_runlists(g, runlists_mask);
     }
+    /* check if engine reset should be deferred */
+    for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+        runlist = &g->fifo.runlist_info[runlist_id];
+        if ((runlists_mask & BIT(runlist_id)) &&
+                runlist->reset_eng_bitmask) {
+            unsigned long __reset_eng_bitmask =
+                runlist->reset_eng_bitmask;
+            for_each_set_bit(engine_id, &__reset_eng_bitmask,
+                    g->fifo.max_engines) {
+                if ((refch || tsg) &&
+                        gk20a_fifo_should_defer_engine_reset(g,
+                        engine_id, client_type, false)) {
+                    g->fifo.deferred_fault_engines |=
+                        BIT(engine_id);
+                    /* handled during channel free */
+                    g->fifo.deferred_reset_pending = true;
+                    nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+                        "sm debugger attached,"
+                        " deferring channel recovery to channel free");
+                } else {
+                    /*
+                     * if lock is already taken, a reset is
+                     * taking place so no need to repeat
+                     */
+                    if (nvgpu_mutex_tryacquire(
+                            &g->fifo.gr_reset_mutex)) {
+                        gk20a_fifo_reset_engine(g,
+                            engine_id);
+                        nvgpu_mutex_release(
+                            &g->fifo.gr_reset_mutex);
+                    }
+                }
+            }
+        }
+    }
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+    /* tsg and refch both could be valid for mmu fault. Check tsg first */
+    if (tsg)
+        gk20a_ctxsw_trace_tsg_reset(g, tsg);
+    else if (refch)
+        gk20a_ctxsw_trace_channel_reset(g, refch);
+#endif
     if (tsg) {
         if (!g->fifo.deferred_reset_pending) {
             if (rc_type == RC_TYPE_MMU_FAULT) {
@@ -1041,55 +1099,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
         }
     }
-    /* check if engine reset should be deferred */
-    for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
-        runlist = &g->fifo.runlist_info[runlist_id];
-        if ((runlists_mask & BIT(runlist_id)) &&
-                runlist->reset_eng_bitmask) {
-            unsigned long __reset_eng_bitmask =
-                runlist->reset_eng_bitmask;
-            for_each_set_bit(engine_id, &__reset_eng_bitmask, 32) {
-                if ((refch || tsg) &&
-                        gk20a_fifo_should_defer_engine_reset(g,
-                        engine_id, client_type, false)) {
-                    g->fifo.deferred_fault_engines |=
-                        BIT(engine_id);
-                    /* handled during channel free */
-                    g->fifo.deferred_reset_pending = true;
-                    gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
-                        "sm debugger attached,"
-                        " deferring channel recovery to channel free");
-                } else {
-                    /*
-                     * if lock is already taken, a reset is
-                     * taking place so no need to repeat
-                     */
-                    if (nvgpu_mutex_tryacquire(
-                            &g->fifo.gr_reset_mutex)) {
-                        gk20a_fifo_reset_engine(g,
-                            engine_id);
-                        nvgpu_mutex_release(
-                            &g->fifo.gr_reset_mutex);
-                    }
-                }
-            }
-        }
-    }
-#ifdef CONFIG_GK20A_CTXSW_TRACE
-    if (refch)
-        gk20a_ctxsw_trace_channel_reset(g, refch);
-    else if (tsg)
-        gk20a_ctxsw_trace_tsg_reset(g, tsg);
-#endif
     gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED,
             !RUNLIST_INFO_MUTEX_LOCKED);