Revert: GV11B runlist preemption patches

This reverts commit 2d397e34a5.
This reverts commit cd6e821cf6.
This reverts commit 5cf1eb145f.
This reverts commit a8d6f31bde.
This reverts commit 067ddbc4e4.
This reverts commit 3eede64de0.
This reverts commit 1407133b7e.
This reverts commit 797dde3e32.

These patches appear to badly regress the ap_compute test on the
embedded-qnx-hv e3550-t194 platform. They may also affect ap_resmgr.

Signed-off-by: Alex Waterman <alexw@nvidia.com>
Change-Id: Ib9f06514d554d1a67993f0f2bd3d180147385e0a
Reviewed-on: https://git-master.nvidia.com/r/1761864
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Author:  Alex Waterman
Date:    2018-06-26 08:53:15 -07:00
Commit:  0b02c8589d (parent 8586414cc1)

10 changed files with 233 additions and 400 deletions

View File

@@ -55,7 +55,9 @@
 #define FECS_METHOD_WFI_RESTORE	0x80000
 #define FECS_MAILBOX_0_ACK_RESTORE	0x4
+static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+					    u32 chid, bool add,
+					    bool wait_for_finish);
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
 static const char *const pbdma_intr_fault_type_desc[] = {
@@ -2702,7 +2704,7 @@ void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg)
 }
 int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, unsigned int timeout_rc_type)
 {
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
@@ -2775,8 +2777,8 @@ int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 	id_type = is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL;
 	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
+					PREEMPT_TIMEOUT_RC);
 	return ret;
 }
@@ -3273,7 +3275,7 @@ void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
 		fifo_eng_runlist_length_f(count));
 }
-int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    u32 chid, bool add,
 					    bool wait_for_finish)
 {
@@ -3446,7 +3448,8 @@ static int __locked_fifo_reschedule_preempt_next(struct channel_gk20a *ch,
 			gk20a_readl(g, fifo_preempt_r()));
 #endif
 	if (wait_preempt) {
-		g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type);
+		g->ops.fifo.is_preempt_pending(
+			g, preempt_id, preempt_type, PREEMPT_TIMEOUT_RC);
 	}
 #ifdef TRACEPOINTS_ENABLED
 	trace_gk20a_reschedule_preempted_next(ch->chid);

View File

@@ -50,6 +50,9 @@ enum {
 #define ID_TYPE_TSG 1
 #define ID_TYPE_UNKNOWN ((u32)~0)
+#define PREEMPT_TIMEOUT_RC 1
+#define PREEMPT_TIMEOUT_NORC 0
 #define RC_YES 1
 #define RC_NO 0
@@ -254,9 +257,6 @@ int nvgpu_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next,
 int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 chid,
 			      bool add, bool wait_for_finish);
-int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
-				     u32 chid, bool add,
-				     bool wait_for_finish);
 int gk20a_fifo_suspend(struct gk20a *g);
 bool gk20a_fifo_mmu_fault_pending(struct gk20a *g);
@@ -390,8 +390,8 @@ void gk20a_fifo_channel_unbind(struct channel_gk20a *ch_gk20a);
 u32 gk20a_fifo_intr_0_error_mask(struct gk20a *g);
-int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type);
+int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id, unsigned int id_type,
+		unsigned int timeout_rc_type);
 int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg);
 void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
 		unsigned int id_type);

View File

@@ -662,9 +662,9 @@ struct gpu_ops {
 				struct ch_state *ch_state);
 		u32 (*intr_0_error_mask)(struct gk20a *g);
 		int (*is_preempt_pending)(struct gk20a *g, u32 id,
-			unsigned int id_type);
+			unsigned int id_type, unsigned int timeout_rc_type);
 		int (*preempt_ch_tsg)(struct gk20a *g, u32 id,
-			unsigned int id_type);
+			unsigned int id_type, unsigned int timeout_rc_type);
 		void (*init_pbdma_intr_descs)(struct fifo_gk20a *f);
 		int (*reset_enable_hw)(struct gk20a *g);
 		int (*setup_userd)(struct channel_gk20a *c);
@@ -1109,7 +1109,7 @@ struct gpu_ops {
 		bool (*is_intr_hub_pending)(struct gk20a *g, u32 mc_intr);
 		bool (*is_intr_nvlink_pending)(struct gk20a *g, u32 mc_intr);
 		bool (*is_stall_and_eng_intr_pending)(struct gk20a *g,
-					u32 act_eng_id, u32 *eng_intr_pending);
+					u32 act_eng_id);
 		u32 (*intr_stall)(struct gk20a *g);
 		void (*intr_stall_pause)(struct gk20a *g);
 		void (*intr_stall_resume)(struct gk20a *g);
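For readers less familiar with the nvgpu HAL layout, here is a small standalone sketch (made-up names, not the real gpu_ops definition) of why a signature change to one hook, such as restoring timeout_rc_type here, has to be mirrored in the ops-table declaration, in every chip-specific implementation, and at every g->ops call site touched elsewhere in this revert.

/*
 * Standalone illustration only: fake_* names are hypothetical and do not
 * exist in nvgpu; they merely mimic the function-pointer-table shape above.
 */
#include <stdio.h>
#include <stdint.h>

struct fake_gpu;

struct fake_fifo_ops {
	/* Mirrors the shape of the restored hook: id, id_type, timeout_rc_type. */
	int (*is_preempt_pending)(struct fake_gpu *g, uint32_t id,
				  unsigned int id_type,
				  unsigned int timeout_rc_type);
};

struct fake_gpu {
	struct fake_fifo_ops fifo;
};

static int fake_is_preempt_pending(struct fake_gpu *g, uint32_t id,
				   unsigned int id_type,
				   unsigned int timeout_rc_type)
{
	(void)g;
	printf("poll preempt: id=%u id_type=%u rc=%u\n", id, id_type,
	       timeout_rc_type);
	return 0;	/* pretend the preempt completed */
}

int main(void)
{
	struct fake_gpu g = {
		.fifo = { .is_preempt_pending = fake_is_preempt_pending },
	};

	/* Callers dispatch through the table, like g->ops.fifo.* in the diff. */
	return g.fifo.is_preempt_pending(&g, 3, 1 /* TSG-like id_type */, 1);
}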

View File

@@ -72,14 +72,15 @@ bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0)
 	return (((mc_intr_0 & mc_intr_nvlink_pending_f()) != 0U) ? true : false);
 }
-bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
-					u32 *eng_intr_pending)
+bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
 {
 	u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
 	u32 stall_intr, eng_intr_mask;
 	eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
-	*eng_intr_pending = mc_intr_0 & eng_intr_mask;
+	if ((mc_intr_0 & eng_intr_mask) != 0U) {
+		return true;
+	}
 	stall_intr = mc_intr_pfifo_pending_f() |
 		mc_intr_hub_pending_f() |
@@ -87,10 +88,9 @@ bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
 		mc_intr_pbus_pending_f() |
 		mc_intr_ltc_pending_f() |
 		mc_intr_nvlink_pending_f();
-	nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
-		"mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
-		mc_intr_0 & stall_intr, *eng_intr_pending);
-	return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
+	if ((mc_intr_0 & stall_intr) != 0U) {
+		return true;
+	}
+	return false;
 }

View File

@@ -26,6 +26,5 @@ struct gk20a;
 void mc_gv100_intr_enable(struct gk20a *g);
 bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0);
-bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
-					u32 *eng_intr_pending);
+bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id);
 #endif

View File

@@ -870,11 +870,10 @@ static void gv11b_fb_copy_from_hw_fault_buf(struct gk20a *g,
 static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		 struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
 {
-	unsigned int id_type = ID_TYPE_UNKNOWN;
+	unsigned int id_type;
 	u32 num_lce, act_eng_bitmask = 0;
 	int err = 0;
-	u32 id = FIFO_INVAL_TSG_ID;
-	unsigned int rc_type = RC_TYPE_NO_RC;
+	u32 id = ((u32)~0);
 	if (!mmfault->valid)
 		return;
@@ -889,23 +888,18 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		/* CE page faults are not reported as replayable */
 		nvgpu_log(g, gpu_dbg_intr, "CE Faulted");
 		err = gv11b_fb_fix_page_fault(g, mmfault);
-		if (mmfault->refch &&
-			(u32)mmfault->refch->tsgid != FIFO_INVAL_TSG_ID) {
-			gv11b_fifo_reset_pbdma_and_eng_faulted(g,
-					&g->fifo.tsg[mmfault->refch->tsgid],
-					mmfault->faulted_pbdma,
-					mmfault->faulted_engine);
-		}
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, mmfault->refch,
+				mmfault->faulted_pbdma, mmfault->faulted_engine);
 		if (!err) {
 			nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed");
 			*invalidate_replay_val = 0;
-			if (mmfault->refch) {
-				gk20a_channel_put(mmfault->refch);
-				mmfault->refch = NULL;
-			}
+			/* refch in mmfault is assigned at the time of copying
+			 * fault info from snap reg or bar2 fault buf
+			 */
+			gk20a_channel_put(mmfault->refch);
 			return;
 		}
-		/* Do recovery */
+		/* Do recovery. Channel recovery needs refch */
 		nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed");
 	}
@@ -917,9 +911,16 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		 * instance block, the fault cannot be isolated to a
 		 * single context so we need to reset the entire runlist
 		 */
-		rc_type = RC_TYPE_MMU_FAULT;
+		id_type = ID_TYPE_UNKNOWN;
 	} else if (mmfault->refch) {
+		if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
+			id = mmfault->refch->tsgid;
+			id_type = ID_TYPE_TSG;
+		} else {
+			id = mmfault->chid;
+			id_type = ID_TYPE_CHANNEL;
+		}
 		if (mmfault->refch->mmu_nack_handled) {
 			/* We have already recovered for the same
 			 * context, skip doing another recovery.
@@ -940,40 +941,19 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 			 */
 			gk20a_channel_put(mmfault->refch);
 			return;
-		} else {
-			/* Indicate recovery is handled if mmu fault is
-			 * a result of mmu nack.
-			 */
-			mmfault->refch->mmu_nack_handled = true;
-		}
-		rc_type = RC_TYPE_MMU_FAULT;
-		if (gk20a_is_channel_marked_as_tsg(mmfault->refch)) {
-			id = mmfault->refch->tsgid;
-			if (id != FIFO_INVAL_TSG_ID)
-				id_type = ID_TYPE_TSG;
-		} else {
-			nvgpu_err(g, "bare channels not supported");
 		}
+	} else {
+		id_type = ID_TYPE_UNKNOWN;
 	}
-	/* engine is faulted */
-	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) {
+	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
 		act_eng_bitmask = BIT(mmfault->faulted_engine);
-		rc_type = RC_TYPE_MMU_FAULT;
-	}
-	/* refch in mmfault is assigned at the time of copying
-	 * fault info from snap reg or bar2 fault buf
-	 */
-	if (mmfault->refch) {
-		gk20a_channel_put(mmfault->refch);
-		mmfault->refch = NULL;
-	}
-	if (rc_type != RC_TYPE_NO_RC)
-		g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
-			id, id_type, rc_type, mmfault);
+	/* Indicate recovery is handled if mmu fault is a result of
+	 * mmu nack.
+	 */
+	mmfault->refch->mmu_nack_handled = true;
+	g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
+		id, id_type, RC_TYPE_MMU_FAULT, mmfault);
 	} else {
 		if (mmfault->fault_type == gmmu_fault_type_pte_v()) {
 			nvgpu_log(g, gpu_dbg_intr, "invalid pte! try to fix");
@@ -992,10 +972,7 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		/* refch in mmfault is assigned at the time of copying
 		 * fault info from snap reg or bar2 fault buf
 		 */
-		if (mmfault->refch) {
-			gk20a_channel_put(mmfault->refch);
-			mmfault->refch = NULL;
-		}
+		gk20a_channel_put(mmfault->refch);
 	}
 }
@@ -1084,10 +1061,8 @@ void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 			next_fault_addr = mmfault->fault_addr;
 			if (prev_fault_addr == next_fault_addr) {
 				nvgpu_log(g, gpu_dbg_intr, "pte already scanned");
-				if (mmfault->refch) {
+				if (mmfault->refch)
 					gk20a_channel_put(mmfault->refch);
-					mmfault->refch = NULL;
-				}
 				continue;
 			}
 		}

View File

@@ -381,24 +381,17 @@ u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g)
 u32 gv11b_fifo_get_preempt_timeout(struct gk20a *g)
 {
-	/* if timeouts are enabled, using 3000ms timeout
-	 * for polling pdma/eng/runlist might kick in
-	 * timeout handler in the cases where preempt
-	 * is stuck. Use 1000ms timeout for polling when
-	 * timeouts are enabled */
-	return nvgpu_is_timeouts_enabled(g) ? PREEMPT_TIMEOUT_1000_MS :
-			g->gr_idle_timeout_default;
+	return gk20a_get_gr_idle_timeout(g);
 }
 static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
-				 u32 pbdma_id)
+				 u32 pbdma_id, unsigned int timeout_rc_type)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
 	u32 pbdma_stat;
 	u32 chan_stat;
 	int ret = -EBUSY;
-	unsigned int loop_count = 0;
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -407,14 +400,6 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 	nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id);
 	/* Verify that ch/tsg is no longer on the pbdma */
 	do {
-		if (!nvgpu_platform_is_silicon(g)) {
-			if (loop_count >= MAX_PRE_SI_RETRIES) {
-				nvgpu_err(g, "preempt pbdma retries: %u",
-					loop_count);
-				break;
-			}
-			loop_count++;
-		}
 		/*
 		 * If the PBDMA has a stalling interrupt and receives a NACK,
 		 * the PBDMA won't save out until the STALLING interrupt is
@@ -467,24 +452,21 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
-	if (ret)
-		nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u "
-				"tsgid: %u", pbdma_id, pbdma_stat, id);
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+				"preempt timeout pbdma"));
 	return ret;
 }
 static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
-			u32 act_eng_id, u32 *reset_eng_bitmask)
+			u32 act_eng_id, u32 *reset_eng_bitmask,
+			unsigned int timeout_rc_type)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
 	u32 eng_stat;
 	u32 ctx_stat;
 	int ret = -EBUSY;
-	unsigned int loop_count = 0;
-	u32 eng_intr_pending;
+	bool stall_intr = false;
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -494,56 +476,20 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			act_eng_id);
 	/* Check if ch/tsg has saved off the engine or if ctxsw is hung */
 	do {
-		if (!nvgpu_platform_is_silicon(g)) {
-			if (loop_count >= MAX_PRE_SI_RETRIES) {
-				nvgpu_err(g, "preempt eng retries: %u",
-					loop_count);
-				break;
-			}
-			loop_count++;
-		}
 		eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id));
 		ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
-		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
-				&eng_intr_pending)) {
-			/* From h/w team
-			 * Engine save can be blocked by eng stalling interrupts.
-			 * FIFO interrupts shouldnt block an engine save from
-			 * finishing, but could block FIFO from reporting preempt done.
-			 * No immediate reason to reset the engine if FIFO interrupt is
-			 * pending.
-			 * The hub, priv_ring, and ltc interrupts could block context
-			 * switch (or memory), but doesnt necessarily have to.
-			 * For Hub interrupts they just report access counters and page
-			 * faults. Neither of these necessarily block context switch
-			 * or preemption, but they could.
-			 * For example a page fault for graphics would prevent graphics
-			 * from saving out. An access counter interrupt is a
-			 * notification and has no effect.
-			 * SW should handle page faults though for preempt to complete.
-			 * PRI interrupt (due to a failed PRI transaction) will result
-			 * in ctxsw failure reported to HOST.
-			 * LTC interrupts are generally ECC related and if so,
-			 * certainly dont block preemption/ctxsw but they could.
-			 * Bus interrupts shouldnt have anything to do with preemption
-			 * state as they are part of the Host EXT pipe, though they may
-			 * exhibit a symptom that indicates that GPU is in a bad state.
-			 * To be completely fair, when an engine is preempting SW
-			 * really should just handle other interrupts as they come in.
-			 * Its generally bad to just poll and wait on a preempt
-			 * to complete since there are many things in the GPU which may
-			 * cause a system to hang/stop responding.
-			 */
+		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) {
+			stall_intr = true;
 			nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
 				"stall intr set, "
-				"preemption might not finish");
+				"preemption will not finish");
 		}
 		if (ctx_stat ==
			 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			/* Eng save hasn't started yet. Continue polling */
-			if (eng_intr_pending) {
-				/* if eng intr, stop polling */
+			if (stall_intr) {
+				/* if stall intr stop polling */
 				*reset_eng_bitmask |= BIT(act_eng_id);
 				ret = 0;
 				break;
@@ -555,7 +501,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			fifo_engine_status_ctx_status_ctxsw_save_v()) {
 			if (id == fifo_engine_status_id_v(eng_stat)) {
-				if (eng_intr_pending) {
+				if (stall_intr ||
+					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -571,7 +518,9 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			fifo_engine_status_ctx_status_ctxsw_load_v()) {
 			if (id == fifo_engine_status_next_id_v(eng_stat)) {
-				if (eng_intr_pending) {
+				if (stall_intr ||
+					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -591,21 +540,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
-	if (ret) {
-		/*
-		 * The reasons a preempt can fail are:
-		 * 1.Some other stalling interrupt is asserted preventing
-		 *   channel or context save.
-		 * 2.The memory system hangs.
-		 * 3.The engine hangs during CTXSW.
-		 */
-		nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u",
-			act_eng_id, ctx_stat, id);
-		*reset_eng_bitmask |= BIT(act_eng_id);
-	}
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+				"preempt timeout eng"));
 	return ret;
 }
@@ -652,19 +588,29 @@ static void gv11b_reset_pbdma_faulted_tsg(struct tsg_gk20a *tsg)
 }
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-			struct tsg_gk20a *tsg,
+			struct channel_gk20a *refch,
 			u32 faulted_pbdma, u32 faulted_engine)
 {
-	if (!tsg)
-		return;
+	struct tsg_gk20a *tsg;
 	nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
 			faulted_pbdma, faulted_engine);
-	if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
-		gv11b_reset_pbdma_faulted_tsg(tsg);
-	if (faulted_engine != FIFO_INVAL_ENGINE_ID)
-		gv11b_reset_eng_faulted_tsg(tsg);
+	if (!refch)
+		return;
+	if (gk20a_is_channel_marked_as_tsg(refch)) {
+		tsg = &g->fifo.tsg[refch->tsgid];
+		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
+			gv11b_reset_pbdma_faulted_tsg(tsg);
+		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
+			gv11b_reset_eng_faulted_tsg(tsg);
+	} else {
+		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
+			gv11b_reset_pbdma_faulted_ch(g, refch->chid);
+		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
+			gv11b_reset_eng_faulted_ch(g, refch->chid);
+	}
 }
 static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
@@ -674,7 +620,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 	u32 runlists_mask = 0;
 	struct fifo_gk20a *f = &g->fifo;
 	struct fifo_runlist_info_gk20a *runlist;
-	u32 rlid, pbdma_bitmask = 0;
+	u32 pbdma_bitmask = 0;
 	if (id_type != ID_TYPE_UNKNOWN) {
 		if (id_type == ID_TYPE_TSG)
@@ -689,31 +635,31 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 		if (mmfault->faulted_pbdma != FIFO_INVAL_PBDMA_ID)
 			pbdma_bitmask = BIT(mmfault->faulted_pbdma);
-		for (rlid = 0; rlid < f->max_runlists; rlid++) {
-			runlist = &f->runlist_info[rlid];
+		for (id = 0; id < f->max_runlists; id++) {
+			runlist = &f->runlist_info[id];
 			if (runlist->eng_bitmask & act_eng_bitmask)
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(rlid);
+					fifo_sched_disable_runlist_m(id);
 			if (runlist->pbdma_bitmask & pbdma_bitmask)
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(rlid);
+					fifo_sched_disable_runlist_m(id);
 		}
 	}
 	if (id_type == ID_TYPE_UNKNOWN) {
-		for (rlid = 0; rlid < f->max_runlists; rlid++) {
+		for (id = 0; id < f->max_runlists; id++) {
 			if (act_eng_bitmask) {
 				/* eng ids are known */
-				runlist = &f->runlist_info[rlid];
+				runlist = &f->runlist_info[id];
 				if (runlist->eng_bitmask & act_eng_bitmask)
 					runlists_mask |=
-						fifo_sched_disable_runlist_m(rlid);
+						fifo_sched_disable_runlist_m(id);
 			} else {
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(rlid);
+					fifo_sched_disable_runlist_m(id);
 			}
 		}
 	}
@@ -745,20 +691,10 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
 	int ret = -EBUSY;
-	unsigned int loop_count = 0;
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
 			NVGPU_TIMER_CPU_TIMER);
 	do {
-		if (!nvgpu_platform_is_silicon(g)) {
-			if (loop_count >= MAX_PRE_SI_RETRIES) {
-				nvgpu_err(g, "preempt runlist retries: %u",
-					loop_count);
-				break;
-			}
-			loop_count++;
-		}
 		if (!((gk20a_readl(g, fifo_runlist_preempt_r())) &
 				runlists_mask)) {
 			ret = 0;
@@ -768,16 +704,13 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
-	if (ret)
-		nvgpu_err(g, "preempt runlist timeout, runlists_mask:0x%08x",
-			runlists_mask);
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+				"runlist preempt timeout"));
 	return ret;
 }
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, unsigned int timeout_rc_type)
 {
 	struct fifo_gk20a *f = &g->fifo;
 	unsigned long runlist_served_pbdmas;
@@ -785,6 +718,7 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	u32 pbdma_id;
 	u32 act_eng_id;
 	u32 runlist_id;
+	int func_ret;
 	int ret = 0;
 	u32 tsgid;
@@ -801,14 +735,30 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask;
 	runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
-	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma)
-		ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
+	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) {
+		func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
+				timeout_rc_type);
+		if (func_ret != 0) {
+			nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id);
+			ret |= func_ret;
+		}
+	}
 	f->runlist_info[runlist_id].reset_eng_bitmask = 0;
-	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines)
-		ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
-			&f->runlist_info[runlist_id].reset_eng_bitmask);
+	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) {
+		func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
+				&f->runlist_info[runlist_id].reset_eng_bitmask,
+				timeout_rc_type);
+		if (func_ret != 0) {
+			nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id);
+			ret |= func_ret;
+		}
+	}
 	return ret;
 }
@@ -887,9 +837,6 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_disable_tsg_sched(g, &f->tsg[tsgid]);
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 	ret = __locked_fifo_preempt(g, tsgid, true);
@@ -897,9 +844,6 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	if (!mutex_ret)
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_enable_tsg_sched(g, &f->tsg[tsgid]);
 	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
 	if (ret)
@@ -908,36 +852,44 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	return ret;
 }
-static void gv11b_fifo_locked_preempt_runlists(struct gk20a *g, u32 runlists_mask)
+static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask)
 {
 	int ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
-	u32 rlid;
-	/* runlist_lock are locked by teardown and sched are disabled too */
-	nvgpu_log_fn(g, "preempt runlists_mask:0x%08x", runlists_mask);
+	u32 runlist_id;
+	nvgpu_log_fn(g, " ");
+	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+		if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id))
+			nvgpu_mutex_acquire(&g->fifo.
+				runlist_info[runlist_id].runlist_lock);
+	}
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 	ret = __locked_fifo_preempt_runlists(g, runlists_mask);
-	if (ret) {
-		/* if preempt timed out, reset engs served by runlists */
-		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
-			if (runlists_mask &
-				fifo_runlist_preempt_runlist_m(rlid))
-				g->fifo.runlist_info[rlid].reset_eng_bitmask =
-					g->fifo.runlist_info[rlid].eng_bitmask;
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+		if (runlists_mask &
+			fifo_runlist_preempt_runlist_m(runlist_id)) {
+			/* during recovery reset engs served by this runlist */
+			g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
+				g->fifo.runlist_info[runlist_id].eng_bitmask;
+			nvgpu_mutex_release(&g->fifo.
+				runlist_info[runlist_id].runlist_lock);
 		}
 	}
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+	return ret;
 }
 static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-			unsigned int id_type)
+			unsigned int id_type, unsigned int timeout_rc_type)
 {
 	int ret;
 	struct fifo_gk20a *f = &g->fifo;
@@ -951,164 +903,63 @@ static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
 	gk20a_fifo_issue_preempt(g, id, true);
 	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
-	/* No recovery even if preempt timed out since
-	 * this is called from recovery path
-	 */
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
+					timeout_rc_type);
+	if (ret && (timeout_rc_type == PREEMPT_TIMEOUT_RC))
+		gk20a_fifo_preempt_timeout_rc(g, id, id_type);
 	return ret;
 }
 int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-			unsigned int id_type)
+			unsigned int id_type, unsigned int timeout_rc_type)
 {
-	struct fifo_gk20a *f = &g->fifo;
 	u32 ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
-	u32 runlist_id;
-	if (id_type == ID_TYPE_TSG)
-		runlist_id = f->tsg[id].runlist_id;
-	else if (id_type == ID_TYPE_CHANNEL)
-		runlist_id = f->channel[id].runlist_id;
-	else
-		return -EINVAL;
-	if (runlist_id >= g->fifo.max_runlists) {
-		nvgpu_log_info(g, "runlist_id = %d", runlist_id);
-		return -EINVAL;
-	}
-	nvgpu_log_fn(g, "preempt id = %d, runlist_id = %d", id, runlist_id);
-	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	/*
-	 * This is called from teardown path only. runlist_lock
-	 * is already acquired before calling this function.
-	 */
-	ret = __locked_fifo_preempt_ch_tsg(g, id, id_type);
+	ret = __locked_fifo_preempt_ch_tsg(g, id, id_type, timeout_rc_type);
 	if (!mutex_ret)
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
 	return ret;
 }
-static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
-			unsigned int rc_type,
-			u32 runlists_mask)
-{
-	struct tsg_gk20a *tsg = NULL;
-	u32 rlid, tsgid;
-	struct fifo_runlist_info_gk20a *runlist = NULL;
-	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
-	u32 mutex_ret = 0;
-	bool add = false, wait_for_finish = false;
-	int err;
-	nvgpu_err(g, "runlist id unknown, abort active tsgs in runlists");
-	/* runlist_lock are locked by teardown */
-	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	for (rlid = 0; rlid < g->fifo.max_runlists;
-			rlid++) {
-		if (!(runlists_mask & BIT(rlid)))
-			continue;
-		nvgpu_log(g, gpu_dbg_info, "abort runlist id %d",
-				rlid);
-		runlist = &g->fifo.runlist_info[rlid];
-		for_each_set_bit(tsgid, runlist->active_tsgs,
-			g->fifo.num_channels) {
-			nvgpu_log(g, gpu_dbg_info, "abort tsg id %d", tsgid);
-			tsg = &g->fifo.tsg[tsgid];
-			gk20a_disable_tsg(tsg);
-			/* assume all pbdma and eng faulted are set */
-			nvgpu_log(g, gpu_dbg_info, "reset pbdma and eng faulted");
-			gv11b_reset_pbdma_faulted_tsg(tsg);
-			gv11b_reset_eng_faulted_tsg(tsg);
-#ifdef CONFIG_GK20A_CTXSW_TRACE
-			gk20a_ctxsw_trace_tsg_reset(g, tsg);
-#endif
-			if (!g->fifo.deferred_reset_pending) {
-				if (rc_type == RC_TYPE_MMU_FAULT) {
-					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
-					gk20a_fifo_error_tsg(g, tsg);
-				}
-			}
-			/* (chid == ~0 && !add) remove all act ch from runlist*/
-			err = gk20a_fifo_update_runlist_locked(g, rlid,
-					FIFO_INVAL_CHANNEL_ID, add, wait_for_finish);
-			if (err)
-				nvgpu_err(g, "runlist id %d is not cleaned up",
-					rlid);
-			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
-			nvgpu_log(g, gpu_dbg_info, "aborted tsg id %d", tsgid);
-		}
-	}
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-}
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			u32 id, unsigned int id_type, unsigned int rc_type,
 			struct mmu_fault_info *mmfault)
 {
 	struct tsg_gk20a *tsg = NULL;
-	u32 runlists_mask, rlid;
+	struct channel_gk20a *refch = NULL;
+	u32 runlists_mask, runlist_id;
 	struct fifo_runlist_info_gk20a *runlist = NULL;
 	u32 engine_id, client_type = ~0;
-	struct fifo_gk20a *f = &g->fifo;
-	u32 runlist_id = FIFO_INVAL_RUNLIST_ID;
-	u32 num_runlists = 0;
-	nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
-	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
-		nvgpu_mutex_acquire(&f->runlist_info[rlid].
-			runlist_lock);
-	/* get runlist id and tsg */
-	if (id_type == ID_TYPE_TSG) {
-		if (id != FIFO_INVAL_TSG_ID) {
-			tsg = &g->fifo.tsg[id];
-			runlist_id = tsg->runlist_id;
-			if (runlist_id != FIFO_INVAL_RUNLIST_ID)
-				num_runlists++;
-			else
-				nvgpu_log_fn(g, "tsg runlist id is invalid");
-		} else {
-			nvgpu_log_fn(g, "id type is tsg but tsg id is inval");
-		}
-	} else {
-		/*
-		 * id type is unknown, get runlist_id if eng mask is such that
-		 * it corresponds to single runlist id. If eng mask corresponds
-		 * to multiple runlists, then abort all runlists
-		 */
-		for (rlid = 0; rlid < f->max_runlists; rlid++) {
-			if (act_eng_bitmask) {
-				/* eng ids are known */
-				runlist = &f->runlist_info[rlid];
-				if (runlist->eng_bitmask & act_eng_bitmask) {
-					runlist_id = rlid;
-					num_runlists++;
-				}
-			} else {
-				break;
-			}
-		}
-		if (num_runlists > 1 ) /* abort all runlists */
-			runlist_id = FIFO_INVAL_RUNLIST_ID;
-	}
-	/* if runlist_id is valid and there is only single runlist to be
-	 * aborted, release runlist lock that are not
-	 * needed for this recovery
-	 */
-	if (runlist_id != FIFO_INVAL_RUNLIST_ID && num_runlists == 1) {
-		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
-			if (rlid != runlist_id) {
-				nvgpu_log_fn(g, "release runlist_lock for "
-					"unused runlist id: %d", rlid);
-				nvgpu_mutex_release(&f->runlist_info[rlid].
-					runlist_lock);
-			}
-		}
-	}
 	nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
 			"act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
@@ -1117,7 +968,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
 			id_type, rc_type, mmfault);
-	/* Disable runlist scheduler */
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
 	g->fifo.deferred_reset_pending = false;
@@ -1139,41 +989,41 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
-	/* Get tsg/ch */
 	if (rc_type == RC_TYPE_MMU_FAULT) {
 		gk20a_debug_dump(g);
+		refch = mmfault->refch;
 		client_type = mmfault->client_type;
-		gv11b_fifo_reset_pbdma_and_eng_faulted(g, tsg,
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
 				mmfault->faulted_pbdma,
 				mmfault->faulted_engine);
 	}
-	/* Disable tsg/ch */
+	if (id_type == ID_TYPE_TSG) {
+		tsg = &g->fifo.tsg[id];
+	} else if (id_type == ID_TYPE_CHANNEL) {
+		if (refch == NULL)
+			refch = gk20a_channel_get(&g->fifo.channel[id]);
+	}
 	if (tsg)
 		gk20a_disable_tsg(tsg);
+	else if (refch)
+		g->ops.fifo.disable_channel(refch);
-	/*
-	 * Even though TSG preempt timed out, the RC sequence would by design
-	 * require s/w to issue another preempt.
-	 * If recovery includes an ENGINE_RESET, to not have race conditions,
-	 * use RUNLIST_PREEMPT to kick all work off, and cancel any context
-	 * load which may be pending. This is also needed to make sure
-	 * that all PBDMAs serving the engine are not loaded when engine is
-	 * reset.
-	 */
-	if (tsg) {
-		int preempt_failed;
-		preempt_failed = g->ops.fifo.preempt_ch_tsg(g, id, id_type);
-		if (preempt_failed)
-			gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
+	/* Preempt tsg/ch */
+	if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
+		g->ops.fifo.preempt_ch_tsg(g, id, id_type,
+				PREEMPT_TIMEOUT_NORC);
 	} else {
-		gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
+		gv11b_fifo_preempt_runlists(g, runlists_mask);
 	}
 	/* check if engine reset should be deferred */
-	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
-		runlist = &g->fifo.runlist_info[rlid];
-		if ((runlists_mask & BIT(rlid)) &&
+	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+		runlist = &g->fifo.runlist_info[runlist_id];
+		if ((runlists_mask & BIT(runlist_id)) &&
 				runlist->reset_eng_bitmask) {
 			unsigned long __reset_eng_bitmask =
@@ -1181,7 +1031,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			for_each_set_bit(engine_id, &__reset_eng_bitmask,
 							g->fifo.max_engines) {
-				if (tsg &&
+				if ((refch || tsg) &&
 					gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
@@ -1213,9 +1063,13 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	}
 #ifdef CONFIG_GK20A_CTXSW_TRACE
+	/* tsg and refch both could be valid for mmu fault. Check tsg first */
 	if (tsg)
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
+	else if (refch)
+		gk20a_ctxsw_trace_channel_reset(g, refch);
 #endif
 	if (tsg) {
 		if (g->fifo.deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
@@ -1225,9 +1079,26 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
 		}
+		if (refch)
+			gk20a_channel_put(refch);
+	} else if (refch) {
+		if (g->fifo.deferred_reset_pending) {
+			g->ops.fifo.disable_channel(refch);
+		} else {
+			if (rc_type == RC_TYPE_MMU_FAULT)
+				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
+			gk20a_channel_abort(refch, false);
+		}
+		gk20a_channel_put(refch);
 	} else {
-		gv11b_fifo_locked_abort_runlist_active_tsgs(g, rc_type,
-				runlists_mask);
+		nvgpu_err(g, "id unknown, abort runlist");
+		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
+				runlist_id++) {
+			if (runlists_mask & BIT(runlist_id))
+				g->ops.fifo.update_runlist(g, runlist_id,
+					FIFO_INVAL_CHANNEL_ID, false, true);
+		}
 	}
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED);
@@ -1235,18 +1106,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	/* It is safe to enable ELPG again. */
 	if (g->support_pmu && g->elpg_enabled)
 		nvgpu_pmu_enable_elpg(g);
-	/* release runlist_lock */
-	if (runlist_id != FIFO_INVAL_RUNLIST_ID) {
-		nvgpu_log_fn(g, "release runlist_lock runlist_id = %d",
-				runlist_id);
-		nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
-	} else {
-		nvgpu_log_fn(g, "release runlist_lock for all runlists");
-		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
-			nvgpu_mutex_release(&f->runlist_info[rlid].
-				runlist_lock);
-	}
 }
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)
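Both the removed and the restored versions of the polling helpers above share one loop shape: check a condition, sleep with exponential backoff capped at a maximum, and stop when a millisecond deadline expires. The standalone sketch below shows that pattern in isolation; the names and values are illustrative only and are not nvgpu APIs.

/*
 * Standalone illustration of the poll-with-backoff-and-deadline pattern.
 * poll_until(), always_busy() and the *_US constants are hypothetical.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define IDLE_CHECK_DEFAULT_US 100
#define IDLE_CHECK_MAX_US     5000

static long elapsed_ms(const struct timespec *start)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - start->tv_sec) * 1000L +
	       (now.tv_nsec - start->tv_nsec) / 1000000L;
}

/* Returns 0 once condition() reports done, -EBUSY if the deadline expires. */
static int poll_until(bool (*condition)(void), long timeout_ms)
{
	struct timespec start;
	useconds_t delay = IDLE_CHECK_DEFAULT_US;

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		if (condition())
			return 0;
		usleep(delay);
		/* exponential backoff, capped like GR_IDLE_CHECK_MAX above */
		delay = delay << 1;
		if (delay > IDLE_CHECK_MAX_US)
			delay = IDLE_CHECK_MAX_US;
	} while (elapsed_ms(&start) < timeout_ms);

	return -EBUSY;
}

static bool always_busy(void) { return false; }

int main(void)
{
	/* Expect -EBUSY after roughly 50 ms, mirroring a preempt timeout. */
	printf("poll_until returned %d\n", poll_until(always_busy, 50));
	return 0;
}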

View File

@@ -50,13 +50,10 @@
 #define CHANNEL_INFO_VEID0	0
-#define MAX_PRE_SI_RETRIES	200000	/* 1G/500KHz * 100 */
-#define PREEMPT_TIMEOUT_1000_MS	1000
 struct gpu_ops;
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-		struct tsg_gk20a *tsg,
+		struct channel_gk20a *refch,
 		u32 faulted_pbdma, u32 faulted_engine);
 void gv11b_mmu_fault_id_to_eng_pbdma_id_and_veid(struct gk20a *g,
 		u32 mmu_fault_id, u32 *active_engine_id, u32 *veid, u32 *pbdma_id);
@@ -81,11 +78,12 @@ void gv11b_dump_eng_status(struct gk20a *g,
 u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g);
 int gv11b_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next);
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type);
+		unsigned int id_type, unsigned int timeout_rc_type);
 int gv11b_fifo_preempt_channel(struct gk20a *g, u32 chid);
 int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid);
 int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg);
-int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id, unsigned int id_type);
+int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
+		unsigned int id_type, unsigned int timeout_rc_type);
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		u32 id, unsigned int id_type, unsigned int rc_type,
 		struct mmu_fault_info *mmfault);

View File

@@ -71,24 +71,24 @@ bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0)
 	return (((mc_intr_0 & mc_intr_hub_pending_f()) != 0U) ? true : false);
 }
-bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
-					u32 *eng_intr_pending)
+bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
 {
 	u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
 	u32 stall_intr, eng_intr_mask;
 	eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
-	*eng_intr_pending = mc_intr_0 & eng_intr_mask;
+	if ((mc_intr_0 & eng_intr_mask) != 0U) {
+		return true;
+	}
 	stall_intr = mc_intr_pfifo_pending_f() |
 		mc_intr_hub_pending_f() |
 		mc_intr_priv_ring_pending_f() |
 		mc_intr_pbus_pending_f() |
 		mc_intr_ltc_pending_f();
-	nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
-		"mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
-		mc_intr_0 & stall_intr, *eng_intr_pending);
-	return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
+	if ((mc_intr_0 & stall_intr) != 0U) {
+		return true;
+	}
+	return false;
 }

View File

@@ -26,6 +26,5 @@ struct gk20a;
 void mc_gv11b_intr_enable(struct gk20a *g);
 bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0);
-bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
-					u32 *eng_intr_pending);
+bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id);
 #endif