gpu: nvgpu: fix ctxsw timeout handling for TSGs

While collecting failing engine data, the id type (is_tsg) was not
set for the ctxsw and save engine states. This could cause some
ctxsw timeout interrupts to be ignored (the id was reported with the
wrong is_tsg).
For TSGs, check whether any of the channels made progress before
kicking off fifo recovery.

Bug 200228310
Jira EVLR-597

Change-Id: I231549ae68317919532de0f87effb78ee9c119c6
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1204035
(cherry picked from commit 7221d256fd7e9b418f7789b3d81eede8faa16f0b)
Reviewed-on: http://git-master/r/1204037
Reviewed-by: Richard Zhao <rizhao@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
3 changed files with 127 additions and 36 deletions
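For reference, a minimal user-space sketch of the progress-tracking contract this change adds to gk20a_channel_update_and_check_timeout (first hunk below). The field names mirror the driver's, but the struct itself, the limit_ms parameter, and main() are illustrative assumptions rather than nvgpu code:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct channel_gk20a: just the timeout bookkeeping. */
struct chan {
	unsigned int gpfifo_get;           /* simulated hardware GET pointer */
	unsigned int timeout_gpfifo_get;   /* GET seen at the previous check */
	unsigned int timeout_accumulated_ms;
};

/* Accumulate while GET is stuck; restart the count and report progress
 * once GET has advanced. limit_ms and the return value are assumptions. */
static bool update_and_check_timeout(struct chan *ch, unsigned int delta_ms,
				     unsigned int limit_ms, bool *progress)
{
	if (ch->gpfifo_get == ch->timeout_gpfifo_get) {
		ch->timeout_accumulated_ms += delta_ms;  /* no advance seen */
		*progress = false;
	} else {
		ch->timeout_accumulated_ms = delta_ms;   /* restart the count */
		*progress = true;
	}
	ch->timeout_gpfifo_get = ch->gpfifo_get;
	return ch->timeout_accumulated_ms > limit_ms;
}

int main(void)
{
	struct chan ch = { 0 };
	bool progress;

	update_and_check_timeout(&ch, 100, 3000, &progress);
	update_and_check_timeout(&ch, 100, 3000, &progress);
	printf("stuck: acc=%u progress=%d\n",
	       ch.timeout_accumulated_ms, progress);  /* acc=200 progress=0 */

	ch.gpfifo_get++;  /* the channel advances */
	update_and_check_timeout(&ch, 100, 3000, &progress);
	printf("moved: acc=%u progress=%d\n",
	       ch.timeout_accumulated_ms, progress);  /* acc=100 progress=1 */
	return 0;
}

With no GET movement the accumulated time keeps growing across checks; a single advance restarts the count, which is exactly the signal the new bool *progress out-parameter reports to callers.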

drivers/gpu/nvgpu/gk20a/channel_gk20a.c

@@ -1539,16 +1539,19 @@ static inline u32 gp_free_count(struct channel_gk20a *c)
 }
 
 bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
-		u32 timeout_delta_ms)
+		u32 timeout_delta_ms, bool *progress)
 {
 	u32 gpfifo_get = update_gp_get(ch->g, ch);
 	/* Count consequent timeout isr */
 	if (gpfifo_get == ch->timeout_gpfifo_get) {
 		/* we didn't advance since previous channel timeout check */
 		ch->timeout_accumulated_ms += timeout_delta_ms;
+		*progress = false;
 	} else {
 		/* first timeout isr encountered */
 		ch->timeout_accumulated_ms = timeout_delta_ms;
+		*progress = true;
 	}
 	ch->timeout_gpfifo_get = gpfifo_get;

drivers/gpu/nvgpu/gk20a/channel_gk20a.h

@@ -218,7 +218,7 @@ int gk20a_init_channel_support(struct gk20a *, u32 chid);
 void gk20a_channel_close(struct channel_gk20a *ch);
 bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
-		u32 timeout_delta_ms);
+		u32 timeout_delta_ms, bool *progress);
 void gk20a_disable_channel(struct channel_gk20a *ch);
 void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt);
 void gk20a_channel_abort_clean_up(struct channel_gk20a *ch);

drivers/gpu/nvgpu/gk20a/fifo_gk20a.c

@@ -1814,17 +1814,24 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
 		if (ctx_status ==
 		    fifo_engine_status_ctx_status_ctxsw_load_v()) {
 			id = fifo_engine_status_next_id_v(status);
-			is_tsg = fifo_pbdma_status_id_type_v(status)
-				!= fifo_pbdma_status_id_type_chid_v();
+			is_tsg = fifo_engine_status_next_id_type_v(status) !=
+				fifo_engine_status_next_id_type_chid_v();
 		} else if (ctx_status ==
 		    fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2));
-			if (mailbox2 & FECS_METHOD_WFI_RESTORE)
+			if (mailbox2 & FECS_METHOD_WFI_RESTORE) {
 				id = fifo_engine_status_next_id_v(status);
-			else
+				is_tsg = fifo_engine_status_next_id_type_v(status) !=
+					fifo_engine_status_next_id_type_chid_v();
+			} else {
 				id = fifo_engine_status_id_v(status);
+				is_tsg = fifo_engine_status_id_type_v(status) !=
+					fifo_engine_status_id_type_chid_v();
+			}
 		} else {
 			id = fifo_engine_status_id_v(status);
+			is_tsg = fifo_engine_status_id_type_v(status) !=
+				fifo_engine_status_id_type_chid_v();
 		}
 		break;
 	}
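The hunk above fixes the is_tsg decode to use the engine status accessors (next_id_type/id_type) instead of the pbdma status ones, so the id type comes from the same register as the id itself. Below is a toy model of this kind of decode; the field positions are made up for illustration (the real gk20a layout comes from generated hardware headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ID_TYPE_CHID 0u  /* assumed encoding: 0 = channel id, 1 = TSG id */

/* Assumed layout: bits 11:0 = id, bit 12 = id_type. Illustrative only. */
static uint32_t status_id(uint32_t status)      { return status & 0xfffu; }
static uint32_t status_id_type(uint32_t status) { return (status >> 12) & 0x1u; }

int main(void)
{
	uint32_t status = (1u << 12) | 7u;  /* pretend the engine reports TSG 7 */
	bool is_tsg = status_id_type(status) != ID_TYPE_CHID;

	printf("id=%u is_tsg=%d\n", status_id(status), is_tsg);
	return 0;
}

Decoding the type bit from the wrong status word, as the old code did, yields an id/is_tsg pair that does not match any runnable entity, which is how the timeout interrupts ended up ignored.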
@@ -1835,6 +1842,97 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
 	return active_engine_id;
 }
 
+static bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
+		bool *verbose, u32 *ms)
+{
+	bool recover = false;
+	bool progress = false;
+
+	if (gk20a_channel_get(ch)) {
+		recover = gk20a_channel_update_and_check_timeout(ch,
+				GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000,
+				&progress);
+		*verbose = ch->timeout_debug_dump;
+		*ms = ch->timeout_accumulated_ms;
+		if (recover)
+			gk20a_set_error_notifier(ch,
+				NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+		gk20a_channel_put(ch);
+	}
+
+	return recover;
+}
+
+static bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
+		bool *verbose, u32 *ms)
+{
+	struct channel_gk20a *ch;
+	bool recover = false;
+	bool progress = false;
+
+	*verbose = false;
+	*ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000;
+
+	mutex_lock(&tsg->ch_list_lock);
+
+	/* check if there was some progress on any of the TSG channels.
+	 * fifo recovery is needed if at least one channel reached the
+	 * maximum timeout without progress (update in gpfifo pointers).
+	 */
+	list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+		if (gk20a_channel_get(ch)) {
+			recover = gk20a_channel_update_and_check_timeout(ch,
+					*ms, &progress);
+			if (progress || recover)
+				break;
+			gk20a_channel_put(ch);
+		}
+	}
+
+	/* if at least one channel in the TSG made some progress, reset
+	 * the accumulated timeout for all channels in the TSG. In
+	 * particular, this resets the timeout for channels that have
+	 * already completed their work.
+	 */
+	if (progress) {
+		gk20a_dbg_info("progress on tsg=%d ch=%d",
+				tsg->tsgid, ch->hw_chid);
+		gk20a_channel_put(ch);
+		*ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000;
+		list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+			if (gk20a_channel_get(ch)) {
+				ch->timeout_accumulated_ms = *ms;
+				gk20a_channel_put(ch);
+			}
+		}
+	}
+
+	/* if one channel is presumed dead (no progress for too long), then
+	 * fifo recovery is needed. we can't really figure out which channel
+	 * caused the problem, so set the timeout error notifier for all
+	 * channels.
+	 */
+	if (recover) {
+		gk20a_dbg_info("timeout on tsg=%d ch=%d",
+				tsg->tsgid, ch->hw_chid);
+		*ms = ch->timeout_accumulated_ms;
+		gk20a_channel_put(ch);
+		list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+			if (gk20a_channel_get(ch)) {
+				gk20a_set_error_notifier(ch,
+					NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+				*verbose |= ch->timeout_debug_dump;
+				gk20a_channel_put(ch);
+			}
+		}
+	}
+
+	/* if we could not detect progress on any of the channels, but none
+	 * of them has reached the timeout, there is nothing more to do:
+	 * timeout_accumulated_ms has been updated for all of them.
+	 */
+	mutex_unlock(&tsg->ch_list_lock);
+
+	return recover;
+}
+
 static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 {
 	u32 sched_error;
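To see the recovery policy of gk20a_fifo_check_tsg_ctxsw_timeout in isolation, here is a self-contained model. Only the decision logic mirrors the diff (progress on any channel forgives the whole group; a channel stuck past the limit triggers recovery); the types, constants, and main() are assumptions for illustration:

#include <stdbool.h>
#include <stdio.h>

#define NCHAN     3
#define LIMIT_MS  3000u
#define PERIOD_MS 100u

struct chan {
	unsigned int gpfifo_get;      /* simulated GET pointer */
	unsigned int last_get;        /* GET at the previous check */
	unsigned int accumulated_ms;
};

static bool check_tsg(struct chan *chans, int n)
{
	bool progress = false, recover = false;
	int i;

	/* stop at the first channel that made progress or timed out */
	for (i = 0; i < n; i++) {
		struct chan *ch = &chans[i];

		if (ch->gpfifo_get == ch->last_get) {
			ch->accumulated_ms += PERIOD_MS;  /* still stuck */
			progress = false;
		} else {
			ch->accumulated_ms = PERIOD_MS;   /* restart count */
			progress = true;
		}
		ch->last_get = ch->gpfifo_get;
		recover = ch->accumulated_ms > LIMIT_MS;
		if (progress || recover)
			break;
	}

	/* one live channel forgives the whole group */
	if (progress)
		for (i = 0; i < n; i++)
			chans[i].accumulated_ms = PERIOD_MS;

	return recover;  /* true => the caller should kick fifo recovery */
}

int main(void)
{
	struct chan chans[NCHAN] = { 0 };
	int t = 0;

	chans[1].gpfifo_get = 1;  /* channel 1 advanced since the last check */
	printf("recover=%d\n", check_tsg(chans, NCHAN));  /* 0: progress seen */

	while (!check_tsg(chans, NCHAN))  /* nobody advances any more */
		t++;
	printf("recovery needed after %d further idle checks\n", t + 1);
	return 0;
}

The reference counting (gk20a_channel_get/put) and the error-notifier fan-out are deliberately omitted; the model only demonstrates why the TSG is treated as a unit, both when forgiving timeouts and when recovering.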
@@ -1859,50 +1957,40 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 	if (fifo_intr_sched_error_code_f(sched_error) ==
 			fifo_intr_sched_error_code_ctxsw_timeout_v()) {
 		struct fifo_gk20a *f = &g->fifo;
-		struct channel_gk20a *ch = &f->channel[id];
+		u32 ms = 0;
+		bool verbose = false;
 
 		if (is_tsg) {
-			gk20a_channel_timeout_restart_all_channels(g);
-			gk20a_fifo_recover(g, BIT(engine_id), id, true,
-					true, true);
-			ret = true;
-			goto err;
+			ret = gk20a_fifo_check_tsg_ctxsw_timeout(
+					&f->tsg[id], &verbose, &ms);
+		} else {
+			ret = gk20a_fifo_check_ch_ctxsw_timeout(
+					&f->channel[id], &verbose, &ms);
 		}
 
-		if (!gk20a_channel_get(ch))
-			goto err;
-
-		if (gk20a_channel_update_and_check_timeout(ch,
-			GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) {
-			gk20a_set_error_notifier(ch,
-				NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+		if (ret) {
 			gk20a_err(dev_from_gk20a(g),
-				"fifo sched ctxsw timeout error:"
-				"engine = %u, ch = %d", engine_id, id);
-			gk20a_gr_debug_dump(g->dev);
+				"fifo sched ctxsw timeout error: "
+				"engine=%u, %s=%d, ms=%u",
+				engine_id, is_tsg ? "tsg" : "ch", id, ms);
 			/*
 			 * Cancel all channels' timeout since SCHED error might
 			 * trigger multiple watchdogs at a time
 			 */
 			gk20a_channel_timeout_restart_all_channels(g);
-			gk20a_fifo_recover(g, BIT(engine_id), id, false,
-					true, ch->timeout_debug_dump);
-			ret = true;
+			gk20a_fifo_recover(g, BIT(engine_id), id,
+					is_tsg, true, verbose);
 		} else {
 			gk20a_dbg_info(
-				"fifo is waiting for ctx switch for %d ms,"
-				"ch = %d\n",
-				ch->timeout_accumulated_ms,
-				id);
-			ret = false;
+				"fifo is waiting for ctx switch for %d ms, "
+				"%s=%d", ms, is_tsg ? "tsg" : "ch", id);
 		}
-		gk20a_channel_put(ch);
-		return ret;
+	} else {
+		gk20a_err(dev_from_gk20a(g),
+			"fifo sched error : 0x%08x, engine=%u, %s=%d",
+			sched_error, engine_id, is_tsg ? "tsg" : "ch", id);
 	}
 
-	gk20a_err(dev_from_gk20a(g), "fifo sched error : 0x%08x, engine=%u, %s=%d",
-		  sched_error, engine_id, is_tsg ? "tsg" : "ch", id);
-
-err:
 	return ret;
 }
@@ -1913,7 +2001,7 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
 	struct device *dev = dev_from_gk20a(g);
 	u32 handled = 0;
 
-	gk20a_dbg_fn("");
+	gk20a_dbg_fn("fifo_intr=0x%08x", fifo_intr);
 
 	if (fifo_intr & fifo_intr_0_pio_error_pending_f()) {
 		/* pio mode is unused. this shouldn't happen, ever. */