gpu: nvgpu: add rc_type input param to gk20a_fifo_recover

Add the rc_types below, to be passed to gk20a_fifo_recover:
MMU_FAULT
PBDMA_FAULT
GR_FAULT
PREEMPT_TIMEOUT
CTXSW_TIMEOUT
RUNLIST_UPDATE_TIMEOUT
FORCE_RESET
SCHED_ERR
This is useful for identifying what triggered recovery.

Bug 2065990

Change-Id: I202268c5f237be2180b438e8ba027fce684967b6
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1662619
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Seema Khowala
2018-02-22 13:00:25 -08:00
committed by mobile promotions
parent bf03799977
commit c9463fdbb3
4 changed files with 53 additions and 28 deletions

View File

@@ -1817,7 +1817,7 @@ static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg)
return engines; return engines;
} }
void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose) void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose, int rc_type)
{ {
u32 engines; u32 engines;
@@ -1829,7 +1829,8 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose)
engines = gk20a_fifo_engines_on_id(g, chid, false); engines = gk20a_fifo_engines_on_id(g, chid, false);
if (engines) if (engines)
gk20a_fifo_recover(g, engines, chid, false, true, verbose); gk20a_fifo_recover(g, engines, chid, false, true, verbose,
rc_type);
else { else {
struct channel_gk20a *ch = &g->fifo.channel[chid]; struct channel_gk20a *ch = &g->fifo.channel[chid];
@@ -1847,7 +1848,8 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose)
nvgpu_mutex_release(&g->dbg_sessions_lock); nvgpu_mutex_release(&g->dbg_sessions_lock);
} }
void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose) void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose,
int rc_type)
{ {
u32 engines; u32 engines;
@@ -1859,7 +1861,8 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
engines = gk20a_fifo_engines_on_id(g, tsgid, true); engines = gk20a_fifo_engines_on_id(g, tsgid, true);
if (engines) if (engines)
gk20a_fifo_recover(g, engines, tsgid, true, true, verbose); gk20a_fifo_recover(g, engines, tsgid, true, true, verbose,
rc_type);
else { else {
struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
@@ -1956,7 +1959,7 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
u32 hw_id, bool id_is_tsg, u32 hw_id, bool id_is_tsg,
bool id_is_known, bool verbose) bool id_is_known, bool verbose, int rc_type)
{ {
unsigned int id_type; unsigned int id_type;
@@ -1972,7 +1975,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
id_type = ID_TYPE_UNKNOWN; id_type = ID_TYPE_UNKNOWN;
g->ops.fifo.teardown_ch_tsg(g, __engine_ids, hw_id, id_type, g->ops.fifo.teardown_ch_tsg(g, __engine_ids, hw_id, id_type,
RC_TYPE_NORMAL, NULL); rc_type, NULL);
} }
/* force reset channel and tsg (if it's part of one) */ /* force reset channel and tsg (if it's part of one) */
@@ -1998,10 +2001,12 @@ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
} }
nvgpu_rwsem_up_read(&tsg->ch_list_lock); nvgpu_rwsem_up_read(&tsg->ch_list_lock);
gk20a_fifo_recover_tsg(g, ch->tsgid, verbose); gk20a_fifo_recover_tsg(g, ch->tsgid, verbose,
RC_TYPE_FORCE_RESET);
} else { } else {
g->ops.fifo.set_error_notifier(ch, err_code); g->ops.fifo.set_error_notifier(ch, err_code);
gk20a_fifo_recover_ch(g, ch->chid, verbose); gk20a_fifo_recover_ch(g, ch->chid, verbose,
RC_TYPE_FORCE_RESET);
} }
return 0; return 0;
@@ -2288,7 +2293,8 @@ bool gk20a_fifo_handle_sched_error(struct gk20a *g)
*/ */
gk20a_channel_timeout_restart_all_channels(g); gk20a_channel_timeout_restart_all_channels(g);
gk20a_fifo_recover(g, BIT(engine_id), id, gk20a_fifo_recover(g, BIT(engine_id), id,
is_tsg, true, verbose); is_tsg, true, verbose,
RC_TYPE_CTXSW_TIMEOUT);
} else { } else {
gk20a_dbg_info( gk20a_dbg_info(
"fifo is waiting for ctx switch for %d ms, " "fifo is waiting for ctx switch for %d ms, "
@@ -2542,7 +2548,7 @@ static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g,
if (gk20a_channel_get(ch)) { if (gk20a_channel_get(ch)) {
g->ops.fifo.set_error_notifier(ch, error_notifier); g->ops.fifo.set_error_notifier(ch, error_notifier);
gk20a_fifo_recover_ch(g, id, true); gk20a_fifo_recover_ch(g, id, true, RC_TYPE_PBDMA_FAULT);
gk20a_channel_put(ch); gk20a_channel_put(ch);
} }
} else if (fifo_pbdma_status_id_type_v(status) } else if (fifo_pbdma_status_id_type_v(status)
@@ -2560,7 +2566,7 @@ static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g,
} }
} }
nvgpu_rwsem_up_read(&tsg->ch_list_lock); nvgpu_rwsem_up_read(&tsg->ch_list_lock);
gk20a_fifo_recover_tsg(g, id, true); gk20a_fifo_recover_tsg(g, id, true, RC_TYPE_PBDMA_FAULT);
} }
} }
@@ -2578,8 +2584,10 @@ u32 gk20a_fifo_handle_pbdma_intr(struct gk20a *g, struct fifo_gk20a *f,
nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
"pbdma id %d intr_0 0x%08x pending", "pbdma id %d intr_0 0x%08x pending",
pbdma_id, pbdma_intr_0); pbdma_id, pbdma_intr_0);
rc_type = g->ops.fifo.handle_pbdma_intr_0(g, pbdma_id,
pbdma_intr_0, &handled, &error_notifier); if (g->ops.fifo.handle_pbdma_intr_0(g, pbdma_id, pbdma_intr_0,
&handled, &error_notifier) != RC_TYPE_NO_RC)
rc_type = RC_TYPE_PBDMA_FAULT;
gk20a_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0); gk20a_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
} }
@@ -2587,8 +2595,10 @@ u32 gk20a_fifo_handle_pbdma_intr(struct gk20a *g, struct fifo_gk20a *f,
nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
"pbdma id %d intr_1 0x%08x pending", "pbdma id %d intr_1 0x%08x pending",
pbdma_id, pbdma_intr_1); pbdma_id, pbdma_intr_1);
rc_type = g->ops.fifo.handle_pbdma_intr_1(g, pbdma_id,
pbdma_intr_1, &handled, &error_notifier); if (g->ops.fifo.handle_pbdma_intr_1(g, pbdma_id, pbdma_intr_1,
&handled, &error_notifier) != RC_TYPE_NO_RC)
rc_type = RC_TYPE_PBDMA_FAULT;
gk20a_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1); gk20a_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
} }
@@ -2721,7 +2731,8 @@ void __locked_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
gk20a_channel_put(ch); gk20a_channel_put(ch);
} }
nvgpu_rwsem_up_read(&tsg->ch_list_lock); nvgpu_rwsem_up_read(&tsg->ch_list_lock);
gk20a_fifo_recover_tsg(g, id, true); gk20a_fifo_recover_tsg(g, id, true,
RC_TYPE_PREEMPT_TIMEOUT);
} else { } else {
struct channel_gk20a *ch = &g->fifo.channel[id]; struct channel_gk20a *ch = &g->fifo.channel[id];
@@ -2731,7 +2742,8 @@ void __locked_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
if (gk20a_channel_get(ch)) { if (gk20a_channel_get(ch)) {
g->ops.fifo.set_error_notifier(ch, g->ops.fifo.set_error_notifier(ch,
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
gk20a_fifo_recover_ch(g, id, true); gk20a_fifo_recover_ch(g, id, true,
RC_TYPE_PREEMPT_TIMEOUT);
gk20a_channel_put(ch); gk20a_channel_put(ch);
} }
} }
@@ -3024,7 +3036,8 @@ static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id)
} }
if (engines) if (engines)
gk20a_fifo_recover(g, engines, ~(u32)0, false, false, true); gk20a_fifo_recover(g, engines, ~(u32)0, false, false, true,
RC_TYPE_RUNLIST_UPDATE_TIMEOUT);
} }
int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id) int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id)

View File

@@ -57,10 +57,15 @@ enum {
#define GRFIFO_TIMEOUT_CHECK_PERIOD_US 100000 #define GRFIFO_TIMEOUT_CHECK_PERIOD_US 100000
#define RC_TYPE_NORMAL 0 #define RC_TYPE_NO_RC 0
#define RC_TYPE_MMU_FAULT 1 #define RC_TYPE_MMU_FAULT 1
#define RC_TYPE_PBDMA_FAULT 2 #define RC_TYPE_PBDMA_FAULT 2
#define RC_TYPE_NO_RC 0xff #define RC_TYPE_GR_FAULT 3
#define RC_TYPE_PREEMPT_TIMEOUT 4
#define RC_TYPE_CTXSW_TIMEOUT 5
#define RC_TYPE_RUNLIST_UPDATE_TIMEOUT 6
#define RC_TYPE_FORCE_RESET 7
#define RC_TYPE_SCHED_ERR 8
#define NVGPU_FIFO_DEFAULT_TIMESLICE_TIMEOUT 128UL #define NVGPU_FIFO_DEFAULT_TIMESLICE_TIMEOUT 128UL
#define NVGPU_FIFO_DEFAULT_TIMESLICE_SCALE 3UL #define NVGPU_FIFO_DEFAULT_TIMESLICE_SCALE 3UL
@@ -256,9 +261,11 @@ void gk20a_fifo_recover(struct gk20a *g,
u32 engine_ids, /* if zero, will be queried from HW */ u32 engine_ids, /* if zero, will be queried from HW */
u32 hw_id, /* if ~0, will be queried from HW */ u32 hw_id, /* if ~0, will be queried from HW */
bool hw_id_is_tsg, /* ignored if hw_id == ~0 */ bool hw_id_is_tsg, /* ignored if hw_id == ~0 */
bool id_is_known, bool verbose); bool id_is_known, bool verbose, int rc_type);
void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose); void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose,
void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); int rc_type);
void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose,
int rc_type);
int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
u32 err_code, bool verbose); u32 err_code, bool verbose);
void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id); void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id);

View File

@@ -6088,13 +6088,16 @@ int gk20a_gr_isr(struct gk20a *g)
if (need_reset) { if (need_reset) {
if (tsgid != NVGPU_INVALID_TSG_ID) if (tsgid != NVGPU_INVALID_TSG_ID)
gk20a_fifo_recover(g, gr_engine_id, gk20a_fifo_recover(g, gr_engine_id,
tsgid, true, true, true); tsgid, true, true, true,
RC_TYPE_GR_FAULT);
else if (ch) else if (ch)
gk20a_fifo_recover(g, gr_engine_id, gk20a_fifo_recover(g, gr_engine_id,
ch->chid, false, true, true); ch->chid, false, true, true,
RC_TYPE_GR_FAULT);
else else
gk20a_fifo_recover(g, gr_engine_id, gk20a_fifo_recover(g, gr_engine_id,
0, false, false, true); 0, false, false, true,
RC_TYPE_GR_FAULT);
} }
if (gr_intr && !ch) { if (gr_intr && !ch) {

View File

@@ -1306,7 +1306,8 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g)
if (sched_error == SCHED_ERROR_CODE_BAD_TSG ) { if (sched_error == SCHED_ERROR_CODE_BAD_TSG ) {
/* id is unknown, preempt all runlists and do recovery */ /* id is unknown, preempt all runlists and do recovery */
gk20a_fifo_recover(g, 0, 0, false, false, false); gk20a_fifo_recover(g, 0, 0, false, false, false,
RC_TYPE_SCHED_ERR);
} }
return false; return false;
@@ -1465,7 +1466,8 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g, u32 fifo_intr)
/* Cancel all channels' timeout */ /* Cancel all channels' timeout */
gk20a_channel_timeout_restart_all_channels(g); gk20a_channel_timeout_restart_all_channels(g);
gk20a_fifo_recover(g, BIT(active_eng_id), tsgid, gk20a_fifo_recover(g, BIT(active_eng_id), tsgid,
true, true, verbose); true, true, verbose,
RC_TYPE_CTXSW_TIMEOUT);
} else { } else {
gk20a_dbg_info( gk20a_dbg_info(
"fifo is waiting for ctx switch: " "fifo is waiting for ctx switch: "