diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index 0c99d6cf9..ceee4fb7f 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -1367,6 +1367,74 @@ u32 nvgpu_gp_free_count(struct channel_gk20a *c)
 		c->gpfifo.entry_num;
 }
 
+/*
+ * Report whether a debug dump is wanted for this channel. The default is
+ * true; if the idle-timeout error notifier is set, the channel's
+ * timeout_debug_dump flag decides instead. A NULL channel keeps the default.
+ * Caller must hold a channel reference.
+ */
+static bool nvgpu_channel_timeout_debug_dump_state(struct gk20a *g,
+		struct channel_gk20a *ch)
+{
+	bool verbose = true;
+	if (ch == NULL) {
+		return verbose;
+	}
+
+	if (nvgpu_is_error_notifier_set(ch,
+		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT)) {
+		verbose = ch->timeout_debug_dump;
+	}
+
+	return verbose;
+}
+
+/*
+ * Mark the channel as timed out and wake up anything blocked on its
+ * semaphore and notifier wait queues. Does nothing for a NULL channel.
+ * Caller must hold a channel reference.
+ */
+static void nvgpu_channel_set_has_timedout_and_wakeup_wqs(struct gk20a *g,
+		struct channel_gk20a *ch)
+{
+	if (ch != NULL) {
+		/* mark channel as faulted */
+		gk20a_channel_set_timedout(ch);
+
+		/* unblock pending waits */
+		nvgpu_cond_broadcast_interruptible(&ch->semaphore_wq);
+		nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
+	}
+}
+
+/*
+ * Flag an error on the channel: mark it timed out, wake its waiters, and
+ * return whether the caller should produce a verbose debug dump.
+ * Caller must hold a channel reference.
+ */
+bool nvgpu_channel_mark_error(struct gk20a *g, struct channel_gk20a *ch)
+{
+	bool verbose;
+
+	verbose = nvgpu_channel_timeout_debug_dump_state(g, ch);
+	nvgpu_channel_set_has_timedout_and_wakeup_wqs(g, ch);
+
+	return verbose;
+}
+
+/*
+ * Log an MMU fault against the channel and set the MMU-fault error notifier
+ * through the fifo HAL. Caller must hold a channel reference.
+ */
+void nvgpu_channel_set_ctx_mmu_error(struct gk20a *g,
+		struct channel_gk20a *ch)
+{
+	nvgpu_err(g,
+		"channel %d generated a mmu fault", ch->chid);
+	g->ops.fifo.set_error_notifier(ch,
+				NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
+}
+
 bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		u32 timeout_delta_ms, bool *progress)
 {
@@ -1389,6 +1457,33 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
+/*
+ * Run the timeout check for the channel using the engine timeout
+ * (g->fifo_eng_timeout_us converted to ms). *verbose receives the channel's
+ * timeout_debug_dump flag and *ms its accumulated timeout. When the check
+ * reports true, the idle-timeout error notifier is set and true is returned.
+ * Caller must hold a channel reference.
+ */
+bool nvgpu_channel_check_ctxsw_timeout(struct channel_gk20a *ch,
+		bool *verbose, u32 *ms)
+{
+	bool recover = false;
+	bool progress = false;
+	struct gk20a *g = ch->g;
+
+	recover = gk20a_channel_update_and_check_timeout(ch,
+			g->fifo_eng_timeout_us / 1000U,
+			&progress);
+	*verbose = ch->timeout_debug_dump;
+	*ms = ch->timeout_accumulated_ms;
+	if (recover) {
+		g->ops.fifo.set_error_notifier(ch,
+			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
+	}
+
+	return recover;
+}
+
 u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 60bbfce86..bf88c9932 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1428,49 +1428,6 @@ bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
 	return g->ops.fifo.is_fault_engine_subid_gpc(g, engine_subid);
 }
 
-/* caller must hold a channel reference */
-static bool gk20a_fifo_ch_timeout_debug_dump_state(struct gk20a *g,
-		struct channel_gk20a *refch)
-{
-	bool verbose = true;
-	if (refch == NULL) {
-		return verbose;
-	}
-
-	if (nvgpu_is_error_notifier_set(refch,
-		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT)) {
-		verbose = refch->timeout_debug_dump;
-	}
-
-	return verbose;
-}
-
-/* caller must hold a channel reference */
-static void gk20a_fifo_set_has_timedout_and_wake_up_wqs(struct gk20a *g,
-		struct channel_gk20a *refch)
-{
-	if (refch != NULL) {
-		/* mark channel as faulted */
-		gk20a_channel_set_timedout(refch);
-
-		/* unblock pending waits */
-		nvgpu_cond_broadcast_interruptible(&refch->semaphore_wq);
-		nvgpu_cond_broadcast_interruptible(&refch->notifier_wq);
-	}
-}
-
-/* caller must hold a channel reference */
-bool gk20a_fifo_error_ch(struct gk20a *g,
-		struct channel_gk20a *refch)
-{
-	bool verbose;
-
-	verbose = gk20a_fifo_ch_timeout_debug_dump_state(g, refch);
-	gk20a_fifo_set_has_timedout_and_wake_up_wqs(g, refch);
-
-	return verbose;
-}
-
 bool gk20a_fifo_error_tsg(struct gk20a *g,
 			struct tsg_gk20a *tsg)
 {
@@ -1480,7 +1437,7 @@ bool gk20a_fifo_error_tsg(struct gk20a *g,
 	nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 	nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
 		if (gk20a_channel_get(ch) != NULL) {
-			if (gk20a_fifo_error_ch(g, ch)) {
+			if (nvgpu_channel_mark_error(g, ch)) {
 				verbose = true;
 			}
 			gk20a_channel_put(ch);
@@ -1491,15 +1448,6 @@ bool gk20a_fifo_error_tsg(struct gk20a *g,
 	return verbose;
 
 }
-/* caller must hold a channel reference */
-void gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
-		struct channel_gk20a *refch)
-{
-	nvgpu_err(g,
-		"channel %d generated a mmu fault", refch->chid);
-	g->ops.fifo.set_error_notifier(refch,
-				NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
-}
 
 void gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 		struct tsg_gk20a *tsg)
@@ -1512,7 +1460,7 @@ void gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 	nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 	nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
 		if (gk20a_channel_get(ch) != NULL) {
-			gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+			nvgpu_channel_set_ctx_mmu_error(g, ch);
 			gk20a_channel_put(ch);
 		}
 	}
@@ -1789,11 +1737,11 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 				g->ops.fifo.disable_channel(ch);
 			} else {
 				if (!fake_fault) {
-					gk20a_fifo_set_ctx_mmu_error_ch(
+					nvgpu_channel_set_ctx_mmu_error(
 						g, refch);
 				}
 
-				verbose = gk20a_fifo_error_ch(g,
+				verbose = nvgpu_channel_mark_error(g,
 					refch);
 				gk20a_channel_abort(ch, false);
 			}
@@ -1932,7 +1880,7 @@ void gk20a_fifo_recover_ch(struct gk20a *g, struct channel_gk20a *ch,
 	} else {
 		gk20a_channel_abort(ch, false);
 
-		if (gk20a_fifo_error_ch(g, ch)) {
+		if (nvgpu_channel_mark_error(g, ch)) {
 			gk20a_debug_dump(g);
 		}
 	}
@@ -2272,29 +2220,6 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
 	return active_engine_id;
 }
 
-bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
-		bool *verbose, u32 *ms)
-{
-	bool recover = false;
-	bool progress = false;
-	struct gk20a *g = ch->g;
-
-	if (gk20a_channel_get(ch) != NULL) {
-		recover = gk20a_channel_update_and_check_timeout(ch,
-				g->fifo_eng_timeout_us / 1000U,
-				&progress);
-		*verbose = ch->timeout_debug_dump;
-		*ms = ch->timeout_accumulated_ms;
-		if (recover) {
-			g->ops.fifo.set_error_notifier(ch,
-				NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
-		}
-
-		gk20a_channel_put(ch);
-	}
-	return recover;
-}
-
 bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
 		bool *verbose, u32 *ms)
 {
@@ -2380,6 +2305,7 @@ bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 	u32 id = U32_MAX;
 	bool is_tsg = false;
 	bool ret = false;
+	struct channel_gk20a *ch = NULL;
 
 	/* read the scheduler error register */
 	sched_error = gk20a_readl(g, fifo_intr_sched_error_r());
@@ -2411,8 +2337,16 @@ bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 			ret = g->ops.fifo.check_tsg_ctxsw_timeout(
 					&f->tsg[id], &verbose, &ms);
 		} else {
-			ret = g->ops.fifo.check_ch_ctxsw_timeout(
-					&f->channel[id], &verbose, &ms);
+			ch = gk20a_channel_from_id(g, id);
+			if (ch != NULL) {
+				ret = g->ops.fifo.check_ch_ctxsw_timeout(
+					ch, &verbose, &ms);
+
+				gk20a_channel_put(ch);
+			} else {
+				/* skip recovery since channel is null */
+				ret = false;
+			}
 		}
 
 		if (ret) {
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 05158abb4..109b5b24e 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -317,10 +317,7 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
 void gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 		struct tsg_gk20a *tsg);
 void gk20a_fifo_abort_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool preempt);
-void gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
-		struct channel_gk20a *refch);
 bool gk20a_fifo_error_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
-bool gk20a_fifo_error_ch(struct gk20a *g, struct channel_gk20a *refch);
 
 void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg);
 int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
@@ -452,8 +449,6 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 			u32 hw_id, unsigned int id_type, unsigned int rc_type,
 			 struct mmu_fault_info *mmfault);
 
-bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
-		bool *verbose, u32 *ms);
 bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
 		bool *verbose, u32 *ms);
 bool gk20a_fifo_handle_sched_error(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index ca147e352..41c557205 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -531,7 +531,7 @@ static const struct gpu_ops gm20b_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier,
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index 5d80c64af..5dae5cbc5 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -581,7 +581,7 @@ static const struct gpu_ops gp10b_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier,
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index aef288e68..4dd890541 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -751,7 +751,7 @@ static const struct gpu_ops gv100_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier_if_empty,
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index d49af1626..38d3e7931 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -705,7 +705,7 @@ static const struct gpu_ops gv11b_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier_if_empty,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 5c7f32e55..58a7c6ba4 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -364,8 +364,14 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid);
 void gk20a_channel_close(struct channel_gk20a *ch);
 void __gk20a_channel_kill(struct channel_gk20a *ch);
 
+void nvgpu_channel_set_ctx_mmu_error(struct gk20a *g,
+		struct channel_gk20a *ch);
+bool nvgpu_channel_mark_error(struct gk20a *g, struct channel_gk20a *ch);
+
 bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		u32 timeout_delta_ms, bool *progress);
+bool nvgpu_channel_check_ctxsw_timeout(struct channel_gk20a *ch,
+		bool *verbose, u32 *ms);
 void gk20a_disable_channel(struct channel_gk20a *ch);
 void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt);
 void gk20a_channel_abort_clean_up(struct channel_gk20a *ch);
diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c
index 445db9d90..cd4e4e32a 100644
--- a/drivers/gpu/nvgpu/tu104/hal_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c
@@ -779,7 +779,7 @@ static const struct gpu_ops tu104_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier_if_empty,
diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
index aed65c1e4..0b48d4e4d 100644
--- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
+++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
@@ -407,7 +407,7 @@ static const struct gpu_ops vgpu_gp10b_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier,
diff --git a/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
index 67bfbc367..14586c6d7 100644
--- a/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
+++ b/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
@@ -479,7 +479,7 @@ static const struct gpu_ops vgpu_gv11b_ops = {
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,
-		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
+		.check_ch_ctxsw_timeout = nvgpu_channel_check_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
 		.set_error_notifier = nvgpu_set_error_notifier,