mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 10:34:43 +03:00
gpu: nvgpu: stop timer on failing channel
In gk20a_channel_timeout_handler(), the following deadlock scenario is possible:

thread 1:
- takes the global lock g->ch_wdt_lock
- identifies the timed-out channel (ch1)
- checks the engine status, which is stuck
- identifies the failing channel on the engine as ch2
- needs to trigger recovery with ch2
- as part of recovery, calls channel_abort() for ch2
- in channel_abort(), waits to cancel the timer wq
- but the timer wq for ch2 never completes because of thread 2

thread 2:
- ch2 has already timed out
- to process the timeout, it waits for the global lock g->ch_wdt_lock
- this lock needs to be released by thread 1

To fix this, cancel the timer (through a flag) of ch2 (the failing channel on the engine) before triggering recovery on that channel.

Bug 200164753

Change-Id: Idb42d01c8440a53f43cb5e87e41f1c283f7e8fcf
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/929924
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
committed by
Terje Bergstrom
parent
9713e3572a
commit
0ce201e8de
@@ -1700,9 +1700,14 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
|
||||
if (!failing_ch)
|
||||
goto fail_enable_ctxsw;
|
||||
|
||||
if (failing_ch->hw_chid != ch->hw_chid)
|
||||
if (failing_ch->hw_chid != ch->hw_chid) {
|
||||
gk20a_channel_timeout_start(ch, job);
|
||||
|
||||
mutex_lock(&failing_ch->timeout.lock);
|
||||
failing_ch->timeout.initialized = false;
|
||||
mutex_unlock(&failing_ch->timeout.lock);
|
||||
}
|
||||
|
||||
gk20a_fifo_recover(g, BIT(engine_id),
|
||||
failing_ch->hw_chid, is_tsg,
|
||||
true, failing_ch->timeout_debug_dump);
|
||||
|
||||
Reference in New Issue
Block a user