diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index 08a808bba..40af71ad4 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -1694,12 +1694,9 @@ static bool nvgpu_channel_ctxsw_timeout_debug_dump_state(
 	return verbose;
 }
 
-static void nvgpu_channel_set_has_timedout_and_wakeup_wqs(struct gk20a *g,
-		struct nvgpu_channel *ch)
+void nvgpu_channel_wakeup_wqs(struct gk20a *g,
+		struct nvgpu_channel *ch)
 {
-	/* mark channel as faulted */
-	nvgpu_channel_set_unserviceable(ch);
-
 	/* unblock pending waits */
 	if (nvgpu_cond_broadcast_interruptible(&ch->semaphore_wq) != 0) {
 		nvgpu_warn(g, "failed to broadcast");
@@ -1714,7 +1711,11 @@ bool nvgpu_channel_mark_error(struct gk20a *g, struct nvgpu_channel *ch)
 	bool verbose;
 
 	verbose = nvgpu_channel_ctxsw_timeout_debug_dump_state(ch);
-	nvgpu_channel_set_has_timedout_and_wakeup_wqs(g, ch);
+
+	/* mark channel as faulted */
+	nvgpu_channel_set_unserviceable(ch);
+
+	nvgpu_channel_wakeup_wqs(g, ch);
 
 	return verbose;
 }
@@ -1736,7 +1737,8 @@ void nvgpu_channel_sw_quiesce(struct gk20a *g)
 		if (ch != NULL) {
 			nvgpu_channel_set_error_notifier(g, ch,
 				NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
-			nvgpu_channel_set_has_timedout_and_wakeup_wqs(g, ch);
+			nvgpu_channel_set_unserviceable(ch);
+			nvgpu_channel_wakeup_wqs(g, ch);
 			nvgpu_channel_put(ch);
 		}
 	}
diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c
index a01139a7d..692499099 100644
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -423,6 +423,36 @@ clean_up_mutex:
 	return err;
 }
 
+void nvgpu_tsg_set_unserviceable(struct gk20a *g,
+		struct nvgpu_tsg *tsg)
+{
+	struct nvgpu_channel *ch = NULL;
+
+	nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+	nvgpu_list_for_each_entry(ch, &tsg->ch_list, nvgpu_channel, ch_entry) {
+		if (nvgpu_channel_get(ch) != NULL) {
+			nvgpu_channel_set_unserviceable(ch);
+			nvgpu_channel_put(ch);
+		}
+	}
+	nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+}
+
+void nvgpu_tsg_wakeup_wqs(struct gk20a *g,
+		struct nvgpu_tsg *tsg)
+{
+	struct nvgpu_channel *ch = NULL;
+
+	nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+	nvgpu_list_for_each_entry(ch, &tsg->ch_list, nvgpu_channel, ch_entry) {
+		if (nvgpu_channel_get(ch) != NULL) {
+			nvgpu_channel_wakeup_wqs(g, ch);
+			nvgpu_channel_put(ch);
+		}
+	}
+	nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+}
+
 bool nvgpu_tsg_mark_error(struct gk20a *g,
 		struct nvgpu_tsg *tsg)
 {
diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c
index edd61a6d2..e8e62001e 100644
--- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -188,7 +188,7 @@ void gk20a_fifo_intr_0_isr(struct gk20a *g)
 	}
 
 	if ((fifo_intr & fifo_intr_0_mmu_fault_pending_f()) != 0U) {
-		(void) gk20a_fifo_handle_mmu_fault(g, 0, INVAL_ID, false);
+		gk20a_fifo_handle_mmu_fault(g, 0, INVAL_ID, false);
 		clear_intr |= fifo_intr_0_mmu_fault_pending_f();
 	}
 
diff --git a/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.c
index 1545fe1ea..f29bd6c78 100644
--- a/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.c
@@ -231,7 +231,7 @@ void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g)
 	nvgpu_err(g, "dropped mmu fault (0x%08x)", fault_id);
 }
 
-bool gk20a_fifo_handle_mmu_fault_locked(
+void gk20a_fifo_handle_mmu_fault_locked(
 	struct gk20a *g,
 	u32 mmu_fault_engines, /* queried from HW if 0 */
 	u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/
@@ -240,7 +240,6 @@ bool gk20a_fifo_handle_mmu_fault_locked(
 	bool fake_fault;
 	unsigned long fault_id;
 	unsigned long engine_mmu_fault_id;
-	bool debug_dump = true;
 	struct nvgpu_engine_status_info engine_status;
 	bool deferred_reset_pending = false;
 
@@ -340,6 +339,28 @@ bool gk20a_fifo_handle_mmu_fault_locked(
 		}
 	}
 
+	/* Set the unserviceable flag right at the start of recovery to
+	 * reduce the window of the race between job submit and recovery
+	 * on the same TSG.
+	 * The unserviceable flag is checked during job submit and
+	 * prevents new jobs from being submitted to a TSG which is
+	 * headed for teardown.
+	 */
+	if (tsg != NULL) {
+		/* Set the error notifier before letting userspace
+		 * know about the faulty channel.
+		 * The unserviceable flag is moved early to
+		 * disallow submits on the broken channel. If
+		 * userspace checks the notifier code when a
+		 * submit fails, we need it set to convey to
+		 * userspace that the channel is no longer usable.
+		 */
+		if (!fake_fault) {
+			nvgpu_tsg_set_ctx_mmu_error(g, tsg);
+		}
+		nvgpu_tsg_set_unserviceable(g, tsg);
+	}
+
 	/* check if engine reset should be deferred */
 	if (engine_id != NVGPU_INVALID_ENG_ID) {
 #ifdef CONFIG_NVGPU_DEBUGGER
@@ -381,10 +402,7 @@ bool gk20a_fifo_handle_mmu_fault_locked(
 		if (deferred_reset_pending) {
 			g->ops.tsg.disable(tsg);
 		} else {
-			if (!fake_fault) {
-				nvgpu_tsg_set_ctx_mmu_error(g, tsg);
-			}
-			debug_dump = nvgpu_tsg_mark_error(g, tsg);
+			nvgpu_tsg_wakeup_wqs(g, tsg);
 			nvgpu_tsg_abort(g, tsg, false);
 		}
 
@@ -426,17 +444,14 @@ bool gk20a_fifo_handle_mmu_fault_locked(
 	if (nvgpu_cg_pg_enable(g) != 0) {
 		nvgpu_warn(g, "fail to enable power mgmt");
 	}
-	return debug_dump;
 }
 
-bool gk20a_fifo_handle_mmu_fault(
+void gk20a_fifo_handle_mmu_fault(
 	struct gk20a *g,
 	u32 mmu_fault_engines, /* queried from HW if 0 */
 	u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/
 	bool id_is_tsg)
 {
-	bool debug_dump;
-
 	nvgpu_log_fn(g, " ");
 
 	nvgpu_log_info(g, "acquire engines_reset_mutex");
@@ -444,13 +459,11 @@ bool gk20a_fifo_handle_mmu_fault(
 
 	nvgpu_runlist_lock_active_runlists(g);
 
-	debug_dump = gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
+	gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
 			hw_id, id_is_tsg);
 
 	nvgpu_runlist_unlock_active_runlists(g);
 
 	nvgpu_log_info(g, "release engines_reset_mutex");
 	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
-
-	return debug_dump;
 }
diff --git a/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.h b/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.h
index 6813a5fe4..3386720be 100644
--- a/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.h
+++ b/drivers/gpu/nvgpu/hal/fifo/mmu_fault_gk20a.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -39,10 +39,10 @@ void gk20a_fifo_mmu_fault_info_dump(struct gk20a *g, u32 engine_id,
 
 void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g);
 
-bool gk20a_fifo_handle_mmu_fault(struct gk20a *g, u32 mmu_fault_engines,
+void gk20a_fifo_handle_mmu_fault(struct gk20a *g, u32 mmu_fault_engines,
 	u32 hw_id, bool id_is_tsg);
 
-bool gk20a_fifo_handle_mmu_fault_locked(struct gk20a *g, u32 mmu_fault_engines,
+void gk20a_fifo_handle_mmu_fault_locked(struct gk20a *g, u32 mmu_fault_engines,
 	u32 hw_id, bool id_is_tsg);
 
 #endif /* NVGPU_FIFO_MMU_FAULT_GK20A_H */
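Note (illustrative sketch, not part of the patch): the comment added in gk20a_fifo_handle_mmu_fault_locked() explains that the unserviceable flag is checked during job submit so that new work is rejected on a TSG headed for teardown. A submit-side gate built on the existing nvgpu_channel_check_unserviceable() helper declared in include/nvgpu/channel.h could look roughly like the following; the wrapper function, its name and the chosen error code are assumptions for illustration only.

/* Illustrative only: reject submits on a channel that recovery has
 * already marked unserviceable. Because the error notifier is posted
 * before the flag is set, userspace that sees this submit fail can
 * read a meaningful notifier code.
 */
static int example_submit_gate(struct gk20a *g, struct nvgpu_channel *ch)
{
	if (nvgpu_channel_check_unserviceable(ch)) {
		nvgpu_err(g, "channel unserviceable, rejecting submit");
		return -ETIMEDOUT;
	}
	return 0;
}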
diff --git a/drivers/gpu/nvgpu/hal/rc/rc_gk20a.c b/drivers/gpu/nvgpu/hal/rc/rc_gk20a.c
index 622e7d186..cca661460 100644
--- a/drivers/gpu/nvgpu/hal/rc/rc_gk20a.c
+++ b/drivers/gpu/nvgpu/hal/rc/rc_gk20a.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -116,12 +116,8 @@ void gk20a_fifo_recover(struct gk20a *g, u32 eng_bitmask,
 		g->ops.fifo.intr_set_recover_mask(g);
 
 		g->ops.fifo.trigger_mmu_fault(g, engine_ids);
-		/*
-		 * Ignore the "Verbose" flag from
-		 * gk20a_fifo_handle_mmu_fault_locked since it is not needed
-		 * here
-		 */
-		(void) gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
+
+		gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
 				ref_id, ref_id_is_tsg);
 
 		g->ops.fifo.intr_unset_recover_mask(g);
diff --git a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c
index e19d94ede..f216dcaf9 100644
--- a/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c
@@ -175,6 +175,28 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 		"act_eng_bitmask = 0x%x, mmufault ptr = 0x%p",
 		 id, id_type, rc_type, act_eng_bitmask, mmufault);
 
+	/* Set the unserviceable flag right at the start of recovery to
+	 * reduce the window of the race between job submit and recovery
+	 * on the same TSG.
+	 * The unserviceable flag is checked during job submit and
+	 * prevents new jobs from being submitted to a TSG which is
+	 * headed for teardown.
+	 */
+	if (tsg != NULL) {
+		/* Set the error notifier before letting userspace
+		 * know about the faulty channel.
+		 * The unserviceable flag is moved early to
+		 * disallow submits on the broken channel. If
+		 * userspace checks the notifier code when a
+		 * submit fails, we need it set to convey to
+		 * userspace that the channel is no longer usable.
+		 */
+		if (rc_type == RC_TYPE_MMU_FAULT) {
+			nvgpu_tsg_set_ctx_mmu_error(g, tsg);
+		}
+		nvgpu_tsg_set_unserviceable(g, tsg);
+	}
+
 	if (rc_type == RC_TYPE_MMU_FAULT && mmufault != NULL) {
 		if(mmufault->faulted_pbdma != INVAL_ID) {
 			pbdma_bitmask = BIT32(mmufault->faulted_pbdma);
@@ -290,10 +312,7 @@ void gv11b_fifo_recover(struct gk20a *g, u32 act_eng_bitmask,
 			g->ops.tsg.disable(tsg);
 		} else {
 #endif
-			if (rc_type == RC_TYPE_MMU_FAULT) {
-				nvgpu_tsg_set_ctx_mmu_error(g, tsg);
-			}
-			(void)nvgpu_tsg_mark_error(g, tsg);
+			nvgpu_tsg_wakeup_wqs(g, tsg);
 			nvgpu_tsg_abort(g, tsg, false);
 #ifdef CONFIG_NVGPU_DEBUGGER
 		}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 2805eca3d..631ba2110 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -1050,6 +1050,18 @@ void nvgpu_channel_set_unserviceable(struct nvgpu_channel *ch);
 */
 bool nvgpu_channel_check_unserviceable(struct nvgpu_channel *ch);
 
+/**
+ * @brief Signal on wait queues (notify_wq and semaphore_wq).
+ *
+ * @param g [in]	Pointer to GPU driver struct.
+ * @param ch [in]	Channel pointer.
+ *
+ * Unblock pending waits on this channel (semaphore and error
+ * notifier wait queues).
+ *
+ */
+void nvgpu_channel_wakeup_wqs(struct gk20a *g, struct nvgpu_channel *ch);
+
 #ifdef CONFIG_NVGPU_USERD
 /**
 * @brief Channel userd physical address.
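Note (illustrative sketch, not part of the patch): nvgpu_channel_wakeup_wqs() only broadcasts; it is the combination with the unserviceable flag that lets a blocked waiter exit early once recovery has marked the channel dead. The sketch below assumes the NVGPU_COND_WAIT_INTERRUPTIBLE(cond, condition, timeout_ms) macro from include/nvgpu/cond.h, and payload_ready() is a placeholder for whatever condition the real waiter polls.

/* Illustrative waiter: sleeps on the semaphore wait queue and wakes up
 * either when its payload condition is met or when recovery marks the
 * channel unserviceable and broadcasts via nvgpu_channel_wakeup_wqs().
 */
static int example_wait_for_payload(struct nvgpu_channel *ch)
{
	return NVGPU_COND_WAIT_INTERRUPTIBLE(&ch->semaphore_wq,
			payload_ready(ch) ||
			nvgpu_channel_check_unserviceable(ch),
			0U);
}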
diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
index ed76a6195..f1b189799 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
@@ -541,6 +541,49 @@ void nvgpu_tsg_disable_sched(struct gk20a *g, struct nvgpu_tsg *tsg);
 int nvgpu_tsg_alloc_sm_error_states_mem(struct gk20a *g,
 		struct nvgpu_tsg *tsg,
 		u32 num_sm);
+
+/**
+ * @brief Mark all the referenceable channels of the tsg's channel
+ * list as unserviceable.
+ *
+ * @param g [in]	The GPU driver struct.
+ * @param tsg [in]	Pointer to TSG struct.
+ *
+ * The channels are set unserviceable to convey that an uncorrectable
+ * error has occurred on the channel. It is not possible to take more
+ * references on this channel and the only available option for
+ * userspace is to close the channel fd.
+ * - Acquire #nvgpu_tsg.ch_list_lock of the tsg.
+ * - For each entry of the channels in #nvgpu_tsg.ch_list of the tsg,
+ *   -- Get reference to the channel.
+ *   -- If the channel is referenceable,
+ *      --- Call #nvgpu_channel_set_unserviceable.
+ *      --- Put reference to the channel.
+ * - Release #nvgpu_tsg.ch_list_lock of the tsg.
+ *
+ */
+void nvgpu_tsg_set_unserviceable(struct gk20a *g, struct nvgpu_tsg *tsg);
+
+/**
+ * @brief Release waiters for all the referenceable channels of the
+ * tsg's channel list.
+ *
+ * @param g [in]	The GPU driver struct.
+ * @param tsg [in]	Pointer to TSG struct.
+ *
+ * Unblock pending waits on each channel (semaphore and error
+ * notifier wait queues).
+ * - Acquire #nvgpu_tsg.ch_list_lock of the tsg.
+ * - For each entry of the channels in #nvgpu_tsg.ch_list of the tsg,
+ *   -- Get reference to the channel.
+ *   -- If the channel is referenceable,
+ *      --- Call #nvgpu_channel_wakeup_wqs.
+ *      --- Put reference to the channel.
+ * - Release #nvgpu_tsg.ch_list_lock of the tsg.
+ *
+ */
+void nvgpu_tsg_wakeup_wqs(struct gk20a *g, struct nvgpu_tsg *tsg);
+
 #ifdef CONFIG_NVGPU_DEBUGGER
 int nvgpu_tsg_set_sm_exception_type_mask(struct nvgpu_channel *ch,
 		u32 exception_mask);
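Note (illustrative sketch, not part of the patch): taken together, the recovery paths in mmu_fault_gk20a.c and rc_gv11b.c now follow one ordering, summarized below using the helpers this patch adds or already relies on. This is a simplified view; the deferred-reset branch is omitted and the per-path fault-type checks are reduced to a single flag.

/* Simplified view of the TSG recovery ordering after this change. */
static void example_recovery_order(struct gk20a *g, struct nvgpu_tsg *tsg,
		bool real_mmu_fault)
{
	/* 1. Post the error notifier so userspace can see why it failed. */
	if (real_mmu_fault) {
		nvgpu_tsg_set_ctx_mmu_error(g, tsg);
	}

	/* 2. Block new submits as early as possible to shrink the
	 * submit-vs-recovery race window.
	 */
	nvgpu_tsg_set_unserviceable(g, tsg);

	/* ... engine reset / preemption handling happens here ... */

	/* 3. Wake anything sleeping on the channels' wait queues. */
	nvgpu_tsg_wakeup_wqs(g, tsg);

	/* 4. Abort the TSG. */
	nvgpu_tsg_abort(g, tsg, false);
}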