Files
linux-nvgpu/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.c
Debarshi Dutta f6c96f620f gpu: nvgpu: add CONFIG_NVGPU_KERNEL_MODE_SUBMIT flag
The following functions belong to the path of kernel_mode submit and
the flag CONFIG_NVGPU_KERNEL_MODE_SUBMIT is used to compile these out
of safety builds.

channel_gk20a_alloc_priv_cmdbuf
channel_gk20a_free_prealloc_resources
channel_gk20a_joblist_add
channel_gk20a_joblist_delete
channel_gk20a_joblist_peek
channel_gk20a_prealloc_resources
nvgpu_channel
nvgpu_channel_add_job
nvgpu_channel_alloc_job
nvgpu_channel_alloc_priv_cmdbuf
nvgpu_channel_clean_up_jobs
nvgpu_channel_free_job
nvgpu_channel_free_priv_cmd_entry
nvgpu_channel_free_priv_cmd_q
nvgpu_channel_from_worker_item
nvgpu_channel_get_gpfifo_free_count
nvgpu_channel_is_prealloc_enabled
nvgpu_channel_joblist_is_empty
nvgpu_channel_joblist_lock
nvgpu_channel_joblist_unlock
nvgpu_channel_kernelmode_deinit
nvgpu_channel_poll_wdt
nvgpu_channel_set_syncpt
nvgpu_channel_setup_kernelmode
nvgpu_channel_sync_get_ref
nvgpu_channel_sync_incr
nvgpu_channel_sync_incr_user
nvgpu_channel_sync_put_ref_and_check
nvgpu_channel_sync_wait_fence_fd
nvgpu_channel_update
nvgpu_channel_update_gpfifo_get_and_get_free_count
nvgpu_channel_update_priv_cmd_q_and_free_entry
nvgpu_channel_wdt_continue
nvgpu_channel_wdt_handler
nvgpu_channel_wdt_init
nvgpu_channel_wdt_restart_all_channels
nvgpu_channel_wdt_restart_all_channels
nvgpu_channel_wdt_rewind
nvgpu_channel_wdt_start
nvgpu_channel_wdt_stop
nvgpu_channel_worker_deinit
nvgpu_channel_worker_from_worker
nvgpu_channel_worker_init
nvgpu_channel_worker_poll_init
nvgpu_channel_worker_poll_wakeup_post_process_item
nvgpu_channel_worker_poll_wakeup_process_item
nvgpu_submit_channel_gpfifo_kernel
nvgpu_submit_channel_gpfifo_user
gk20a_userd_gp_get
gk20a_userd_pb_get
gk20a_userd_gp_put
nvgpu_fence_alloc

The following members of struct nvgpu_channel are compiled out of
safety build.

struct gpfifo_desc gpfifo;
struct priv_cmd_queue priv_cmd_q;
struct nvgpu_channel_sync *sync;
struct nvgpu_list_node worker_item;
struct nvgpu_channel_wdt wdt;

The following files are compiled out of safety build.

common/fifo/submit.c
common/sync/channel_sync_semaphore.c
hal/fifo/userd_gv11b.c

Jira NVGPU-3479

Change-Id: If46c936477c6698f4bec3cab93906aaacb0ceabf
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2127212
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2019-06-30 22:04:48 -07:00

139 lines
4.5 KiB
C

/*
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/soc.h>
#include <nvgpu/ptimer.h>
#include <nvgpu/channel.h>
#include <nvgpu/rc.h>
#include <nvgpu/engines.h>
#include <hal/fifo/ctxsw_timeout_gk20a.h>
#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
/*
 * Turn engine context-switch timeout detection on or off.
 *
 * enable == true:  the fifo_eng_timeout register is rewritten from scratch
 *                  with g->ctxsw_timeout_period_ms converted to microseconds,
 *                  passed through scale_ptimer() with the scaling factor
 *                  derived from g->ptimer_src_freq, and OR-ed with
 *                  fifo_eng_timeout_detection_enabled_f().
 * enable == false: the register is read back and only the bits of
 *                  fifo_eng_timeout_detection_enabled_f() are cleared; all
 *                  other bits are written back unchanged.
 */
void gk20a_fifo_ctxsw_timeout_enable(struct gk20a *g, bool enable)
{
	u32 reg_val;

	if (!enable) {
		/* Read-modify-write: drop only the detection-enabled bits. */
		reg_val = nvgpu_readl(g, fifo_eng_timeout_r());
		reg_val &= ~(fifo_eng_timeout_detection_enabled_f());
	} else {
		/* ms -> us before scaling */
		reg_val = g->ctxsw_timeout_period_ms * 1000U;
		reg_val = scale_ptimer(reg_val,
				ptimer_scalingfactor10x(g->ptimer_src_freq));
		reg_val |= fifo_eng_timeout_detection_enabled_f();
	}

	nvgpu_writel(g, fifo_eng_timeout_r(), reg_val);
}
/*
 * Handle a context-switch timeout reported through the scheduler error
 * interrupt.
 *
 * Looks up the engine that is busy doing a context switch together with the
 * channel/TSG id it is switching, resolves that id to a TSG and, when
 * CONFIG_NVGPU_KERNEL_MODE_SUBMIT is defined, asks
 * g->ops.tsg.check_ctxsw_timeout() whether recovery is required. If so,
 * nvgpu_rc_ctxsw_timeout() is invoked for that engine and TSG.
 *
 * Without CONFIG_NVGPU_KERNEL_MODE_SUBMIT the timeout check is compiled out,
 * so recovery is never requested and the function always returns false.
 *
 * Returns true when recovery was triggered, false otherwise (no busy engine
 * found, invalid engine id, invalid channel/TSG id, no TSG resolved, or the
 * timeout check did not ask for recovery).
 */
bool gk20a_fifo_handle_ctxsw_timeout(struct gk20a *g)
{
	u32 sched_error;
	u32 engine_id;
	u32 id = U32_MAX;
	bool is_tsg = false;
	bool recover = false;
	struct nvgpu_channel *ch = NULL;
	struct nvgpu_tsg *tsg = NULL;
	struct nvgpu_fifo *f = &g->fifo;
	u32 ms = 0;
	bool debug_dump = false;

	/* read the scheduler error register */
	sched_error = nvgpu_readl(g, fifo_intr_sched_error_r());

	engine_id = nvgpu_engine_find_busy_doing_ctxsw(g, &id, &is_tsg);
	/*
	 * Could not find the engine
	 * Possible Causes:
	 * a)
	 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
	 * fifo_engine_status register. Also while the engine is held in reset
	 * h/w passes busy/idle straight through. fifo_engine_status registers
	 * are correct in that there is no context switch outstanding
	 * as the CTXSW is aborted when reset is asserted.
	 * This is just a side effect of how gv100 and earlier versions of
	 * ctxsw_timeout behave.
	 * With gv11b and later, h/w snaps the context at the point of error
	 * so that s/w can see the tsg_id which caused the HW timeout.
	 * b)
	 * If engines are not busy and ctxsw state is valid then intr occurred
	 * in the past and if the ctxsw state has moved on to VALID from LOAD
	 * or SAVE, it means that whatever timed out eventually finished
	 * anyways. The problem with this is that s/w cannot conclude which
	 * context caused the problem as maybe more switches occurred before
	 * intr is handled.
	 */
	if (engine_id == NVGPU_INVALID_ENG_ID) {
		nvgpu_info(g, "fifo ctxsw timeout: 0x%08x, failed to find engine "
				"that is busy doing ctxsw. "
				"May be ctxsw already happened", sched_error);
		return false;
	}

	if (!nvgpu_engine_check_valid_id(g, engine_id)) {
		nvgpu_err(g, "fifo ctxsw timeout: 0x%08x, engine_id %u not valid",
			sched_error, engine_id);
		return false;
	}

	/*
	 * Reject ids outside [0, num_channels). The comparison is >= rather
	 * than > because id == num_channels is already one past the last
	 * valid entry.
	 * NOTE(review): assumes channel and TSG ids are zero-based and both
	 * bounded by f->num_channels - confirm against the fifo setup code.
	 */
	if (id >= f->num_channels) {
		nvgpu_err(g, "fifo ctxsw timeout error: id is invalid %u", id);
		return false;
	}

	if (is_tsg) {
		tsg = nvgpu_tsg_check_and_get_from_id(g, id);
	} else {
		ch = nvgpu_channel_from_id(g, id);
		if (ch != NULL) {
			tsg = nvgpu_tsg_from_ch(ch);
			/* The channel is only needed to look up its TSG. */
			nvgpu_channel_put(ch);
		}
	}

#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
	if (tsg != NULL) {
		recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms);
	}
#endif

	if (recover) {
		/* id and ms are u32, so print them with %u */
		nvgpu_err(g,
			"fifo ctxsw timeout error: "
			"engine=%u, %s=%u, ms=%u",
			engine_id, is_tsg ? "tsg" : "ch", id, ms);
		nvgpu_rc_ctxsw_timeout(g, BIT32(engine_id), tsg, debug_dump);
	} else {
		nvgpu_log_info(g,
			"fifo is waiting for ctxsw switch for %u ms, "
			"%s=%u", ms, is_tsg ? "tsg" : "ch", id);
	}

	return recover;
}