diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 50a35f512..53be9c337 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -223,6 +223,8 @@ nvgpu-y += \ hal/fifo/userd_gv11b.o \ hal/fifo/fifo_intr_gk20a.o \ hal/fifo/fifo_intr_gv11b.o \ + hal/fifo/ctxsw_timeout_gk20a.o \ + hal/fifo/ctxsw_timeout_gv11b.o \ hal/falcon/falcon_gk20a.o \ hal/nvlink/minion_gv100.o \ hal/nvlink/minion_tu104.o \ @@ -408,6 +410,7 @@ nvgpu-y += \ common/power_features/pg/pg.o \ common/sim.o \ common/sim_pci.o \ + common/rc/rc.o \ common/fifo/fifo.o \ common/fifo/channel.o \ common/fifo/submit.o \ diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index 9876aade9..b6bea3ff3 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -167,6 +167,7 @@ srcs += common/sim.c \ common/power_features/cg/cg.c \ common/power_features/pg/pg.c \ common/fifo/channel.c \ + common/rc/rc.c \ common/fifo/fifo.c \ common/fifo/submit.c \ common/fifo/tsg.c \ @@ -376,6 +377,8 @@ srcs += common/sim.c \ hal/fifo/userd_gv11b.c \ hal/fifo/fifo_intr_gk20a.c \ hal/fifo/fifo_intr_gv11b.c \ + hal/fifo/ctxsw_timeout_gk20a.c \ + hal/fifo/ctxsw_timeout_gv11b.c \ hal/falcon/falcon_gk20a.c \ hal/nvlink/minion_gv100.c \ hal/nvlink/minion_tu104.c \ diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c index 56b4449fe..e8a0dc688 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg.c +++ b/drivers/gpu/nvgpu/common/fifo/tsg.c @@ -412,14 +412,14 @@ void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g, struct tsg_gk20a *tsg) } bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg, - bool *verbose, u32 *ms) + bool *debug_dump, u32 *ms) { struct channel_gk20a *ch; bool recover = false; bool progress = false; struct gk20a *g = tsg->g; - *verbose = false; + *debug_dump = false; *ms = g->ctxsw_timeout_period_ms; nvgpu_rwsem_down_read(&tsg->ch_list_lock); @@ -446,13 +446,10 @@ bool 
nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg, * which channel caused the problem, so set ctxsw timeout error * notifier for all channels. */ - nvgpu_log_info(g, "timeout on tsg=%d ch=%d", - tsg->tsgid, ch->chid); *ms = ch->ctxsw_timeout_accumulated_ms; gk20a_channel_put(ch); - nvgpu_tsg_set_error_notifier(g, tsg, - NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); - *verbose = nvgpu_tsg_ctxsw_timeout_debug_dump_state(tsg); + *debug_dump = nvgpu_tsg_ctxsw_timeout_debug_dump_state(tsg); + } else if (progress) { /* * if at least one channel in the TSG made some progress, reset diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c new file mode 100644 index 000000000..31c5d4949 --- /dev/null +++ b/drivers/gpu/nvgpu/common/rc/rc.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gk20a/fifo_gk20a.h" + +void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, + struct tsg_gk20a *tsg, bool debug_dump) +{ + nvgpu_tsg_set_error_notifier(g, tsg, + NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT); + /* + * Cancel all channels' wdt since ctxsw timeout might + * trigger multiple watchdogs at a time + */ + nvgpu_channel_wdt_restart_all_channels(g); + gk20a_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump, + RC_TYPE_CTXSW_TIMEOUT); +} diff --git a/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c index 8a2123876..1849d0682 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gp10b/vgpu_hal_gp10b.c @@ -420,7 +420,6 @@ static const struct gpu_ops vgpu_gp10b_ops = { .is_preempt_pending = NULL, .reset_enable_hw = NULL, .teardown_ch_tsg = NULL, - .handle_sched_error = NULL, .tsg_bind_channel = vgpu_tsg_bind_channel, .tsg_unbind_channel = vgpu_tsg_unbind_channel, .post_event_id = gk20a_tsg_event_id_post_event, @@ -436,6 +435,9 @@ static const struct gpu_ops vgpu_gp10b_ops = { .intr_1_enable = NULL, .intr_0_isr = NULL, .intr_1_isr = NULL, + .handle_sched_error = NULL, + .handle_ctxsw_timeout = NULL, + .ctxsw_timeout_enable = NULL, }, .engine = { .is_fault_engine_subid_gpc = gm20b_is_fault_engine_subid_gpc, diff --git a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c index 98e550b5b..bdb403ede 100644 --- a/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/common/vgpu/gv11b/vgpu_hal_gv11b.c @@ -493,7 +493,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .is_preempt_pending = gv11b_fifo_is_preempt_pending, .reset_enable_hw = NULL, .teardown_ch_tsg = NULL, - .handle_sched_error = NULL, .init_eng_method_buffers = gv11b_fifo_init_eng_method_buffers, 
.deinit_eng_method_buffers = gv11b_fifo_deinit_eng_method_buffers, @@ -508,7 +507,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .cleanup_sw = vgpu_fifo_cleanup_sw, .resetup_ramfc = NULL, .free_channel_ctx_header = vgpu_channel_free_ctx_header, - .handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout, .ring_channel_doorbell = gv11b_ring_channel_doorbell, .set_sm_exception_type_mask = vgpu_set_sm_exception_type_mask, .usermode_base = gv11b_fifo_usermode_base, @@ -517,6 +515,9 @@ static const struct gpu_ops vgpu_gv11b_ops = { .intr_1_enable = NULL, .intr_0_isr = NULL, .intr_1_isr = NULL, + .handle_sched_error = NULL, + .handle_ctxsw_timeout = NULL, + .ctxsw_timeout_enable = NULL, }, .engine = { .is_fault_engine_subid_gpc = gv11b_is_fault_engine_subid_gpc, diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index dead46c92..fdd92c0d9 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -1168,112 +1168,6 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, return active_engine_id; } -bool gk20a_fifo_handle_sched_error(struct gk20a *g) -{ - u32 sched_error; - u32 engine_id; - u32 id = U32_MAX; - bool is_tsg = false; - bool ret = false; - struct channel_gk20a *ch = NULL; - - /* read the scheduler error register */ - sched_error = gk20a_readl(g, fifo_intr_sched_error_r()); - - engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg); - /* - * Could not find the engine - * Possible Causes: - * a) - * On hitting engine reset, h/w drops the ctxsw_status to INVALID in - * fifo_engine_status register. Also while the engine is held in reset - * h/w passes busy/idle straight through. fifo_engine_status registers - * are correct in that there is no context switch outstanding - * as the CTXSW is aborted when reset is asserted. - * This is just a side effect of how gv100 and earlier versions of - * ctxsw_timeout behave. 
- * With gv11b and later, h/w snaps the context at the point of error - * so that s/w can see the tsg_id which caused the HW timeout. - * b) - * If engines are not busy and ctxsw state is valid then intr occurred - * in the past and if the ctxsw state has moved on to VALID from LOAD - * or SAVE, it means that whatever timed out eventually finished - * anyways. The problem with this is that s/w cannot conclude which - * context caused the problem as maybe more switches occurred before - * intr is handled. - */ - if (engine_id == FIFO_INVAL_ENGINE_ID) { - nvgpu_info(g, "fifo sched error: 0x%08x, failed to find engine " - "that is busy doing ctxsw. " - "May be ctxsw already happened", sched_error); - ret = false; - goto err; - } - - if (!nvgpu_engine_check_valid_id(g, engine_id)) { - nvgpu_err(g, "fifo sched error: 0x%08x, engine_id %u not valid", - sched_error, engine_id); - ret = false; - goto err; - } - - if (fifo_intr_sched_error_code_f(sched_error) == - fifo_intr_sched_error_code_ctxsw_timeout_v()) { - struct fifo_gk20a *f = &g->fifo; - u32 ms = 0; - bool verbose = false; - - if (id > f->num_channels) { - nvgpu_err(g, "fifo sched error : channel id invalid %u", - id); - ret = false; - goto err; - } - - if (is_tsg) { - ret = g->ops.tsg.check_ctxsw_timeout( - &f->tsg[id], &verbose, &ms); - } else { - ch = gk20a_channel_from_id(g, id); - if (ch != NULL) { - ret = g->ops.channel.check_ctxsw_timeout( - ch, &verbose, &ms); - - gk20a_channel_put(ch); - } else { - /* skip recovery since channel is null */ - ret = false; - } - } - - if (ret) { - nvgpu_err(g, - "fifo sched ctxsw timeout error: " - "engine=%u, %s=%d, ms=%u", - engine_id, is_tsg ? 
"tsg" : "ch", id, ms); - /* - * Cancel all channels' timeout since SCHED error might - * trigger multiple watchdogs at a time - */ - nvgpu_channel_wdt_restart_all_channels(g); - gk20a_fifo_recover(g, BIT(engine_id), id, - is_tsg, true, verbose, - RC_TYPE_CTXSW_TIMEOUT); - } else { - nvgpu_log_info(g, - "fifo is waiting for ctx switch for %d ms, " - "%s=%d", ms, is_tsg ? "tsg" : "ch", id); - } - } else { - nvgpu_err(g, - "fifo sched error : 0x%08x, engine=%u, %s=%d", - sched_error, engine_id, is_tsg ? "tsg" : "ch", id); - } - -err: - return ret; -} - static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g, struct fifo_gk20a *f, u32 pbdma_id, u32 error_notifier) diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 90eb3183c..adf8c307a 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -344,7 +344,6 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids, void gk20a_fifo_teardown_mask_intr(struct gk20a *g); void gk20a_fifo_teardown_unmask_intr(struct gk20a *g); -bool gk20a_fifo_handle_sched_error(struct gk20a *g); u32 gk20a_fifo_handle_pbdma_intr(struct gk20a *g, struct fifo_gk20a *f, u32 pbdma_id, unsigned int rc); diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c index a08ea4a98..5fb7e4d6e 100644 --- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c @@ -54,6 +54,7 @@ #include "hal/fifo/pbdma_status_gm20b.h" #include "hal/fifo/userd_gk20a.h" #include "hal/fifo/fifo_intr_gk20a.h" +#include "hal/fifo/ctxsw_timeout_gk20a.h" #include "hal/gr/zbc/zbc_gm20b.h" #include "hal/gr/zcull/zcull_gm20b.h" #include "hal/gr/init/gr_init_gm20b.h" @@ -569,7 +570,6 @@ static const struct gpu_ops gm20b_ops = { .teardown_ch_tsg = gk20a_fifo_teardown_ch_tsg, .teardown_mask_intr = gk20a_fifo_teardown_mask_intr, .teardown_unmask_intr = gk20a_fifo_teardown_unmask_intr, - .handle_sched_error = gk20a_fifo_handle_sched_error, 
.tsg_bind_channel = gk20a_tsg_bind_channel, .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, .post_event_id = gk20a_tsg_event_id_post_event, @@ -587,6 +587,9 @@ static const struct gpu_ops gm20b_ops = { .intr_1_enable = gk20a_fifo_intr_1_enable, .intr_0_isr = gk20a_fifo_intr_0_isr, .intr_1_isr = gk20a_fifo_intr_1_isr, + .handle_sched_error = gk20a_fifo_handle_sched_error, + .ctxsw_timeout_enable = gk20a_fifo_ctxsw_timeout_enable, + .handle_ctxsw_timeout = gk20a_fifo_handle_ctxsw_timeout, }, .engine = { .is_fault_engine_subid_gpc = gm20b_is_fault_engine_subid_gpc, diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index 30c90dba8..1279224b9 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -63,6 +63,7 @@ #include "hal/fifo/pbdma_status_gm20b.h" #include "hal/fifo/userd_gk20a.h" #include "hal/fifo/fifo_intr_gk20a.h" +#include "hal/fifo/ctxsw_timeout_gk20a.h" #include "hal/gr/fecs_trace/fecs_trace_gm20b.h" #include "hal/gr/fecs_trace/fecs_trace_gp10b.h" #include "hal/gr/config/gr_config_gm20b.h" @@ -647,7 +648,6 @@ static const struct gpu_ops gp10b_ops = { .teardown_ch_tsg = gk20a_fifo_teardown_ch_tsg, .teardown_mask_intr = gk20a_fifo_teardown_mask_intr, .teardown_unmask_intr = gk20a_fifo_teardown_unmask_intr, - .handle_sched_error = gk20a_fifo_handle_sched_error, .tsg_bind_channel = gk20a_tsg_bind_channel, .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, .post_event_id = gk20a_tsg_event_id_post_event, @@ -666,6 +666,9 @@ static const struct gpu_ops gp10b_ops = { .intr_1_enable = gk20a_fifo_intr_1_enable, .intr_0_isr = gk20a_fifo_intr_0_isr, .intr_1_isr = gk20a_fifo_intr_1_isr, + .handle_sched_error = gk20a_fifo_handle_sched_error, + .ctxsw_timeout_enable = gk20a_fifo_ctxsw_timeout_enable, + .handle_ctxsw_timeout = gk20a_fifo_handle_ctxsw_timeout, }, .engine = { .is_fault_engine_subid_gpc = gm20b_is_fault_engine_subid_gpc, diff --git a/drivers/gpu/nvgpu/gv100/fifo_gv100.c 
b/drivers/gpu/nvgpu/gv100/fifo_gv100.c index 85e07de36..ed5d1985f 100644 --- a/drivers/gpu/nvgpu/gv100/fifo_gv100.c +++ b/drivers/gpu/nvgpu/gv100/fifo_gv100.c @@ -38,17 +38,6 @@ u32 gv100_fifo_get_preempt_timeout(struct gk20a *g) return g->ctxsw_timeout_period_ms; } -void gv100_apply_ctxsw_timeout_intr(struct gk20a *g) -{ - u32 timeout; - - timeout = g->ch_wdt_init_limit_ms*1000U; - timeout = scale_ptimer(timeout, - ptimer_scalingfactor10x(g->ptimer_src_freq)); - timeout |= fifo_eng_timeout_detection_enabled_f(); - gk20a_writel(g, fifo_eng_timeout_r(), timeout); -} - void gv100_fifo_teardown_mask_intr(struct gk20a *g) { u32 val; diff --git a/drivers/gpu/nvgpu/gv100/fifo_gv100.h b/drivers/gpu/nvgpu/gv100/fifo_gv100.h index c65a89db2..8455fa6f5 100644 --- a/drivers/gpu/nvgpu/gv100/fifo_gv100.h +++ b/drivers/gpu/nvgpu/gv100/fifo_gv100.h @@ -29,7 +29,6 @@ struct gk20a; u32 gv100_fifo_get_preempt_timeout(struct gk20a *g); -void gv100_apply_ctxsw_timeout_intr(struct gk20a *g); void gv100_fifo_teardown_mask_intr(struct gk20a *g); void gv100_fifo_teardown_unmask_intr(struct gk20a *g); #endif diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index 299c5abd0..7e096a9d6 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -55,6 +55,7 @@ #include "hal/fifo/userd_gv11b.h" #include "hal/fifo/fifo_intr_gk20a.h" #include "hal/fifo/fifo_intr_gv11b.h" +#include "hal/fifo/ctxsw_timeout_gk20a.h" #include "hal/gr/fecs_trace/fecs_trace_gm20b.h" #include "hal/gr/config/gr_config_gm20b.h" #include "hal/gr/config/gr_config_gv100.h" @@ -823,7 +824,6 @@ static const struct gpu_ops gv100_ops = { .teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg, .teardown_mask_intr = gv100_fifo_teardown_mask_intr, .teardown_unmask_intr = gv100_fifo_teardown_unmask_intr, - .handle_sched_error = gk20a_fifo_handle_sched_error, .init_eng_method_buffers = gv11b_fifo_init_eng_method_buffers, .deinit_eng_method_buffers = 
gv11b_fifo_deinit_eng_method_buffers, @@ -849,6 +849,9 @@ static const struct gpu_ops gv100_ops = { .intr_1_enable = gk20a_fifo_intr_1_enable, .intr_0_isr = gv11b_fifo_intr_0_isr, .intr_1_isr = gk20a_fifo_intr_1_isr, + .handle_sched_error = gk20a_fifo_handle_sched_error, + .ctxsw_timeout_enable = gk20a_fifo_ctxsw_timeout_enable, + .handle_ctxsw_timeout = gk20a_fifo_handle_ctxsw_timeout, }, .engine = { .is_fault_engine_subid_gpc = gv11b_is_fault_engine_subid_gpc, diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index 4c699cb3c..cd7f93d5a 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -56,6 +56,7 @@ #include "hal/fifo/userd_gv11b.h" #include "hal/fifo/fifo_intr_gk20a.h" #include "hal/fifo/fifo_intr_gv11b.h" +#include "hal/fifo/ctxsw_timeout_gv11b.h" #include "hal/gr/fecs_trace/fecs_trace_gm20b.h" #include "hal/gr/fecs_trace/fecs_trace_gv11b.h" #include "hal/gr/config/gr_config_gm20b.h" @@ -778,7 +779,6 @@ static const struct gpu_ops gv11b_ops = { .teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg, .teardown_mask_intr = gv11b_fifo_teardown_mask_intr, .teardown_unmask_intr = gv11b_fifo_teardown_unmask_intr, - .handle_sched_error = gv11b_fifo_handle_sched_error, .init_eng_method_buffers = gv11b_fifo_init_eng_method_buffers, .deinit_eng_method_buffers = gv11b_fifo_deinit_eng_method_buffers, @@ -793,7 +793,6 @@ static const struct gpu_ops gv11b_ops = { .cleanup_sw = nvgpu_fifo_cleanup_sw, .resetup_ramfc = NULL, .free_channel_ctx_header = gv11b_free_subctx_header, - .handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout, .ring_channel_doorbell = gv11b_ring_channel_doorbell, .set_sm_exception_type_mask = gk20a_tsg_set_sm_exception_type_mask, .usermode_base = gv11b_fifo_usermode_base, @@ -805,6 +804,9 @@ static const struct gpu_ops gv11b_ops = { .intr_1_enable = gk20a_fifo_intr_1_enable, .intr_0_isr = gv11b_fifo_intr_0_isr, .intr_1_isr = gk20a_fifo_intr_1_isr, + .handle_sched_error = 
gv11b_fifo_handle_sched_error, + .ctxsw_timeout_enable = gv11b_fifo_ctxsw_timeout_enable, + .handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout, }, .engine = { .is_fault_engine_subid_gpc = gv11b_is_fault_engine_subid_gpc, diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.c new file mode 100644 index 000000000..671c6f89c --- /dev/null +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +void gk20a_fifo_ctxsw_timeout_enable(struct gk20a *g, bool enable) +{ + u32 timeout; + + if (enable) { + timeout = g->ctxsw_timeout_period_ms * 1000U; /* in us */ + timeout = scale_ptimer(timeout, + ptimer_scalingfactor10x(g->ptimer_src_freq)); + timeout |= fifo_eng_timeout_detection_enabled_f(); + nvgpu_writel(g, fifo_eng_timeout_r(), timeout); + } else { + timeout = nvgpu_readl(g, fifo_eng_timeout_r()); + timeout &= ~(fifo_eng_timeout_detection_enabled_f()); + nvgpu_writel(g, fifo_eng_timeout_r(), timeout); + } +} + +bool gk20a_fifo_handle_ctxsw_timeout(struct gk20a *g) +{ + u32 sched_error; + u32 engine_id; + u32 id = U32_MAX; + bool is_tsg = false; + bool recover = false; + struct channel_gk20a *ch = NULL; + struct tsg_gk20a *tsg = NULL; + struct fifo_gk20a *f = &g->fifo; + u32 ms = 0; + bool debug_dump = false; + + /* read the scheduler error register */ + sched_error = nvgpu_readl(g, fifo_intr_sched_error_r()); + + engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg); + /* + * Could not find the engine + * Possible Causes: + * a) + * On hitting engine reset, h/w drops the ctxsw_status to INVALID in + * fifo_engine_status register. Also while the engine is held in reset + * h/w passes busy/idle straight through. fifo_engine_status registers + * are correct in that there is no context switch outstanding + * as the CTXSW is aborted when reset is asserted. + * This is just a side effect of how gv100 and earlier versions of + * ctxsw_timeout behave. + * With gv11b and later, h/w snaps the context at the point of error + * so that s/w can see the tsg_id which caused the HW timeout. + * b) + * If engines are not busy and ctxsw state is valid then intr occurred + * in the past and if the ctxsw state has moved on to VALID from LOAD + * or SAVE, it means that whatever timed out eventually finished + * anyways. 
The problem with this is that s/w cannot conclude which + * context caused the problem as maybe more switches occurred before + * intr is handled. + */ + if (engine_id == FIFO_INVAL_ENGINE_ID) { + nvgpu_info(g, "fifo ctxsw timeout: 0x%08x, failed to find engine " + "that is busy doing ctxsw. " + "May be ctxsw already happened", sched_error); + return false; + } + + if (!nvgpu_engine_check_valid_id(g, engine_id)) { + nvgpu_err(g, "fifo ctxsw timeout: 0x%08x, engine_id %u not valid", + sched_error, engine_id); + return false; + } + + if (id > f->num_channels) { + nvgpu_err(g, "fifo ctxsw timeout error: id is invalid %u", id); + return false; + } + + if (is_tsg) { + tsg = &f->tsg[id]; + } else { + ch = gk20a_channel_from_id(g, id); + if (ch != NULL) { + tsg = tsg_gk20a_from_ch(ch); + gk20a_channel_put(ch); + } + } + + if (tsg != NULL) { + recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms); + } + + if (recover) { + nvgpu_err(g, + "fifo ctxsw timeout error: " + "engine=%u, %s=%d, ms=%u", + engine_id, is_tsg ? "tsg" : "ch", id, ms); + + nvgpu_rc_ctxsw_timeout(g, BIT(engine_id), tsg, debug_dump); + } else { + nvgpu_log_info(g, + "fifo is waiting for ctxsw switch for %d ms, " + "%s=%d", ms, is_tsg ? "tsg" : "ch", id); + } + + return recover; +} diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.h b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.h new file mode 100644 index 000000000..317abad58 --- /dev/null +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gk20a.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_FIFO_CTXSW_TIMEOUT_GK20A_H +#define NVGPU_FIFO_CTXSW_TIMEOUT_GK20A_H + +#include + +struct gk20a; + +void gk20a_fifo_ctxsw_timeout_enable(struct gk20a *g, bool enable); +bool gk20a_fifo_handle_ctxsw_timeout(struct gk20a *g); + +#endif /* NVGPU_FIFO_CTXSW_TIMEOUT_GK20A_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b.c b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b.c new file mode 100644 index 000000000..5e632968d --- /dev/null +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +static const char * const invalid_str = "invalid"; + +static const char *const ctxsw_timeout_status_desc[] = { + "awaiting ack", + "eng was reset", + "ack received", + "dropped timeout" +}; + +void gv11b_fifo_ctxsw_timeout_enable(struct gk20a *g, bool enable) +{ + u32 timeout; + + if (enable) { + /* clear ctxsw timeout interrupts */ + nvgpu_writel(g, fifo_intr_ctxsw_timeout_r(), ~U32(0U)); + + if (nvgpu_platform_is_silicon(g)) { + timeout = g->ctxsw_timeout_period_ms * 1000U; + timeout = scale_ptimer(timeout, + ptimer_scalingfactor10x(g->ptimer_src_freq)); + timeout |= fifo_eng_ctxsw_timeout_detection_enabled_f(); + nvgpu_writel(g, fifo_eng_ctxsw_timeout_r(), timeout); + } else { + timeout = nvgpu_readl(g, fifo_eng_ctxsw_timeout_r()); + nvgpu_log_info(g, + "fifo_eng_ctxsw_timeout reg val = 0x%08x", + timeout); + timeout = set_field(timeout, + fifo_eng_ctxsw_timeout_period_m(), + fifo_eng_ctxsw_timeout_period_max_f()); + timeout = set_field(timeout, + fifo_eng_ctxsw_timeout_detection_m(), + fifo_eng_ctxsw_timeout_detection_disabled_f()); + nvgpu_log_info(g, + "new fifo_eng_ctxsw_timeout reg val = 0x%08x", + timeout); + nvgpu_writel(g, fifo_eng_ctxsw_timeout_r(), timeout); + } + + } else { + timeout = nvgpu_readl(g, fifo_eng_ctxsw_timeout_r()); + timeout = set_field(timeout, + fifo_eng_ctxsw_timeout_detection_m(), + fifo_eng_ctxsw_timeout_detection_disabled_f()); + nvgpu_writel(g, fifo_eng_ctxsw_timeout_r(), timeout); + timeout = nvgpu_readl(g, fifo_eng_ctxsw_timeout_r()); + nvgpu_info(g, "fifo_eng_ctxsw_timeout disabled val = 0x%08x", + timeout); + /* clear ctxsw timeout interrupts */ + nvgpu_writel(g, fifo_intr_ctxsw_timeout_r(), ~U32(0U)); + } +} + +static u32 gv11b_fifo_ctxsw_timeout_info(struct gk20a *g, u32 active_eng_id, + u32 *info_status) +{ + u32 tsgid = FIFO_INVAL_TSG_ID; + u32 timeout_info; + u32 ctx_status; + + timeout_info = nvgpu_readl(g, + 
fifo_intr_ctxsw_timeout_info_r(active_eng_id)); + + /* + * ctxsw_state and tsgid are snapped at the point of the timeout and + * will not change while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit + * is PENDING. + */ + ctx_status = fifo_intr_ctxsw_timeout_info_ctxsw_state_v(timeout_info); + if (ctx_status == + fifo_intr_ctxsw_timeout_info_ctxsw_state_load_v()) { + + tsgid = fifo_intr_ctxsw_timeout_info_next_tsgid_v(timeout_info); + + } else if (ctx_status == + fifo_intr_ctxsw_timeout_info_ctxsw_state_switch_v() || + ctx_status == + fifo_intr_ctxsw_timeout_info_ctxsw_state_save_v()) { + + tsgid = fifo_intr_ctxsw_timeout_info_prev_tsgid_v(timeout_info); + } + nvgpu_log_info(g, "ctxsw timeout info: tsgid = %d", tsgid); + + /* + * STATUS indicates whether the context request ack was eventually + * received and whether a subsequent request timed out. This field is + * updated live while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit + * is PENDING. STATUS starts in AWAITING_ACK, and progresses to + * ACK_RECEIVED and finally ends with DROPPED_TIMEOUT. + * + * AWAITING_ACK - context request ack still not returned from engine. + * ENG_WAS_RESET - The engine was reset via a PRI write to NV_PMC_ENABLE + * or NV_PMC_ELPG_ENABLE prior to receiving the ack. Host will not + * expect ctx ack to return, but if it is already in flight, STATUS will + * transition shortly to ACK_RECEIVED unless the interrupt is cleared + * first. Once the engine is reset, additional context switches can + * occur; if one times out, STATUS will transition to DROPPED_TIMEOUT + * if the interrupt isn't cleared first. + * ACK_RECEIVED - The ack for the timed-out context request was + * received between the point of the timeout and this register being + * read. Note this STATUS can be reported during the load stage of the + * same context switch that timed out if the timeout occurred during the + * save half of a context switch. 
Additional context requests may have + * completed or may be outstanding, but no further context timeout has + * occurred. This simplifies checking for spurious context switch + * timeouts. + * DROPPED_TIMEOUT - The originally timed-out context request acked, + * but a subsequent context request then timed out. + * Information about the subsequent timeout is not stored; in fact, that + * context request may also have already been acked by the time SW + * SW reads this register. If not, there is a chance SW can get the + * dropped information by clearing the corresponding + * INTR_CTXSW_TIMEOUT_ENGINE bit and waiting for the timeout to occur + * again. Note, however, that if the engine does time out again, + * it may not be from the original request that caused the + * DROPPED_TIMEOUT state, as that request may + * be acked in the interim. + */ + *info_status = fifo_intr_ctxsw_timeout_info_status_v(timeout_info); + if (*info_status == + fifo_intr_ctxsw_timeout_info_status_ack_received_v()) { + + nvgpu_log_info(g, "ctxsw timeout info : ack received"); + /* no need to recover */ + tsgid = FIFO_INVAL_TSG_ID; + + } else if (*info_status == + fifo_intr_ctxsw_timeout_info_status_dropped_timeout_v()) { + + nvgpu_log_info(g, "ctxsw timeout info : dropped timeout"); + /* no need to recover */ + tsgid = FIFO_INVAL_TSG_ID; + + } + return tsgid; +} + +bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g) +{ + bool recover = false; + u32 tsgid = FIFO_INVAL_TSG_ID; + u32 engine_id, active_eng_id; + u32 timeout_val, ctxsw_timeout_engines; + u32 info_status; + const char *info_status_str; + struct tsg_gk20a *tsg = NULL; + + + /* get ctxsw timedout engines */ + ctxsw_timeout_engines = nvgpu_readl(g, fifo_intr_ctxsw_timeout_r()); + if (ctxsw_timeout_engines == 0U) { + nvgpu_err(g, "no eng ctxsw timeout pending"); + return false; + } + + timeout_val = nvgpu_readl(g, fifo_eng_ctxsw_timeout_r()); + timeout_val = fifo_eng_ctxsw_timeout_period_v(timeout_val); + + nvgpu_log_info(g, "eng 
ctxsw timeout period = 0x%x", timeout_val); + + for (engine_id = 0; engine_id < g->fifo.num_engines; engine_id++) { + active_eng_id = g->fifo.active_engines_list[engine_id]; + + if ((ctxsw_timeout_engines & + fifo_intr_ctxsw_timeout_engine_pending_f( + active_eng_id)) != 0U) { + + struct fifo_gk20a *f = &g->fifo; + u32 ms = 0; + bool debug_dump = false; + + tsgid = gv11b_fifo_ctxsw_timeout_info(g, active_eng_id, + &info_status); + + if (tsgid == FIFO_INVAL_TSG_ID) { + continue; + } + + tsg = &f->tsg[tsgid]; + recover = g->ops.tsg.check_ctxsw_timeout(tsg, + &debug_dump, &ms); + if (recover) { + info_status_str = invalid_str; + if (info_status < + ARRAY_SIZE(ctxsw_timeout_status_desc)) { + info_status_str = + ctxsw_timeout_status_desc[info_status]; + } + + nvgpu_err(g, "ctxsw timeout error: " + "active engine id =%u, %s=%d, info: %s ms=%u", + active_eng_id, "tsg", tsgid, info_status_str, + ms); + + nvgpu_rc_ctxsw_timeout(g, BIT32(active_eng_id), + tsg, debug_dump); + } else { + nvgpu_log_info(g, + "fifo is waiting for ctxsw switch: " + "for %d ms, %s=%d", ms, "tsg", tsgid); + } + } + } + /* clear interrupt */ + nvgpu_writel(g, fifo_intr_ctxsw_timeout_r(), ctxsw_timeout_engines); + + return recover; +} diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b.h b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b.h new file mode 100644 index 000000000..47fbd4e49 --- /dev/null +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef NVGPU_CTXSW_TIMEOUT_GV11B_H +#define NVGPU_CTXSW_TIMEOUT_GV11B_H + +#include <nvgpu/types.h> + +struct gk20a; + +void gv11b_fifo_ctxsw_timeout_enable(struct gk20a *g, bool enable); +bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g); + +#endif /* NVGPU_CTXSW_TIMEOUT_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c index fb348363f..52c83e27e 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c @@ -67,24 +67,18 @@ static u32 gk20a_fifo_intr_0_en_mask(struct gk20a *g) void gk20a_fifo_intr_0_enable(struct gk20a *g, bool enable) { unsigned int i; - u32 intr_stall, timeout, mask; + u32 intr_stall, mask; u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA); if (!enable) { + g->ops.fifo.ctxsw_timeout_enable(g, false); nvgpu_writel(g, fifo_intr_en_0_r(), 0U); return; } - if (g->ops.fifo.apply_ctxsw_timeout_intr != NULL) { - g->ops.fifo.apply_ctxsw_timeout_intr(g); - } else { - /* timeout is in us. 
Enable ctxsw timeout */ - timeout = g->ctxsw_timeout_period_ms * 1000U; - timeout = scale_ptimer(timeout, - ptimer_scalingfactor10x(g->ptimer_src_freq)); - timeout |= fifo_eng_timeout_detection_enabled_f(); - nvgpu_writel(g, fifo_eng_timeout_r(), timeout); - } + /* Enable interrupts */ + + g->ops.fifo.ctxsw_timeout_enable(g, true); /* clear and enable pbdma interrupt */ for (i = 0; i < host_num_pbdma; i++) { @@ -146,6 +140,30 @@ u32 gk20a_fifo_intr_1_isr(struct gk20a *g) return GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE; } +bool gk20a_fifo_handle_sched_error(struct gk20a *g) +{ + u32 sched_error; + u32 engine_id; + u32 id = U32_MAX; + bool is_tsg = false; + bool ret = false; + + /* read the scheduler error register */ + sched_error = nvgpu_readl(g, fifo_intr_sched_error_r()); + + engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg); + + if (fifo_intr_sched_error_code_f(sched_error) != + fifo_intr_sched_error_code_ctxsw_timeout_v()) { + nvgpu_err(g, + "fifo sched error : 0x%08x, engine=%u, %s=%d", + sched_error, engine_id, is_tsg ? 
"tsg" : "ch", id); + } else { + ret = g->ops.fifo.handle_ctxsw_timeout(g); + } + return ret; +} + void gk20a_fifo_intr_handle_chsw_error(struct gk20a *g) { u32 intr; diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.h b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.h index 3adb94e4b..73e466e0c 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.h +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.h @@ -34,5 +34,6 @@ u32 gk20a_fifo_intr_1_isr(struct gk20a *g); void gk20a_fifo_intr_handle_chsw_error(struct gk20a *g); void gk20a_fifo_intr_handle_runlist_event(struct gk20a *g); +bool gk20a_fifo_handle_sched_error(struct gk20a *g); #endif /* NVGPU_FIFO_INTR_GK20A_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.c index 99ff5c582..6e0447926 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.c @@ -36,98 +36,6 @@ #include #include /* TODO: remove */ -static u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g) -{ - u32 intr_0_error_mask = - fifo_intr_0_bind_error_pending_f() | - fifo_intr_0_sched_error_pending_f() | - fifo_intr_0_chsw_error_pending_f() | - fifo_intr_0_memop_timeout_pending_f() | - fifo_intr_0_lb_error_pending_f(); - - return intr_0_error_mask; -} - -static u32 gv11b_fifo_intr_0_en_mask(struct gk20a *g) -{ - u32 intr_0_en_mask; - - intr_0_en_mask = gv11b_fifo_intr_0_error_mask(g); - - intr_0_en_mask |= fifo_intr_0_pbdma_intr_pending_f() | - fifo_intr_0_ctxsw_timeout_pending_f(); - - return intr_0_en_mask; -} - -void gv11b_fifo_intr_0_enable(struct gk20a *g, bool enable) -{ - unsigned int i; - u32 intr_stall, timeout, mask; - u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA); - - if (!enable) { - nvgpu_writel(g, fifo_intr_en_0_r(), 0); - return; - } - /* clear and enable pbdma interrupt */ - for (i = 0; i < host_num_pbdma; i++) { - nvgpu_writel(g, pbdma_intr_0_r(i), U32_MAX); - nvgpu_writel(g, pbdma_intr_1_r(i), U32_MAX); - 
- intr_stall = nvgpu_readl(g, pbdma_intr_stall_r(i)); - nvgpu_log_info(g, "pbdma id:%u, intr_en_0 0x%08x", i, - intr_stall); - nvgpu_writel(g, pbdma_intr_en_0_r(i), intr_stall); - - intr_stall = nvgpu_readl(g, pbdma_intr_stall_1_r(i)); - /* - * For bug 2082123 - * Mask the unused HCE_RE_ILLEGAL_OP bit from the interrupt. - */ - intr_stall &= ~pbdma_intr_stall_1_hce_illegal_op_enabled_f(); - nvgpu_log_info(g, "pbdma id:%u, intr_en_1 0x%08x", i, - intr_stall); - nvgpu_writel(g, pbdma_intr_en_1_r(i), intr_stall); - } - - /* clear ctxsw timeout interrupts */ - nvgpu_writel(g, fifo_intr_ctxsw_timeout_r(), ~U32(0U)); - - if (nvgpu_platform_is_silicon(g)) { - /* timeout is in us. Enable ctxsw timeout */ - timeout = g->ctxsw_timeout_period_ms * 1000U; - timeout = scale_ptimer(timeout, - ptimer_scalingfactor10x(g->ptimer_src_freq)); - timeout |= fifo_eng_ctxsw_timeout_detection_enabled_f(); - nvgpu_writel(g, fifo_eng_ctxsw_timeout_r(), timeout); - } else { - timeout = nvgpu_readl(g, fifo_eng_ctxsw_timeout_r()); - nvgpu_log_info(g, - "fifo_eng_ctxsw_timeout reg val = 0x%08x", - timeout); - timeout = set_field(timeout, - fifo_eng_ctxsw_timeout_period_m(), - fifo_eng_ctxsw_timeout_period_max_f()); - timeout = set_field(timeout, - fifo_eng_ctxsw_timeout_detection_m(), - fifo_eng_ctxsw_timeout_detection_disabled_f()); - nvgpu_log_info(g, - "new fifo_eng_ctxsw_timeout reg val = 0x%08x", - timeout); - nvgpu_writel(g, fifo_eng_ctxsw_timeout_r(), timeout); - } - - /* clear runlist interrupts */ - nvgpu_writel(g, fifo_intr_runlist_r(), ~U32(0U)); - - /* clear and enable pfifo interrupt */ - nvgpu_writel(g, fifo_intr_0_r(), U32_MAX); - mask = gv11b_fifo_intr_0_en_mask(g); - nvgpu_log_info(g, "fifo_intr_en_0 0x%08x", mask); - nvgpu_writel(g, fifo_intr_en_0_r(), mask); -} - static const char *const gv11b_sched_error_str[] = { "xxx-0", "xxx-1", @@ -164,6 +72,77 @@ static const char *const gv11b_sched_error_str[] = { "bad_tsg", }; +static u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g) 
+{ + u32 intr_0_error_mask = + fifo_intr_0_bind_error_pending_f() | + fifo_intr_0_sched_error_pending_f() | + fifo_intr_0_chsw_error_pending_f() | + fifo_intr_0_memop_timeout_pending_f() | + fifo_intr_0_lb_error_pending_f(); + + return intr_0_error_mask; +} + +static u32 gv11b_fifo_intr_0_en_mask(struct gk20a *g) +{ + u32 intr_0_en_mask; + + intr_0_en_mask = gv11b_fifo_intr_0_error_mask(g); + + intr_0_en_mask |= fifo_intr_0_pbdma_intr_pending_f() | + fifo_intr_0_ctxsw_timeout_pending_f(); + + return intr_0_en_mask; +} + +void gv11b_fifo_intr_0_enable(struct gk20a *g, bool enable) +{ + unsigned int i; + u32 intr_stall, mask; + u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA); + + if (!enable) { + g->ops.fifo.ctxsw_timeout_enable(g, false); + nvgpu_writel(g, fifo_intr_en_0_r(), 0); + return; + } + + /* Enable interrupts */ + + g->ops.fifo.ctxsw_timeout_enable(g, true); + + /* clear and enable pbdma interrupt */ + for (i = 0; i < host_num_pbdma; i++) { + nvgpu_writel(g, pbdma_intr_0_r(i), U32_MAX); + nvgpu_writel(g, pbdma_intr_1_r(i), U32_MAX); + + intr_stall = nvgpu_readl(g, pbdma_intr_stall_r(i)); + nvgpu_log_info(g, "pbdma id:%u, intr_en_0 0x%08x", i, + intr_stall); + nvgpu_writel(g, pbdma_intr_en_0_r(i), intr_stall); + + intr_stall = nvgpu_readl(g, pbdma_intr_stall_1_r(i)); + /* + * For bug 2082123 + * Mask the unused HCE_RE_ILLEGAL_OP bit from the interrupt. 
+ */ + intr_stall &= ~pbdma_intr_stall_1_hce_illegal_op_enabled_f(); + nvgpu_log_info(g, "pbdma id:%u, intr_en_1 0x%08x", i, + intr_stall); + nvgpu_writel(g, pbdma_intr_en_1_r(i), intr_stall); + } + + /* clear runlist interrupts */ + nvgpu_writel(g, fifo_intr_runlist_r(), ~U32(0U)); + + /* clear and enable pfifo interrupt */ + nvgpu_writel(g, fifo_intr_0_r(), U32_MAX); + mask = gv11b_fifo_intr_0_en_mask(g); + nvgpu_log_info(g, "fifo_intr_en_0 0x%08x", mask); + nvgpu_writel(g, fifo_intr_en_0_r(), mask); +} + bool gv11b_fifo_handle_sched_error(struct gk20a *g) { u32 sched_error; @@ -189,176 +168,6 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g) return false; } -static const char * const invalid_str = "invalid"; - -static const char *const ctxsw_timeout_status_desc[] = { - "awaiting ack", - "eng was reset", - "ack received", - "dropped timeout" -}; - -static u32 gv11b_fifo_ctxsw_timeout_info(struct gk20a *g, u32 active_eng_id, - u32 *info_status) -{ - u32 tsgid = FIFO_INVAL_TSG_ID; - u32 timeout_info; - u32 ctx_status; - - timeout_info = nvgpu_readl(g, - fifo_intr_ctxsw_timeout_info_r(active_eng_id)); - - /* - * ctxsw_state and tsgid are snapped at the point of the timeout and - * will not change while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit - * is PENDING. - */ - ctx_status = fifo_intr_ctxsw_timeout_info_ctxsw_state_v(timeout_info); - if (ctx_status == - fifo_intr_ctxsw_timeout_info_ctxsw_state_load_v()) { - - tsgid = fifo_intr_ctxsw_timeout_info_next_tsgid_v(timeout_info); - - } else if (ctx_status == - fifo_intr_ctxsw_timeout_info_ctxsw_state_switch_v() || - ctx_status == - fifo_intr_ctxsw_timeout_info_ctxsw_state_save_v()) { - - tsgid = fifo_intr_ctxsw_timeout_info_prev_tsgid_v(timeout_info); - } - nvgpu_log_info(g, "ctxsw timeout info: tsgid = %d", tsgid); - - /* - * STATUS indicates whether the context request ack was eventually - * received and whether a subsequent request timed out. 
This field is - * updated live while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit - * is PENDING. STATUS starts in AWAITING_ACK, and progresses to - * ACK_RECEIVED and finally ends with DROPPED_TIMEOUT. - * - * AWAITING_ACK - context request ack still not returned from engine. - * ENG_WAS_RESET - The engine was reset via a PRI write to NV_PMC_ENABLE - * or NV_PMC_ELPG_ENABLE prior to receiving the ack. Host will not - * expect ctx ack to return, but if it is already in flight, STATUS will - * transition shortly to ACK_RECEIVED unless the interrupt is cleared - * first. Once the engine is reset, additional context switches can - * occur; if one times out, STATUS will transition to DROPPED_TIMEOUT - * if the interrupt isn't cleared first. - * ACK_RECEIVED - The ack for the timed-out context request was - * received between the point of the timeout and this register being - * read. Note this STATUS can be reported during the load stage of the - * same context switch that timed out if the timeout occurred during the - * save half of a context switch. Additional context requests may have - * completed or may be outstanding, but no further context timeout has - * occurred. This simplifies checking for spurious context switch - * timeouts. - * DROPPED_TIMEOUT - The originally timed-out context request acked, - * but a subsequent context request then timed out. - * Information about the subsequent timeout is not stored; in fact, that - * context request may also have already been acked by the time SW - * SW reads this register. If not, there is a chance SW can get the - * dropped information by clearing the corresponding - * INTR_CTXSW_TIMEOUT_ENGINE bit and waiting for the timeout to occur - * again. Note, however, that if the engine does time out again, - * it may not be from the original request that caused the - * DROPPED_TIMEOUT state, as that request may - * be acked in the interim. 
- */ - *info_status = fifo_intr_ctxsw_timeout_info_status_v(timeout_info); - if (*info_status == - fifo_intr_ctxsw_timeout_info_status_ack_received_v()) { - - nvgpu_log_info(g, "ctxsw timeout info : ack received"); - /* no need to recover */ - tsgid = FIFO_INVAL_TSG_ID; - - } else if (*info_status == - fifo_intr_ctxsw_timeout_info_status_dropped_timeout_v()) { - - nvgpu_log_info(g, "ctxsw timeout info : dropped timeout"); - /* no need to recover */ - tsgid = FIFO_INVAL_TSG_ID; - - } - return tsgid; -} - -bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g, u32 fifo_intr) -{ - bool ret = false; - u32 tsgid = FIFO_INVAL_TSG_ID; - u32 engine_id, active_eng_id; - u32 timeout_val, ctxsw_timeout_engines; - u32 info_status; - const char *info_status_str; - - - if ((fifo_intr & fifo_intr_0_ctxsw_timeout_pending_f()) == 0U) { - return ret; - } - - /* get ctxsw timedout engines */ - ctxsw_timeout_engines = nvgpu_readl(g, fifo_intr_ctxsw_timeout_r()); - if (ctxsw_timeout_engines == 0U) { - nvgpu_err(g, "no eng ctxsw timeout pending"); - return ret; - } - - timeout_val = nvgpu_readl(g, fifo_eng_ctxsw_timeout_r()); - timeout_val = fifo_eng_ctxsw_timeout_period_v(timeout_val); - - nvgpu_log_info(g, "eng ctxsw timeout period = 0x%x", timeout_val); - - for (engine_id = 0; engine_id < g->fifo.num_engines; engine_id++) { - active_eng_id = g->fifo.active_engines_list[engine_id]; - - if ((ctxsw_timeout_engines & - fifo_intr_ctxsw_timeout_engine_pending_f( - active_eng_id)) != 0U) { - - struct fifo_gk20a *f = &g->fifo; - u32 ms = 0; - bool verbose = false; - - tsgid = gv11b_fifo_ctxsw_timeout_info(g, active_eng_id, - &info_status); - - if (tsgid == FIFO_INVAL_TSG_ID) { - continue; - } - - if (g->ops.tsg.check_ctxsw_timeout( - &f->tsg[tsgid], &verbose, &ms)) { - ret = true; - - info_status_str = invalid_str; - if (info_status < - ARRAY_SIZE(ctxsw_timeout_status_desc)) { - info_status_str = - ctxsw_timeout_status_desc[info_status]; - } - - nvgpu_err(g, "ctxsw timeout error: " - "active 
engine id =%u, %s=%d, info: %s ms=%u", - active_eng_id, "tsg", tsgid, info_status_str, - ms); - - /* Cancel all channels' timeout */ - nvgpu_channel_wdt_restart_all_channels(g); - gk20a_fifo_recover(g, BIT32(active_eng_id), - tsgid, true, true, verbose, - RC_TYPE_CTXSW_TIMEOUT); - } else { - nvgpu_log_info(g, - "fifo is waiting for ctx switch: " - "for %d ms, %s=%d", ms, "tsg", tsgid); - } - } - } - /* clear interrupt */ - nvgpu_writel(g, fifo_intr_ctxsw_timeout_r(), ctxsw_timeout_engines); - return ret; -} - static u32 gv11b_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr) { u32 handled = 0U; @@ -425,16 +234,13 @@ void gv11b_fifo_intr_0_isr(struct gk20a *g) } if ((fifo_intr & fifo_intr_0_sched_error_pending_f()) != 0U) { - (void) g->ops.fifo.handle_sched_error(g); + (void)g->ops.fifo.handle_sched_error(g); clear_intr |= fifo_intr_0_sched_error_pending_f(); } if ((fifo_intr & fifo_intr_0_ctxsw_timeout_pending_f()) != 0U) { - if (g->ops.fifo.handle_ctxsw_timeout != NULL) { - g->ops.fifo.handle_ctxsw_timeout(g, fifo_intr); - } else { - nvgpu_err(g, "unhandled fifo ctxsw timeout intr"); - } + (void)g->ops.fifo.handle_ctxsw_timeout(g); + clear_intr |= fifo_intr_0_ctxsw_timeout_pending_f(); } nvgpu_mutex_release(&g->fifo.intr.isr.mutex); diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.h b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.h index cf154bd61..1b7000159 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b.h @@ -41,6 +41,5 @@ void gv11b_fifo_intr_0_enable(struct gk20a *g, bool enable); void gv11b_fifo_intr_0_isr(struct gk20a *g); bool gv11b_fifo_handle_sched_error(struct gk20a *g); -bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g, u32 fifo_intr); #endif /* NVGPU_FIFO_INTR_GV11B_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index d2e0daf7e..f20af01ff 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ 
b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -887,7 +887,6 @@ struct gpu_ops { struct mmu_fault_info *mmfault); void (*get_mmu_fault_gpc_desc)(struct mmu_fault_info *mmfault); void (*apply_pb_timeout)(struct gk20a *g); - void (*apply_ctxsw_timeout_intr)(struct gk20a *g); int (*tsg_set_timeslice)(struct tsg_gk20a *tsg, u32 timeslice); u32 (*default_timeslice_us)(struct gk20a *g); int (*force_reset_ch)(struct channel_gk20a *ch, @@ -916,8 +915,6 @@ struct gpu_ops { struct mmu_fault_info *mmfault); void (*teardown_mask_intr)(struct gk20a *g); void (*teardown_unmask_intr)(struct gk20a *g); - bool (*handle_sched_error)(struct gk20a *g); - bool (*handle_ctxsw_timeout)(struct gk20a *g, u32 fifo_intr); void (*init_eng_method_buffers)(struct gk20a *g, struct tsg_gk20a *tsg); void (*deinit_eng_method_buffers)(struct gk20a *g, @@ -949,6 +946,9 @@ struct gpu_ops { void (*intr_0_isr)(struct gk20a *g); void (*intr_1_enable)(struct gk20a *g, bool enable); u32 (*intr_1_isr)(struct gk20a *g); + bool (*handle_sched_error)(struct gk20a *g); + void (*ctxsw_timeout_enable)(struct gk20a *g, bool enable); + bool (*handle_ctxsw_timeout)(struct gk20a *g); } fifo; struct { diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h new file mode 100644 index 000000000..6df53eb08 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef NVGPU_RC_H +#define NVGPU_RC_H + +#include <nvgpu/types.h> + +struct gk20a; +struct tsg_gk20a; + +void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, + struct tsg_gk20a *tsg, bool debug_dump); + +#endif /* NVGPU_RC_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/tsg.h index 3352ceff4..21491db57 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h +++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h @@ -107,7 +107,7 @@ bool nvgpu_tsg_mark_error(struct gk20a *g, struct tsg_gk20a *tsg); void gk20a_tsg_event_id_post_event(struct tsg_gk20a *tsg, int event_id); bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg, - bool *verbose, u32 *ms); + bool *debug_dump, u32 *ms); int gk20a_tsg_set_runlist_interleave(struct tsg_gk20a *tsg, u32 level); int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice); u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg); diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c index f925fbf4f..2e5034ccd 100644 --- a/drivers/gpu/nvgpu/tu104/hal_tu104.c +++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c @@ -58,6 +58,7 @@ #include "hal/fifo/userd_gv11b.h" #include "hal/fifo/fifo_intr_gk20a.h" #include "hal/fifo/fifo_intr_gv11b.h" +#include "hal/fifo/ctxsw_timeout_gv11b.h" #include "hal/gr/fecs_trace/fecs_trace_gm20b.h" #include "hal/gr/fecs_trace/fecs_trace_gv11b.h" #include "hal/gr/config/gr_config_gm20b.h" @@ -858,7 +859,6 @@ static const struct gpu_ops tu104_ops = { .teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg, .teardown_mask_intr = gv11b_fifo_teardown_mask_intr, .teardown_unmask_intr = gv11b_fifo_teardown_unmask_intr, - .handle_sched_error = gv11b_fifo_handle_sched_error, .init_eng_method_buffers = gv11b_fifo_init_eng_method_buffers, .deinit_eng_method_buffers = gv11b_fifo_deinit_eng_method_buffers, @@ -873,7 +873,6 @@ static const struct gpu_ops tu104_ops = { .cleanup_sw = nvgpu_fifo_cleanup_sw, .resetup_ramfc = NULL, .free_channel_ctx_header = 
gv11b_free_subctx_header, - .handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout, .ring_channel_doorbell = tu104_ring_channel_doorbell, .usermode_base = tu104_fifo_usermode_base, .doorbell_token = tu104_fifo_doorbell_token, @@ -887,6 +886,9 @@ static const struct gpu_ops tu104_ops = { .intr_1_enable = gk20a_fifo_intr_1_enable, .intr_0_isr = gv11b_fifo_intr_0_isr, .intr_1_isr = gk20a_fifo_intr_1_isr, + .handle_sched_error = gv11b_fifo_handle_sched_error, + .ctxsw_timeout_enable = gv11b_fifo_ctxsw_timeout_enable, + .handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout, }, .engine = { .is_fault_engine_subid_gpc = gv11b_is_fault_engine_subid_gpc,