diff --git a/drivers/gpu/nvgpu/common/fifo/preempt.c b/drivers/gpu/nvgpu/common/fifo/preempt.c index 17d40ffaf..72ff3f933 100644 --- a/drivers/gpu/nvgpu/common/fifo/preempt.c +++ b/drivers/gpu/nvgpu/common/fifo/preempt.c @@ -42,6 +42,9 @@ u32 nvgpu_preempt_get_timeout(struct gk20a *g) int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg) { int ret = 0; + u32 preempt_retry_count = 10U; + u32 preempt_retry_timeout = + nvgpu_preempt_get_timeout(g) / preempt_retry_count; #ifdef CONFIG_NVGPU_LS_PMU u32 token = PMU_INVALID_MUTEX_OWNER_ID; int mutex_ret = 0; @@ -53,40 +56,57 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg) return 0; } - nvgpu_mutex_acquire(&tsg->runlist->runlist_lock); + do { + nvgpu_mutex_acquire(&tsg->runlist->runlist_lock); - if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { - nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), - RUNLIST_DISABLED); - } - -#ifdef CONFIG_NVGPU_LS_PMU - mutex_ret = nvgpu_pmu_lock_acquire(g, g->pmu, - PMU_MUTEX_ID_FIFO, &token); -#endif - nvgpu_log_fn(g, "preempt id: %d", tsg->tsgid); - - g->ops.fifo.preempt_trigger(g, tsg->tsgid, ID_TYPE_TSG); - - /* poll for preempt done */ - ret = g->ops.fifo.is_preempt_pending(g, tsg->tsgid, ID_TYPE_TSG); - -#ifdef CONFIG_NVGPU_LS_PMU - if (mutex_ret == 0) { - int err = nvgpu_pmu_lock_release(g, g->pmu, PMU_MUTEX_ID_FIFO, - &token); - if (err != 0) { - nvgpu_err(g, "PMU_MUTEX_ID_FIFO not released err=%d", - err); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { + nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), + RUNLIST_DISABLED); } - } -#endif - if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { - nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), - RUNLIST_ENABLED); - } - nvgpu_mutex_release(&tsg->runlist->runlist_lock); +#ifdef CONFIG_NVGPU_LS_PMU + mutex_ret = nvgpu_pmu_lock_acquire(g, g->pmu, + PMU_MUTEX_ID_FIFO, &token); +#endif + g->ops.fifo.preempt_trigger(g, tsg->tsgid, ID_TYPE_TSG); + + /* + * Poll for preempt done. 
If stalling interrupts are pending + * while preempt is in progress, we poll for stalling interrupts + * to finish based on the -EAGAIN return value from this function and + * retry preempt again. + * If HW is hung, on the last retry instance we try to identify + * the engines hung and set the runlist reset_eng_bitmask + * and mark preemption completion. + */ + ret = g->ops.fifo.is_preempt_pending(g, tsg->tsgid, + ID_TYPE_TSG, preempt_retry_count > 1U); + +#ifdef CONFIG_NVGPU_LS_PMU + if (mutex_ret == 0) { + int err = nvgpu_pmu_lock_release(g, g->pmu, + PMU_MUTEX_ID_FIFO, &token); + if (err != 0) { + nvgpu_err(g, "PMU_MUTEX_ID_FIFO not released err=%d", err); + } + } +#endif + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { + nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), + RUNLIST_ENABLED); + } + + nvgpu_mutex_release(&tsg->runlist->runlist_lock); + + if (ret != -EAGAIN) { + break; + } + + ret = nvgpu_wait_for_stall_interrupts(g, preempt_retry_timeout); + if (ret != 0) { + nvgpu_log_info(g, "wait for stall interrupts failed %d", ret); + } + } while (--preempt_retry_count != 0U); if (ret != 0) { if (nvgpu_platform_is_silicon(g)) { diff --git a/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.c index cd216ee4a..bf68b975f 100644 --- a/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.c +++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -62,11 +62,11 @@ static int gk20a_fifo_preempt_locked(struct gk20a *g, u32 id, g->ops.fifo.preempt_trigger(g, id, id_type); /* wait for preempt */ - return g->ops.fifo.is_preempt_pending(g, id, id_type); + return g->ops.fifo.is_preempt_pending(g, id, id_type, false); } int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id, - unsigned int id_type) + unsigned int id_type, bool preempt_retries_left) { struct nvgpu_timeout timeout; u32 delay = POLL_DELAY_MIN_US; diff --git a/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.h b/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.h index f6472bfa8..4ff4de918 100644 --- a/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.h +++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gk20a.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -32,6 +32,6 @@ void gk20a_fifo_preempt_trigger(struct gk20a *g, u32 id, unsigned int id_type); int gk20a_fifo_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch); int gk20a_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg); int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id, - unsigned int id_type); + unsigned int id_type, bool preempt_retries_left); #endif /* FIFO_PREEMPT_GK20A_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b.h b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b.h index 0d8a17130..3ce660151 100644 --- a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -33,7 +33,7 @@ struct nvgpu_tsg; void gv11b_fifo_preempt_trigger(struct gk20a *g, u32 id, unsigned int id_type); int gv11b_fifo_preempt_channel(struct gk20a *g, struct nvgpu_channel *ch); int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, - unsigned int id_type); + unsigned int id_type, bool preempt_retries_left); int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid, u32 pbdma_id); #endif /* FIFO_PREEMPT_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c index 74fdb83e2..a60643aa5 100644 --- a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c @@ -163,16 +163,16 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid, static int gv11b_fifo_check_eng_intr_pending(struct gk20a *g, u32 id, struct nvgpu_engine_status_info *engine_status, u32 eng_intr_pending, - u32 engine_id, u32 *reset_eng_bitmask) + u32 engine_id, u32 *reset_eng_bitmask, + bool preempt_retries_left) { + bool check_preempt_retry = false; int ret = -EBUSY; if (engine_status->ctxsw_status == NVGPU_CTX_STATUS_CTXSW_SWITCH) { /* Eng save hasn't started yet. 
Continue polling */ if (eng_intr_pending != 0U) { - /* if eng intr, stop polling */ - *reset_eng_bitmask |= BIT32(engine_id); - ret = 0; + check_preempt_retry = true; } } else if ((engine_status->ctxsw_status == NVGPU_CTX_STATUS_VALID) || @@ -180,9 +180,7 @@ static int gv11b_fifo_check_eng_intr_pending(struct gk20a *g, u32 id, if (id == engine_status->ctx_id) { if (eng_intr_pending != 0U) { - /* preemption will not finish */ - *reset_eng_bitmask |= BIT32(engine_id); - ret = 0; + check_preempt_retry = true; } } else { /* context is not running on the engine */ @@ -193,24 +191,38 @@ static int gv11b_fifo_check_eng_intr_pending(struct gk20a *g, u32 id, if (id == engine_status->ctx_next_id) { if (eng_intr_pending != 0U) { - /* preemption will not finish */ - *reset_eng_bitmask |= BIT32(engine_id); - ret = 0; + check_preempt_retry = true; } } else { /* context is not running on the engine */ ret = 0; } } else { - /* Preempt should be finished */ - ret = 0; + if (eng_intr_pending != 0U) { + check_preempt_retry = true; + } else { + /* Preempt should be finished */ + ret = 0; + } + } + + /* if eng intr, stop polling and check if we can retry preempts. 
*/ + if (check_preempt_retry) { + if (preempt_retries_left) { + ret = -EAGAIN; + } else { + /* preemption will not finish */ + *reset_eng_bitmask |= BIT32(engine_id); + ret = 0; + } } return ret; } static int gv11b_fifo_preempt_poll_eng(struct gk20a *g, u32 id, - u32 engine_id, u32 *reset_eng_bitmask) + u32 engine_id, u32 *reset_eng_bitmask, + bool preempt_retries_left) { struct nvgpu_timeout timeout; u32 delay = POLL_DELAY_MIN_US; @@ -283,8 +295,8 @@ static int gv11b_fifo_preempt_poll_eng(struct gk20a *g, u32 id, } ret = gv11b_fifo_check_eng_intr_pending(g, id, &engine_status, eng_intr_pending, engine_id, - reset_eng_bitmask); - if (ret == 0) { + reset_eng_bitmask, preempt_retries_left); + if (ret == 0 || ret == -EAGAIN) { break; } @@ -292,7 +304,7 @@ static int gv11b_fifo_preempt_poll_eng(struct gk20a *g, u32 id, delay = min_t(u32, delay << 1U, POLL_DELAY_MAX_US); } while (nvgpu_timeout_expired(&timeout) == 0); - if (ret != 0) { + if (ret != 0 && ret != -EAGAIN) { /* * The reasons a preempt can fail are: * 1.Some other stalling interrupt is asserted preventing @@ -309,7 +321,7 @@ static int gv11b_fifo_preempt_poll_eng(struct gk20a *g, u32 id, } int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, - unsigned int id_type) + unsigned int id_type, bool preempt_retries_left) { struct nvgpu_fifo *f = &g->fifo; struct nvgpu_runlist *rl; @@ -350,7 +362,7 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, engine_id = U32(bit); err = gv11b_fifo_preempt_poll_eng(g, tsgid, engine_id, - &rl->reset_eng_bitmask); + &rl->reset_eng_bitmask, preempt_retries_left); if ((err != 0) && (ret == 0)) { ret = err; } diff --git a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.c index 585451bf3..1eef14cdf 100644 --- a/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.c +++ b/drivers/gpu/nvgpu/hal/fifo/runlist_fifo_gk20a.c @@ -109,7 +109,7 @@ int gk20a_fifo_reschedule_preempt_next(struct nvgpu_channel *ch, #endif if (wait_preempt) 
{ if (g->ops.fifo.is_preempt_pending(g, preempt_id, - preempt_type) != 0) { + preempt_type, false) != 0) { nvgpu_err(g, "fifo preempt timed out"); /* * This function does not care if preempt diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/fifo.h b/drivers/gpu/nvgpu/include/nvgpu/gops/fifo.h index 0905e55fd..6393a9d05 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/fifo.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/fifo.h @@ -177,7 +177,7 @@ struct gops_fifo { int (*preempt_poll_pbdma)(struct gk20a *g, u32 tsgid, u32 pbdma_id); int (*is_preempt_pending)(struct gk20a *g, u32 id, - unsigned int id_type); + unsigned int id_type, bool preempt_retries_left); void (*intr_set_recover_mask)(struct gk20a *g); void (*intr_unset_recover_mask)(struct gk20a *g); void (*intr_top_enable)(struct gk20a *g, bool enable); diff --git a/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c b/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c index ba4f3bc77..5b492df3c 100644 --- a/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c +++ b/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c @@ -205,13 +205,15 @@ static void stub_fifo_preempt_trigger(struct gk20a *g, u32 id, } static int stub_fifo_is_preempt_pending_ebusy(struct gk20a *g, u32 id, - unsigned int id_type) + unsigned int id_type, + bool preempt_retries_left) { return -EBUSY; } static int stub_fifo_is_preempt_pending_pass(struct gk20a *g, u32 id, - unsigned int id_type) + unsigned int id_type, + bool preempt_retries_left) { return 0; } @@ -463,7 +465,7 @@ int test_gv11b_fifo_is_preempt_pending(struct unit_module *m, struct gk20a *g, /* Modify eng_stat for engine 0 */ nvgpu_writel(g, fifo_engine_status_r(0U), stub.eng_stat); - err = gv11b_fifo_is_preempt_pending(g, 0U, id_type); + err = gv11b_fifo_is_preempt_pending(g, 0U, id_type, false); if (branches & F_PREEMPT_PENDING_POLL_PBDMA_FAIL) { unit_assert(err == -ETIMEDOUT, goto done);