From 883c12529a688938dcca4477ffa3db121fcaef42 Mon Sep 17 00:00:00 2001 From: Lakshmanan M Date: Tue, 10 Nov 2020 11:13:41 +0530 Subject: [PATCH] gpu: nvgpu: Add multi GR reset support for MIG * Added multi GR reset/recovery support for MIG. * Added an API to get the gr engine id using gr instance id. JIRA NVGPU-5650 JIRA NVGPU-5653 Change-Id: I12ece75a4c33f0944f404121b54879e814dda6df Signed-off-by: Lakshmanan M Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2443644 Reviewed-by: automaticguardword Reviewed-by: Automatic_Commit_Validation_User Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-cert Reviewed-by: Rajesh Devaraj Reviewed-by: Dinesh T Reviewed-by: Deepak Nibade Reviewed-by: mobile promotions Tested-by: mobile promotions GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/common/fifo/engines.c | 14 ++++-- drivers/gpu/nvgpu/common/gr/gr.c | 56 ++++++++++++++++++++--- drivers/gpu/nvgpu/common/rc/rc.c | 9 +++- drivers/gpu/nvgpu/include/nvgpu/engines.h | 12 +++++ 4 files changed, 79 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/nvgpu/common/fifo/engines.c b/drivers/gpu/nvgpu/common/fifo/engines.c index 77c99c2c2..299b62bee 100644 --- a/drivers/gpu/nvgpu/common/fifo/engines.c +++ b/drivers/gpu/nvgpu/common/fifo/engines.c @@ -92,20 +92,26 @@ bool nvgpu_engine_check_valid_id(struct gk20a *g, u32 engine_id) return f->host_engines[engine_id] != NULL; } -u32 nvgpu_engine_get_gr_id(struct gk20a *g) +u32 nvgpu_engine_get_gr_id_for_inst(struct gk20a *g, u32 inst_id) { const struct nvgpu_device *dev; - /* Consider 1st available GR engine */ - dev = nvgpu_device_get(g, NVGPU_DEVTYPE_GRAPHICS, 0); + dev = nvgpu_device_get(g, NVGPU_DEVTYPE_GRAPHICS, inst_id); if (dev == NULL) { - nvgpu_warn(g, "No GR devices on this GPU?!"); + nvgpu_warn(g, "No GR devices on this GPU for inst[%u]?!", + inst_id); return NVGPU_INVALID_ENG_ID; } return dev->engine_id; } +u32 nvgpu_engine_get_gr_id(struct gk20a *g) +{ + /* Consider 1st available GR engine */ + return 
nvgpu_engine_get_gr_id_for_inst(g, 0U); +} + u32 nvgpu_engine_act_interrupt_mask(struct gk20a *g, u32 engine_id) { const struct nvgpu_device *dev = NULL; diff --git a/drivers/gpu/nvgpu/common/gr/gr.c b/drivers/gpu/nvgpu/common/gr/gr.c index f022ba44c..54b366d1a 100644 --- a/drivers/gpu/nvgpu/common/gr/gr.c +++ b/drivers/gpu/nvgpu/common/gr/gr.c @@ -681,32 +681,70 @@ int nvgpu_gr_enable_hw(struct gk20a *g) } #ifdef CONFIG_NVGPU_ENGINE_RESET +static int nvgpu_gr_enable_hw_for_instance(struct gk20a *g) +{ + int err; + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "Enable GR%u HW", + nvgpu_gr_get_cur_instance_id(g)); + + err = gr_reset_engine(g); + if (err != 0) { + nvgpu_err(g, "Gr Reset failed"); + return err; + } + + nvgpu_cg_init_gr_load_gating_prod(g); + + /* Disable elcg until it gets enabled later in the init*/ + nvgpu_cg_elcg_disable_no_wait(g); + + /** Enable interrupts at MC level */ + nvgpu_mc_intr_stall_unit_config(g, MC_INTR_UNIT_GR, MC_INTR_ENABLE); + nvgpu_mc_intr_nonstall_unit_config(g, MC_INTR_UNIT_GR, MC_INTR_ENABLE); + + err = gr_init_prepare_hw_impl(g); + if (err != 0) { + nvgpu_err(g, "gr_init_prepare_hw_impl failed"); + return err; + } + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done"); + + return 0; +} + int nvgpu_gr_reset(struct gk20a *g) { int err; + struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g); struct nvgpu_mutex *fecs_mutex = - nvgpu_gr_falcon_get_fecs_mutex(g->gr->falcon); + nvgpu_gr_falcon_get_fecs_mutex(gr->falcon); g->gr->initialized = false; - nvgpu_log(g, gpu_dbg_rec, "Resetting GR"); + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr | gpu_dbg_rec, "Resetting GR%u HW", + nvgpu_gr_get_cur_instance_id(g)); nvgpu_mutex_acquire(fecs_mutex); - err = nvgpu_gr_enable_hw(g); + err = nvgpu_gr_enable_hw_for_instance(g); if (err != 0) { + nvgpu_err(g, "nvgpu_gr_enable_hw_for_instance failed"); nvgpu_mutex_release(fecs_mutex); return err; } - err = gr_init_setup_hw(g, g->gr); + err = gr_init_setup_hw(g, gr); if (err != 0) { + nvgpu_err(g, "gr_init_setup_hw 
failed"); nvgpu_mutex_release(fecs_mutex); return err; } - err = nvgpu_gr_falcon_init_ctxsw(g, g->gr->falcon); + err = nvgpu_gr_falcon_init_ctxsw(g, gr->falcon); if (err != 0) { + nvgpu_err(g, "nvgpu_gr_falcon_init_ctxsw failed"); nvgpu_mutex_release(fecs_mutex); return err; } @@ -718,8 +756,9 @@ int nvgpu_gr_reset(struct gk20a *g) * ramchain, etc so this is hw init. Hence should be executed * for every GR engine HW initialization. */ - err = nvgpu_gr_init_ctx_state(g, g->gr); + err = nvgpu_gr_init_ctx_state(g, gr); if (err != 0) { + nvgpu_err(g, "nvgpu_gr_init_ctx_state failed"); return err; } @@ -727,6 +766,7 @@ int nvgpu_gr_reset(struct gk20a *g) if (g->can_elpg) { err = nvgpu_gr_falcon_bind_fecs_elpg(g); if (err != 0) { + nvgpu_err(g, "nvgpu_gr_falcon_bind_fecs_elpg failed"); return err; } } @@ -738,7 +778,9 @@ int nvgpu_gr_reset(struct gk20a *g) /* GR is inialized, signal possible waiters */ g->gr->initialized = true; - nvgpu_cond_signal(&g->gr->init_wq); + nvgpu_cond_signal(&gr->init_wq); + + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done"); return err; } #endif diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c index a8e57b903..4e5a64715 100644 --- a/drivers/gpu/nvgpu/common/rc/rc.c +++ b/drivers/gpu/nvgpu/common/rc/rc.c @@ -33,6 +33,7 @@ #include #include #include +#include void nvgpu_rc_fifo_recover(struct gk20a *g, u32 eng_bitmask, u32 hw_id, bool id_is_tsg, @@ -181,8 +182,13 @@ void nvgpu_rc_gr_fault(struct gk20a *g, struct nvgpu_tsg *tsg, #ifdef CONFIG_NVGPU_RECOVERY u32 gr_engine_id; u32 gr_eng_bitmask = 0U; + u32 cur_gr_instance_id = nvgpu_gr_get_cur_instance_id(g); + u32 inst_id = nvgpu_gr_get_syspipe_id(g, cur_gr_instance_id); - gr_engine_id = nvgpu_engine_get_gr_id(g); + nvgpu_log(g, gpu_dbg_gr, "RC GR%u inst_id%u", + cur_gr_instance_id, inst_id); + + gr_engine_id = nvgpu_engine_get_gr_id_for_inst(g, inst_id); if (gr_engine_id != NVGPU_INVALID_ENG_ID) { gr_eng_bitmask = BIT32(gr_engine_id); } else { @@ -203,6 +209,7 @@ 
void nvgpu_rc_gr_fault(struct gk20a *g, struct nvgpu_tsg *tsg, #else WARN_ON(!g->sw_quiesce_pending); #endif + nvgpu_log(g, gpu_dbg_gr, "done"); } void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/include/nvgpu/engines.h b/drivers/gpu/nvgpu/include/nvgpu/engines.h index f3ad99ae8..f1229b2a0 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/engines.h +++ b/drivers/gpu/nvgpu/include/nvgpu/engines.h @@ -113,6 +113,18 @@ const struct nvgpu_device *nvgpu_engine_get_active_eng_info( * does not match with any of the engine ids supported by h/w. */ bool nvgpu_engine_check_valid_id(struct gk20a *g, u32 engine_id); +/** + * @brief Get h/w engine id for #NVGPU_ENGINE_GR engine enum type based on the + * engine's instance identification number. + * + * @param g [in] The GPU driver struct. + * @param inst_id [in] Engine's instance identification number. + * + * @return H/W engine id for #NVGPU_ENGINE_GR engine enum type. + * @retval #NVGPU_INVALID_ENG_ID if #NVGPU_ENGINE_GR engine enum type could not + * be found in the set of available h/w engine ids. + */ +u32 nvgpu_engine_get_gr_id_for_inst(struct gk20a *g, u32 inst_id); /** * @brief Get instance count and first available h/w engine id for * #NVGPU_ENGINE_GR engine enum type.