From c7b41f106d2c8f5ed6d1f23f8515812c2d832ea2 Mon Sep 17 00:00:00 2001 From: Thomas Fleury Date: Thu, 1 Aug 2019 11:31:29 -0400 Subject: [PATCH] gpu: nvgpu: add CONFIG_NVGPU_RECOVERY Add CONFIG_NVGPU_RECOVERY in order to conditionally compile recovery code. This code will be removed from safety build when sw quiesce state is implemented, and negative tests are disabled or modified such that they do not expect recovery to happen. Added static inline functions for recovery handlers, when CONFIG_NVGPU_RECOVERY is not defined. These inline functions can later be wired to the sw quiesce functions. Also moved gv11b recovery code to non-fusa, as it will ultimately be removed from safety build. Jira NVGPU-3871 Change-Id: Ia705b059fab6120899c7e15082f2a0f51ff51dc9 Signed-off-by: Thomas Fleury Reviewed-on: https://git-master.nvidia.com/r/2166074 Reviewed-by: mobile promotions Tested-by: mobile promotions --- arch/nvgpu-hal-new.yaml | 2 +- drivers/gpu/nvgpu/Kconfig | 9 +++- drivers/gpu/nvgpu/Makefile | 11 +++-- drivers/gpu/nvgpu/Makefile.shared.configs | 4 ++ drivers/gpu/nvgpu/Makefile.sources | 11 +++-- drivers/gpu/nvgpu/hal/init/hal_gm20b.c | 4 ++ drivers/gpu/nvgpu/hal/init/hal_gp10b.c | 4 ++ drivers/gpu/nvgpu/hal/init/hal_gv11b.c | 4 ++ drivers/gpu/nvgpu/hal/init/hal_tu104.c | 4 ++ .../hal/rc/{rc_gv11b_fusa.c => rc_gv11b.c} | 0 drivers/gpu/nvgpu/include/nvgpu/rc.h | 43 +++++++++++++++++++ 11 files changed, 88 insertions(+), 8 deletions(-) rename drivers/gpu/nvgpu/hal/rc/{rc_gv11b_fusa.c => rc_gv11b.c} (100%) diff --git a/arch/nvgpu-hal-new.yaml b/arch/nvgpu-hal-new.yaml index 0a3055fe4..1bd84d950 100644 --- a/arch/nvgpu-hal-new.yaml +++ b/arch/nvgpu-hal-new.yaml @@ -119,7 +119,7 @@ rc: owner: Seema K sources: [ hal/rc/rc_gk20a.c, hal/rc/rc_gk20a.h, - hal/rc/rc_gv11b_fusa.c, + hal/rc/rc_gv11b.c, hal/rc/rc_gv11b.h ] fbpa: diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig index 0fef9750e..42bb21e31 100644 --- a/drivers/gpu/nvgpu/Kconfig +++ b/drivers/gpu/nvgpu/Kconfig @@ -186,4 +186,11 @@ config NVGPU_HAL_NON_FUSA default y help Enable/Disable the support of HALs from chips that do not have functional - safety certification \ No newline at end of file + safety certification + +config NVGPU_RECOVERY + bool "Recovery support" + depends on GK20A + default y + help + Support recovery on failure (which may involve engine reset) diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 6a4d06b28..bde3a3495 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -62,6 +62,14 @@ ifeq ($(CONFIG_NVGPU_HAL_NON_FUSA),y) ccflags-y += -DCONFIG_NVGPU_HAL_NON_FUSA endif +ifeq ($(CONFIG_NVGPU_RECOVERY),y) +ccflags-y += -DCONFIG_NVGPU_RECOVERY +nvgpu-y += \ + common/rc/rc.o \ + hal/rc/rc_gk20a.o \ + hal/rc/rc_gv11b.o +endif + obj-$(CONFIG_GK20A) := nvgpu.o # OS independent parts of nvgpu. The work to collect files here @@ -252,7 +260,6 @@ nvgpu-y += \ hal/fuse/fuse_gm20b.o \ hal/fuse/fuse_gp106.o \ hal/func/func_tu104.o \ - hal/rc/rc_gk20a.o \ hal/fifo/fifo_gk20a.o \ hal/fifo/fifo_tu104.o \ hal/fifo/preempt_gk20a.o \ @@ -490,7 +497,6 @@ nvgpu-y += \ common/sim/sim.o \ common/sim/sim_pci.o \ common/sim/sim_netlist.o \ - common/rc/rc.o \ common/fifo/fifo.o \ common/fifo/preempt.o \ common/fifo/channel.o \ @@ -640,7 +646,6 @@ nvgpu-y += \ hal/priv_ring/priv_ring_gm20b_fusa.o \ hal/priv_ring/priv_ring_gp10b_fusa.o \ hal/ptimer/ptimer_gk20a_fusa.o \ - hal/rc/rc_gv11b_fusa.o \ hal/sync/syncpt_cmdbuf_gv11b_fusa.o \ hal/therm/therm_gm20b_fusa.o \ hal/therm/therm_gv11b_fusa.o \ diff --git a/drivers/gpu/nvgpu/Makefile.shared.configs b/drivers/gpu/nvgpu/Makefile.shared.configs index 2eb2e14ec..51516f957 100644 --- a/drivers/gpu/nvgpu/Makefile.shared.configs +++ b/drivers/gpu/nvgpu/Makefile.shared.configs @@ -93,6 +93,10 @@ NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_CHANNEL_TSG_CONTROL CONFIG_NVGPU_LOGGING := 1 NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_LOGGING +# Enable recovery for safety build until sw quiesce is done +CONFIG_NVGPU_RECOVERY := 1 +NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_RECOVERY + # # Flags enabled only for safety debug and regular build profile. # diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index 87a29427f..6e1b99787 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -132,7 +132,6 @@ srcs += common/utils/assert.c \ common/power_features/cg/cg.c \ common/fifo/preempt.c \ common/fifo/channel.c \ - common/rc/rc.c \ common/fifo/fifo.c \ common/fifo/pbdma.c \ common/fifo/tsg.c \ @@ -233,7 +232,6 @@ srcs += hal/mm/mm_gv11b_fusa.c \ hal/priv_ring/priv_ring_gm20b_fusa.c \ hal/priv_ring/priv_ring_gp10b_fusa.c \ hal/ptimer/ptimer_gk20a_fusa.c \ - hal/rc/rc_gv11b_fusa.c \ hal/sync/syncpt_cmdbuf_gv11b_fusa.c \ hal/therm/therm_gm20b_fusa.c \ hal/therm/therm_gv11b_fusa.c \ @@ -283,7 +281,6 @@ srcs += hal/init/hal_gp10b.c \ hal/fb/fb_gm20b.c \ hal/fb/fb_gv11b.c \ hal/fuse/fuse_gm20b.c \ - hal/rc/rc_gk20a.c \ hal/fifo/fifo_gk20a.c \ hal/fifo/preempt_gk20a.c \ hal/fifo/engines_gm20b.c \ @@ -390,6 +387,14 @@ srcs += common/fifo/userd.c \ hal/fifo/userd_gv11b.c endif +ifeq ($(CONFIG_NVGPU_RECOVERY),1) +srcs += common/rc/rc.c \ + hal/rc/rc_gv11b.c +ifeq ($(CONFIG_NVGPU_HAL_NON_FUSA),1) +srcs += hal/rc/rc_gk20a.c +endif +endif + ifeq ($(CONFIG_NVGPU_FENCE),1) srcs += common/fence/fence.c endif diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 5bd1ea101..7d5626a94 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -83,7 +83,9 @@ #include "hal/fifo/ctxsw_timeout_gk20a.h" #include "hal/fifo/mmu_fault_gk20a.h" #include "hal/fifo/mmu_fault_gm20b.h" +#ifdef CONFIG_NVGPU_RECOVERY #include "hal/rc/rc_gk20a.h" +#endif #ifdef CONFIG_NVGPU_GRAPHICS #include "hal/gr/zbc/zbc_gm20b.h" #include "hal/gr/zcull/zcull_gm20b.h" @@ -606,7 +608,9 @@ static const struct gpu_ops gm20b_ops = { .init_pbdma_map = gk20a_fifo_init_pbdma_map, .is_preempt_pending = gk20a_fifo_is_preempt_pending, .reset_enable_hw = gk20a_init_fifo_reset_enable_hw, +#ifdef CONFIG_NVGPU_RECOVERY .recover = gk20a_fifo_recover, +#endif .intr_set_recover_mask = gk20a_fifo_intr_set_recover_mask, .intr_unset_recover_mask = gk20a_fifo_intr_unset_recover_mask, .setup_sw = nvgpu_fifo_setup_sw, diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index e1dfa2a0d..8050d8894 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -96,7 +96,9 @@ #include "hal/fifo/mmu_fault_gm20b.h" #include "hal/fifo/mmu_fault_gp10b.h" #include "hal/fifo/ctxsw_timeout_gk20a.h" +#ifdef CONFIG_NVGPU_RECOVERY #include "hal/rc/rc_gk20a.h" +#endif #include "hal/gr/ecc/ecc_gp10b.h" #include "hal/gr/fecs_trace/fecs_trace_gm20b.h" #include "hal/gr/config/gr_config_gm20b.h" @@ -675,7 +677,9 @@ static const struct gpu_ops gp10b_ops = { .init_pbdma_map = gk20a_fifo_init_pbdma_map, .is_preempt_pending = gk20a_fifo_is_preempt_pending, .reset_enable_hw = gk20a_init_fifo_reset_enable_hw, +#ifdef CONFIG_NVGPU_RECOVERY .recover = gk20a_fifo_recover, +#endif .intr_set_recover_mask = gk20a_fifo_intr_set_recover_mask, .intr_unset_recover_mask = gk20a_fifo_intr_unset_recover_mask, .setup_sw = nvgpu_fifo_setup_sw, diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 67d577b10..e24010376 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -71,7 +71,9 @@ #include "hal/fuse/fuse_gp10b.h" #include "hal/ptimer/ptimer_gk20a.h" #include "hal/regops/regops_gv11b.h" +#ifdef CONFIG_NVGPU_RECOVERY #include "hal/rc/rc_gv11b.h" +#endif #include "hal/fifo/fifo_gk20a.h" #include "hal/fifo/fifo_gv11b.h" #include "hal/fifo/pbdma_gm20b.h" @@ -833,7 +835,9 @@ static const struct gpu_ops gv11b_ops = { .init_pbdma_map = gk20a_fifo_init_pbdma_map, .is_preempt_pending = gv11b_fifo_is_preempt_pending, .reset_enable_hw = gv11b_init_fifo_reset_enable_hw, +#ifdef CONFIG_NVGPU_RECOVERY .recover = gv11b_fifo_recover, +#endif .intr_set_recover_mask = gv11b_fifo_intr_set_recover_mask, .intr_unset_recover_mask = gv11b_fifo_intr_unset_recover_mask, .setup_sw = nvgpu_fifo_setup_sw, diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index 62b299a8f..bb77d4449 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -71,7 +71,9 @@ #include "hal/fuse/fuse_gm20b.h" #include "hal/fuse/fuse_gp10b.h" #include "hal/fuse/fuse_gp106.h" +#ifdef CONFIG_NVGPU_RECOVERY #include "hal/rc/rc_gv11b.h" +#endif #include "hal/fifo/fifo_gk20a.h" #include "hal/fifo/fifo_gv11b.h" #include "hal/fifo/fifo_tu104.h" @@ -863,7 +865,9 @@ static const struct gpu_ops tu104_ops = { .init_pbdma_map = gk20a_fifo_init_pbdma_map, .is_preempt_pending = gv11b_fifo_is_preempt_pending, .reset_enable_hw = gv11b_init_fifo_reset_enable_hw, +#ifdef CONFIG_NVGPU_RECOVERY .recover = gv11b_fifo_recover, +#endif .intr_set_recover_mask = gv11b_fifo_intr_set_recover_mask, .intr_unset_recover_mask = gv11b_fifo_intr_unset_recover_mask, .setup_sw = nvgpu_fifo_setup_sw, diff --git a/drivers/gpu/nvgpu/hal/rc/rc_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/rc/rc_gv11b.c similarity index 100% rename from drivers/gpu/nvgpu/hal/rc/rc_gv11b_fusa.c rename to drivers/gpu/nvgpu/hal/rc/rc_gv11b.c diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h index df7236444..3402b71ef 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/rc.h +++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h @@ -43,6 +43,7 @@ struct nvgpu_tsg; struct nvgpu_channel; struct nvgpu_pbdma_status_info; +#ifdef CONFIG_NVGPU_RECOVERY void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, struct nvgpu_tsg *tsg, bool debug_dump); @@ -64,4 +65,46 @@ void nvgpu_rc_fifo_recover(struct gk20a *g, u32 hw_id, /* if ~0, will be queried from HW */ bool id_is_tsg, /* ignored if hw_id == ~0 */ bool id_is_known, bool debug_dump, u32 rc_type); +#else +static inline void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, + struct nvgpu_tsg *tsg, bool debug_dump) +{ +} + +static inline void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f, + u32 pbdma_id, u32 error_notifier) +{ +} + +static inline void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id) +{ +} + +static inline void nvgpu_rc_preempt_timeout(struct gk20a *g, struct nvgpu_tsg *tsg) +{ +} + +static inline void nvgpu_rc_gr_fault(struct gk20a *g, + struct nvgpu_tsg *tsg, struct nvgpu_channel *ch) +{ +} + +static inline void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g) +{ +} + +static inline void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct nvgpu_tsg *tsg, + bool debug_dump, u32 rc_type) +{ +} + +static inline void nvgpu_rc_fifo_recover(struct gk20a *g, + u32 eng_bitmask, /* if zero, will be queried from HW */ + u32 hw_id, /* if ~0, will be queried from HW */ + bool id_is_tsg, /* ignored if hw_id == ~0 */ + bool id_is_known, bool debug_dump, u32 rc_type) +{ +} + +#endif #endif /* NVGPU_RC_H */