From ea9aebb358e3459d46584eabbea12b7d266614d3 Mon Sep 17 00:00:00 2001 From: Kishan Date: Wed, 2 Nov 2022 07:40:10 +0000 Subject: [PATCH] nvgpu:cic: API to handle fatal error interrupt Any corrected or uncorrected error reported by gpu hw will be seen by nvgpu-mon. nvgpu-mon will raise a devctl call to notify nvgpu-rm if it's a fatal error interrupt. nvgpu_cic_mon_handle_fatal_intr is the corresponding handler which will walk through the entire tree structure of interrupts for all the subunits and enter quiesce state. Change-Id: I3c00c61a7f2c52ae1920f84ee7dfb65cba6b683d Signed-off-by: Kishan Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2801693 Reviewed-by: svcacv Reviewed-by: Vaibhav Kachore GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/common/cic/mon/mon_intr.c | 12 ++++++++++++ drivers/gpu/nvgpu/include/nvgpu/cic_mon.h | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_intr.c b/drivers/gpu/nvgpu/common/cic/mon/mon_intr.c index 79f24381c..7b7eb8e99 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_intr.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_intr.c @@ -153,6 +153,18 @@ void nvgpu_cic_mon_intr_nonstall_handle(struct gk20a *g) (void)nvgpu_cic_rm_broadcast_last_irq_nonstall(g); } #endif +#ifdef CONFIG_NVGPU_MON_PRESENT +int nvgpu_cic_mon_handle_fatal_intr(struct gk20a *g) +{ + if (nvgpu_is_powered_off(g)) { + nvgpu_err(g, "GPU is already powered off"); + return -ENODEV; + } + g->ops.mc.isr_stall(g); + + return 0U; +} +#endif u32 nvgpu_cic_mon_intr_stall_isr(struct gk20a *g) { diff --git a/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h b/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h index e2e79a18d..cea989145 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h +++ b/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h @@ -409,6 +409,22 @@ int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, */ int nvgpu_cic_mon_get_num_hw_modules(struct gk20a *g); +#ifdef CONFIG_NVGPU_MON_PRESENT +/** + * @brief 
Fatal error interrupt handler for safety. + * + * @param g [in] The GPU driver struct. + * + * This function is invoked by NVGPU_MON_DEVCTL_NOTIFY_INTR devctl raised by nvgpu-mon. + * It is called to parse the interrupt tree and determine the exact error. + * The unit ISR functions are invoked based on triggered interrupts. + * + * @retval -ENODEV if GPU is already powered off. + * @retval 0U if Fatal interrupt handling was performed successfully. + */ +int nvgpu_cic_mon_handle_fatal_intr(struct gk20a *g); +#endif + /** * @brief Top half of stall interrupt ISR. *