From 4652f96a6ffb4f7055290181c5234893d0096789 Mon Sep 17 00:00:00 2001 From: Rajesh Devaraj Date: Sat, 19 Mar 2022 18:13:02 +0000 Subject: [PATCH] gpu: nvgpu: add polling for back-to-back error reporting in av+l When an error is reported to Safety_Services, it will be cleared at FSI and reported to SEH (System Error Handler). Since MISC_EC interface provides only one register for error reporting, there is a need to poll the status of previously reported error before reporting the next error. For this purpose, this patch adds logic to perform polling using epl_get_misc_ec_err_status(), in AV+L. JIRA NVGPU-8094 Bug 200729736 Change-Id: Ia01a2fc42a7ce586b7965a82c90027a9a2dd252b Signed-off-by: Rajesh Devaraj Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2684141 Reviewed-by: Dinesh T Reviewed-by: Ankur Kishore GVS: Gerrit_Virtual_Submit --- .../gpu/nvgpu/os/linux/cic/cic_report_err.c | 52 ++++++++++++++++++- drivers/gpu/nvgpu/os/linux/module.c | 4 ++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c b/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c index 9e3cffdc0..4ce110c94 100644 --- a/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c +++ b/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c @@ -19,6 +19,7 @@ #ifdef CONFIG_NVGPU_ENABLE_MISC_EC #include +#include #include "os/linux/os_linux.h" #endif @@ -26,11 +27,60 @@ struct gk20a; int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, u32 err_id) { - int ret = 0U; + int ret = 0; #ifdef CONFIG_NVGPU_ENABLE_MISC_EC + u32 ss_retries = 0U; + bool ss_status; struct device *dev = dev_from_gk20a(g); + if (g->enable_polling == true) { + /** + * Do polling, if the error to be reported is corrected one. + */ + if ((err_id & ERR_TYPE_MASK) == 0U) { + for (ss_retries = MAX_SS_RETRIES; ss_retries > 0U; + ss_retries--) { + ret = epl_get_misc_ec_err_status(dev, + MISC_EC_SW_ERR_CODE_0, + &ss_status); + if (ret == 0) { + if (ss_status == true) { + /** + * Previously reported error is + * cleared at Safety_Services. + */ + break; + } else { + nvgpu_udelay(SS_WAIT_DURATION_US); + continue; + } + } else if (ret == -ENODEV) { + nvgpu_err(g, "Error reporting is not " + "supported in this platform"); + ret = 0; + return ret; + } else { + nvgpu_err(g, "Error reporting failed"); + return ret; + } + } + if (ss_retries == 0U) { + nvgpu_err(g, "Error reporting failed: previous" + "error is not cleared"); + ret = -1; + return ret; + } + } + } + + /** + * Enable polling immediately after reporting of first error from boot. + */ + if (g->enable_polling == false) { + g->enable_polling = true; + } + /** * MISC_EC_SW_ERR_CODE_0 register has been allocated for NvGPU * to report GPU HW errors to Safety_Services via MISC_EC interface. diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index 2c7e4d595..0fd9ca926 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -452,6 +452,10 @@ int gk20a_pm_finalize_poweron(struct device *dev) } } +#ifdef CONFIG_NVGPU_ENABLE_MISC_EC + g->enable_polling = false; +#endif + err = gk20a_restore_registers(g); if (err) goto done;