gpu: nvgpu: add polling for back-to-back error reporting in av+l

When an error is reported to Safety_Services, it will be cleared
at FSI and reported to SEH (System Error Handler). Since the MISC_EC
interface provides only one register for error reporting, the status
of the previously reported error must be polled before the next error
is reported. For this purpose, this patch adds logic to perform
polling using epl_get_misc_ec_err_status() in AV+L.

JIRA NVGPU-8094
Bug 200729736

Change-Id: Ia01a2fc42a7ce586b7965a82c90027a9a2dd252b
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2684141
Reviewed-by: Dinesh T <dt@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
Rajesh Devaraj
2022-03-19 18:13:02 +00:00
committed by mobile promotions
parent 9edbac4494
commit 4652f96a6f
2 changed files with 55 additions and 1 deletions

View File

@@ -19,6 +19,7 @@
#ifdef CONFIG_NVGPU_ENABLE_MISC_EC #ifdef CONFIG_NVGPU_ENABLE_MISC_EC
#include <linux/tegra-epl.h> #include <linux/tegra-epl.h>
#include <nvgpu/timers.h>
#include "os/linux/os_linux.h" #include "os/linux/os_linux.h"
#endif #endif
@@ -26,11 +27,60 @@ struct gk20a;
int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, u32 err_id) int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, u32 err_id)
{ {
int ret = 0U; int ret = 0;
#ifdef CONFIG_NVGPU_ENABLE_MISC_EC #ifdef CONFIG_NVGPU_ENABLE_MISC_EC
u32 ss_retries = 0U;
bool ss_status;
struct device *dev = dev_from_gk20a(g); struct device *dev = dev_from_gk20a(g);
if (g->enable_polling == true) {
/**
* Do polling, if the error to be reported is corrected one.
*/
if ((err_id & ERR_TYPE_MASK) == 0U) {
for (ss_retries = MAX_SS_RETRIES; ss_retries > 0U;
ss_retries--) {
ret = epl_get_misc_ec_err_status(dev,
MISC_EC_SW_ERR_CODE_0,
&ss_status);
if (ret == 0) {
if (ss_status == true) {
/**
* Previously reported error is
* cleared at Safety_Services.
*/
break;
} else {
nvgpu_udelay(SS_WAIT_DURATION_US);
continue;
}
} else if (ret == -ENODEV) {
nvgpu_err(g, "Error reporting is not "
"supported in this platform");
ret = 0;
return ret;
} else {
nvgpu_err(g, "Error reporting failed");
return ret;
}
}
if (ss_retries == 0U) {
nvgpu_err(g, "Error reporting failed: previous"
"error is not cleared");
ret = -1;
return ret;
}
}
}
/**
* Enable polling immediately after reporting of first error from boot.
*/
if (g->enable_polling == false) {
g->enable_polling = true;
}
/** /**
* MISC_EC_SW_ERR_CODE_0 register has been allocated for NvGPU * MISC_EC_SW_ERR_CODE_0 register has been allocated for NvGPU
* to report GPU HW errors to Safety_Services via MISC_EC interface. * to report GPU HW errors to Safety_Services via MISC_EC interface.

View File

@@ -452,6 +452,10 @@ int gk20a_pm_finalize_poweron(struct device *dev)
} }
} }
#ifdef CONFIG_NVGPU_ENABLE_MISC_EC
g->enable_polling = false;
#endif
err = gk20a_restore_registers(g); err = gk20a_restore_registers(g);
if (err) if (err)
goto done; goto done;