gpu: nvgpu: enable polling support for error reporting in AV+L

As per Safety_Services, a client must perform polling to ensure that previously reported errors are cleared at FSI in case of back-to-back error reporting. However, to minimize the polling overhead, the NvGPU driver performs polling only when the error to be reported is a corrected error, to ensure that it does not overwrite a previously reported uncorrected/corrected error. Uncorrected errors are reported without polling. This leads to a failure in error reporting when uncorrected errors are reported back-to-back. This is acceptable for safety builds, where SW quiesce is triggered immediately after the first uncorrected error is reported. In other build configurations, MCU/SEH decides how to act when uncorrected errors are encountered. To handle such build configurations, polling is enabled for all types of errors, in all build configurations. This patch also removes the macro "ERR_TYPE_MASK", which is no longer used once the corrected-error check is dropped. Bug 3622420 Change-Id: I750b0406faec9b229d8d0c74e986807234362cb9 Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2707105 Reviewed-by: Tejal Kudav <tkudav@nvidia.com> Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com> GVS: Gerrit_Virtual_Submit
2025-12-24 10:34:43 +03:00 · 2022-05-04 11:47:24 +00:00
parent 657daaee9e
commit fac998940c
2 changed files with 24 additions and 30 deletions
--- a/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h
@@ -35,7 +35,6 @@
 #define ERR_REPORT_TIMEOUT_US		(5000U)
 #define SS_WAIT_DURATION_US		(500U)
 #define MAX_SS_RETRIES (ERR_REPORT_TIMEOUT_US / SS_WAIT_DURATION_US)
-#define ERR_TYPE_MASK		((1U) << (CORRECTED_BIT_FIELD_SHIFT))

 #define U32_BITS		32U
 #define DIV_BY_U32_BITS(x)	((x) / U32_BITS)
--- a/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c
+++ b/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c
@@ -35,43 +35,38 @@ int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, u32 err_id)
 	struct device *dev = dev_from_gk20a(g);

 	if (g->enable_polling == true) {
-		/**
-		 * Do polling, if the error to be reported is corrected one.
-		 */
-		if ((err_id & ERR_TYPE_MASK) == 0U) {
-			for (ss_retries = MAX_SS_RETRIES; ss_retries > 0U;
+		for (ss_retries = MAX_SS_RETRIES; ss_retries > 0U;
 					ss_retries--) {
-				ret = epl_get_misc_ec_err_status(dev,
+			ret = epl_get_misc_ec_err_status(dev,
 						MISC_EC_SW_ERR_CODE_0,
 						&ss_status);
-				if (ret == 0) {
-					if (ss_status == true) {
-						/**
-						 * Previously reported error is
-						 * cleared at Safety_Services.
-						 */
-						break;
-					} else {
-						nvgpu_udelay(SS_WAIT_DURATION_US);
-						continue;
-					}
-				} else if (ret == -ENODEV) {
-					nvgpu_err(g, "Error reporting is not "
-						"supported in this platform");
-					ret = 0;
-					return ret;
+			if (ret == 0) {
+				if (ss_status == true) {
+					/**
+					 * Previously reported error is cleared.
+					 */
+					break;
 				} else {
-					nvgpu_err(g, "Error reporting failed");
-					return ret;
+					nvgpu_info(g, "Polling is in progress");
+					nvgpu_udelay(SS_WAIT_DURATION_US);
+					continue;
 				}
-			}
-			if (ss_retries == 0U) {
-				nvgpu_err(g, "Error reporting failed: previous"
-						"error is not cleared");
-				ret = -1;
+			} else if (ret == -ENODEV) {
+				nvgpu_err(g, "Error reporting is not "
+						"supported in this platform");
+				ret = 0;
+				return ret;
+			} else {
+				nvgpu_err(g, "Error reporting failed");
 				return ret;
 			}
 		}
+		if (ss_retries == 0U) {
+			nvgpu_err(g, "Error reporting failed: previous"
+					"error is not cleared after retries");
+			ret = -1;
+			return ret;
+		}
 	}

 	/**