gpu: nvgpu: compile-out unused apis from safety build

This patch makes the following changes:
- Compiles out unused error reporting APIs and the related data structures from the safety build. For this purpose, it introduces the new flag CONFIG_NVGPU_INTR_DEBUG.
- Updates the nvgpu_report_err_to_sdl() API with one more argument, hw_unit_id. This aids in finding from the LUT whether an error to be reported is corrected or uncorrected.
- Triggers SW quiesce in the safety build if an uncorrected error is reported to Safety_Services.
- Renames files in the cic folder by replacing gv11b with ga10b, since error reporting for gv11b is not supported in dev-main.

JIRA NVGPU-8002

Change-Id: Ic01e73b0208252abba1f615a2c98d770cdf41ca4
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2668466
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
GVS: Gerrit_Virtual_Submit
2025-12-24 10:34:43 +03:00 · 2022-02-11 05:59:52 +00:00
parent 81c220b95b
commit 0699220b85
49 changed files with 456 additions and 316 deletions
--- a/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h
@@ -128,6 +128,7 @@ struct gops_ltc_intr {
 	 *               -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
 	 *                  "nvgpu_report_err_to_sdl" with following parameters:
 	 *                  -# \a g
+	 *                  -# \ref NVGPU_ERR_MODULE_LTC
 	 *                  -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED
 	 *                     "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED"
 	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is
@@ -142,6 +143,7 @@ struct gops_ltc_intr {
 	 *               -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
 	 *                  "nvgpu_report_err_to_sdl" with following parameters:
 	 *                  -# \a g
+	 *                  -# \ref NVGPU_ERR_MODULE_LTC
 	 *                  -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED
 	 *                     "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED"
 	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is
@@ -157,6 +159,7 @@ struct gops_ltc_intr {
 	 *               -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
 	 *                  "nvgpu_report_err_to_sdl" with following parameters:
 	 *                  -# \a g
+	 *                  -# \ref NVGPU_ERR_MODULE_LTC
 	 *                  -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED
 	 *                     "GPU_LTC_CACHE_DSTG_ECC_CORRECTED"
 	 *               -# Flush the L2 cache by calling
@@ -173,6 +176,7 @@ struct gops_ltc_intr {
 	 *                  -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
 	 *                     "nvgpu_report_err_to_sdl" with following parameters:
 	 *                     -# \a g
+	 *                     -# \ref NVGPU_ERR_MODULE_LTC
 	 *                     -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED
 	 *                        "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED"
 	 *               -# Else if the ECC address corresponds to DSTG BE RAM:
@@ -182,6 +186,7 @@ struct gops_ltc_intr {
 	 *                  -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
 	 *                     "nvgpu_report_err_to_sdl" with following parameters:
 	 *                     -# \a g
+	 *                     -# \ref NVGPU_ERR_MODULE_LTC
 	 *                     -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
 	 *                        "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED"
 	 *               -# Else call \ref BUG "BUG()" as this type of ECC error is not supported.
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -108,28 +108,6 @@ struct mmu_fault_info;
 * @}
 */

-/**
- * This structure is used to store SM machine check related information.
- */
-struct gr_sm_mcerr_info {
-	/** PC which triggered the machine check error. */
-	u64 hww_warp_esr_pc;
-
-	/** Error status register. */
-	u32 hww_warp_esr_status;
-
-	/** GR engine context of the faulted channel. */
-	u32 curr_ctx;
-
-	/** Channel to which the context belongs. */
-	u32 chid;
-
-	/** TSG to which the channel is bound. */
-	u32 tsgid;
-
-	/** IDs of TPC, GPC, and SM. */
-	u32 tpc, gpc, sm;
-};

 /**
 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
@@ -147,32 +125,6 @@ struct gr_sm_mcerr_info {
 * @}
 */

-/**
- * This structure is used to store CTXSW error related information.
- */
-struct ctxsw_err_info {
-
-	/** GR engine context of the faulted channel. */
-	u32 curr_ctx;
-
-	/** Context-switch status register-0. */
-	u32 ctxsw_status0;
-
-	/** Context-switch status register-1. */
-	u32 ctxsw_status1;
-
-	/** Channel to which the context belongs. */
-	u32 chid;
-
-	/**
-	 * In case of any fault during context-switch transaction,
-	 * context-switch error interrupt is set and the FECS firmware
-	 * writes error code into FECS mailbox 6. This exception
-	 * is handled at GR unit.
-	 */
-	u32 mailbox_value;
-};
-
 /**
 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
 * Macros used to assign unique index to errors reported from the GPCCS unit.
@@ -268,23 +220,6 @@ struct ctxsw_err_info {
 #define GPU_PGRAPH_ILLEGAL_CLASS		(2U)
 #define GPU_PGRAPH_CLASS_ERROR			(3U)

-/**
- * This structure is used to store GR exception related information.
- */
-struct gr_exception_info {
-	/** GR engine context of the faulted channel. */
-	u32 curr_ctx;
-
-	/** Channel bound to the context. */
-	u32 chid;
-
-	/** TSG to which the channel is bound. */
-	u32 tsgid;
-
-	/** GR interrupt status. */
-	u32 status;
-};
-
 /**
 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
 * Macros used to assign unique index to errors reported from the LTC unit.
@@ -347,17 +282,6 @@ struct gr_exception_info {
 * @}
 */

-/**
- * This structure is used to store GR error related information.
- */
-struct gr_err_info {
-	/** SM machine check error information. */
-	struct gr_sm_mcerr_info *sm_mcerr_info;
-
-	/** GR exception related information. */
-	struct gr_exception_info *exception_info;
-};
-
 /**
 * This macro is used to initialize the members of nvgpu_hw_err_inject_info
 * struct.
@@ -392,6 +316,85 @@ struct nvgpu_hw_err_inject_info_desc {
 	u32 info_size;
 };

+#ifdef CONFIG_NVGPU_INTR_DEBUG
+
+/**
+ * This structure is used to store SM machine check related information.
+ */
+struct gr_sm_mcerr_info {
+	/** PC which triggered the machine check error. */
+	u64 hww_warp_esr_pc;
+
+	/** Error status register. */
+	u32 hww_warp_esr_status;
+
+	/** GR engine context of the faulted channel. */
+	u32 curr_ctx;
+
+	/** Channel to which the context belongs. */
+	u32 chid;
+
+	/** TSG to which the channel is bound. */
+	u32 tsgid;
+
+	/** IDs of TPC, GPC, and SM. */
+	u32 tpc, gpc, sm;
+};
+
+/**
+ * This structure is used to store CTXSW error related information.
+ */
+struct ctxsw_err_info {
+
+	/** GR engine context of the faulted channel. */
+	u32 curr_ctx;
+
+	/** Context-switch status register-0. */
+	u32 ctxsw_status0;
+
+	/** Context-switch status register-1. */
+	u32 ctxsw_status1;
+
+	/** Channel to which the context belongs. */
+	u32 chid;
+
+	/**
+	 * In case of any fault during context-switch transaction,
+	 * context-switch error interrupt is set and the FECS firmware
+	 * writes error code into FECS mailbox 6. This exception
+	 * is handled at GR unit.
+	 */
+	u32 mailbox_value;
+};
+
+/**
+ * This structure is used to store GR exception related information.
+ */
+struct gr_exception_info {
+	/** GR engine context of the faulted channel. */
+	u32 curr_ctx;
+
+	/** Channel bound to the context. */
+	u32 chid;
+
+	/** TSG to which the channel is bound. */
+	u32 tsgid;
+
+	/** GR interrupt status. */
+	u32 status;
+};
+
+/**
+ * This structure is used to store GR error related information.
+ */
+struct gr_err_info {
+	/** SM machine check error information. */
+	struct gr_sm_mcerr_info *sm_mcerr_info;
+
+	/** GR exception related information. */
+	struct gr_exception_info *exception_info;
+};
+
 /**
 * @brief This function provides an interface to report errors from HOST
 *        (PFIFO/PBDMA/PBUS) unit to SDL unit.
@@ -1194,17 +1197,19 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
 */
 void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
 		u32 mailbox_value);
+#endif /* CONFIG_NVGPU_INTR_DEBUG */

 /**
 * @brief This is a wrapper function to report ECC errors from a HW unit
 *        (e.g. HUBMMU, LTC) to SDL.
 *
 * @param g [in]		- The GPU driver struct.
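+ * @param hw_unit_id [in]	- ID of the HW unit reporting the error (e.g.
+ *				  NVGPU_ERR_MODULE_LTC). Used to look up whether
+ *				  the reported error is corrected or uncorrected.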
 * @param err_id [in]		- Error ID.
 *
 * Calls nvgpu_report_err_to_ss to report errors to Safety_Services.
 *
 * @return	None
 */
-void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id);
+void nvgpu_report_err_to_sdl(struct gk20a *g, u32 hw_unit_id, u32 err_id);

 #endif /* NVGPU_NVGPU_ERR_H */