gpu: nvgpu: compile-out unused apis from safety build

This patch does the following changes:
- Compiles-out unused error reporting APIs and the related
  data structures from safety build. For this purpose, it
  introduces the new flag: CONFIG_NVGPU_INTR_DEBUG
- Updates nvgpu_report_err_to_sdl() API with one more argument,
  hw_unit_id. This aids in finding whether an error to be reported
  is corrected or uncorrected from LUT.
- Triggers SW quiesce, if an uncorrected error is reported to
  Safety_Services, in safety build.
- Renames files in cic folder by replacing gv11b with ga10b,
  since error reporting for gv11b is not supported in dev-main.

JIRA NVGPU-8002

Change-Id: Ic01e73b0208252abba1f615a2c98d770cdf41ca4
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2668466
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
Rajesh Devaraj
2022-02-11 05:59:52 +00:00
committed by mobile promotions
parent 81c220b95b
commit 0699220b85
49 changed files with 456 additions and 316 deletions

View File

@@ -128,6 +128,7 @@ struct gops_ltc_intr {
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC
* -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED"
* -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is
@@ -142,6 +143,7 @@ struct gops_ltc_intr {
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC
* -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED"
* -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is
@@ -157,6 +159,7 @@ struct gops_ltc_intr {
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC
* -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED
* "GPU_LTC_CACHE_DSTG_ECC_CORRECTED"
* -# Flush the L2 cache by calling
@@ -173,6 +176,7 @@ struct gops_ltc_intr {
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC
* -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED"
* -# Else if the ECC address corresponds to DSTG BE RAM:
@@ -182,6 +186,7 @@ struct gops_ltc_intr {
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC
* -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
* "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED"
* -# Else call \ref BUG "BUG()" as this type of ECC error is not supported.

View File

@@ -108,28 +108,6 @@ struct mmu_fault_info;
* @}
*/
/**
* This structure is used to store SM machine check related information.
*
* It is referenced through the sm_mcerr_info member of struct gr_err_info.
*/
struct gr_sm_mcerr_info {
/** PC which triggered the machine check error. */
u64 hww_warp_esr_pc;
/** Error status register. */
u32 hww_warp_esr_status;
/** GR engine context of the faulted channel. */
u32 curr_ctx;
/** Channel to which the context belongs. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
/**
* IDs of TPC, GPC, and SM.
* NOTE(review): presumably these identify where the machine check was
* raised - confirm against the code that fills this structure.
*/
u32 tpc, gpc, sm;
};
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
@@ -147,32 +125,6 @@ struct gr_sm_mcerr_info {
* @}
*/
/**
* This structure is used to store CTXSW error related information.
*
* NOTE(review): gr_intr_report_ctxsw_error() takes chid and mailbox_value
* arguments with the same names as two of the members below; presumably it
* fills this structure - confirm against the implementation.
*/
struct ctxsw_err_info {
/** GR engine context of the faulted channel. */
u32 curr_ctx;
/** Context-switch status register-0. */
u32 ctxsw_status0;
/** Context-switch status register-1. */
u32 ctxsw_status1;
/** Channel to which the context belongs. */
u32 chid;
/**
* In case of any fault during context-switch transaction,
* context-switch error interrupt is set and the FECS firmware
* writes error code into FECS mailbox 6. This exception
* is handled at GR unit.
*/
u32 mailbox_value;
};
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
* Macros used to assign unique index to errors reported from the GPCCS unit.
@@ -268,23 +220,6 @@ struct ctxsw_err_info {
#define GPU_PGRAPH_ILLEGAL_CLASS (2U)
#define GPU_PGRAPH_CLASS_ERROR (3U)
/**
* This structure is used to store GR exception related information.
*
* It is referenced through the exception_info member of struct gr_err_info.
*/
struct gr_exception_info {
/** GR engine context of the faulted channel. */
u32 curr_ctx;
/** Channel bound to the context. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
/** GR interrupt status. */
u32 status;
};
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
* Macros used to assign unique index to errors reported from the LTC unit.
@@ -347,17 +282,6 @@ struct gr_exception_info {
* @}
*/
/**
* This structure is used to store GR error related information.
*
* It only holds pointers to the detailed records; the records themselves are
* struct gr_sm_mcerr_info and struct gr_exception_info.
* NOTE(review): ownership of the pointed-to records and whether either
* pointer may be NULL is not visible here - confirm against the callers.
*/
struct gr_err_info {
/** SM machine check error information. */
struct gr_sm_mcerr_info *sm_mcerr_info;
/** GR exception related information. */
struct gr_exception_info *exception_info;
};
/**
* This macro is used to initialize the members of nvgpu_hw_err_inject_info
* struct.
@@ -392,6 +316,85 @@ struct nvgpu_hw_err_inject_info_desc {
u32 info_size;
};
#ifdef CONFIG_NVGPU_INTR_DEBUG
/**
* This structure is used to store SM machine check related information.
*
* It is declared only when CONFIG_NVGPU_INTR_DEBUG is defined, and is
* referenced through the sm_mcerr_info member of struct gr_err_info.
*/
struct gr_sm_mcerr_info {
/** PC which triggered the machine check error. */
u64 hww_warp_esr_pc;
/** Error status register. */
u32 hww_warp_esr_status;
/** GR engine context of the faulted channel. */
u32 curr_ctx;
/** Channel to which the context belongs. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
/**
* IDs of TPC, GPC, and SM.
* NOTE(review): presumably these identify where the machine check was
* raised - confirm against the code that fills this structure.
*/
u32 tpc, gpc, sm;
};
/**
* This structure is used to store CTXSW error related information.
*
* It is declared only when CONFIG_NVGPU_INTR_DEBUG is defined.
* NOTE(review): gr_intr_report_ctxsw_error() takes chid and mailbox_value
* arguments with the same names as two of the members below; presumably it
* fills this structure - confirm against the implementation.
*/
struct ctxsw_err_info {
/** GR engine context of the faulted channel. */
u32 curr_ctx;
/** Context-switch status register-0. */
u32 ctxsw_status0;
/** Context-switch status register-1. */
u32 ctxsw_status1;
/** Channel to which the context belongs. */
u32 chid;
/**
* In case of any fault during context-switch transaction,
* context-switch error interrupt is set and the FECS firmware
* writes error code into FECS mailbox 6. This exception
* is handled at GR unit.
*/
u32 mailbox_value;
};
/**
* This structure is used to store GR exception related information.
*
* It is declared only when CONFIG_NVGPU_INTR_DEBUG is defined, and is
* referenced through the exception_info member of struct gr_err_info.
*/
struct gr_exception_info {
/** GR engine context of the faulted channel. */
u32 curr_ctx;
/** Channel bound to the context. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
/** GR interrupt status. */
u32 status;
};
/**
* This structure is used to store GR error related information.
*
* It is declared only when CONFIG_NVGPU_INTR_DEBUG is defined. It only holds
* pointers to the detailed records; the records themselves are
* struct gr_sm_mcerr_info and struct gr_exception_info.
* NOTE(review): ownership of the pointed-to records and whether either
* pointer may be NULL is not visible here - confirm against the callers.
*/
struct gr_err_info {
/** SM machine check error information. */
struct gr_sm_mcerr_info *sm_mcerr_info;
/** GR exception related information. */
struct gr_exception_info *exception_info;
};
/**
* @brief This function provides an interface to report errors from HOST
* (PFIFO/PBDMA/PBUS) unit to SDL unit.
@@ -1194,17 +1197,19 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
*/
void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value);
#endif /* CONFIG_NVGPU_INTR_DEBUG */
/**
* @brief This is a wrapper function to report errors from a HW unit to SDL.
*
* It is not specific to HUBMMU: the LTC interrupt documentation, for example,
* describes calling it with NVGPU_ERR_MODULE_LTC as the HW unit.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit_id [in] - HW Unit ID (e.g. NVGPU_ERR_MODULE_LTC). Per the
*                          change description, this aids in finding from the
*                          LUT whether the reported error is corrected or
*                          uncorrected.
* @param err_id [in] - Error ID (e.g. GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED).
*
* Calls nvgpu_report_err_to_ss to report errors to Safety_Services.
*
* NOTE(review): the change description states that, in safety build, SW
* quiesce is triggered when an uncorrected error is reported to
* Safety_Services; the implementation is not visible here - confirm.
*
* @return None
*/
void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id);
void nvgpu_report_err_to_sdl(struct gk20a *g, u32 hw_unit_id, u32 err_id);
#endif /* NVGPU_NVGPU_ERR_H */