mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: add error ids for pmu, gsp
This patch does the following: - Adds error IDs for GSP ACR and GSP SCHED. - Updates error IDs for PMU. - Removes reporting of DMEM ECC_CORRECTED, since DMEM RAMs in PWR are protected only by a parity mechanism (ref: T23x_UPROC_Safety_IAS). - Removes reporting of IMEM ECC_CORRECTED, since IMEM RAMs for PROC in PWR are protected only by a parity mechanism (ref: T23x_UPROC_Safety_IAS). JIRA NVGPU-8094 Change-Id: I127e78b1aa76b552758d1fff5bc7a01b5f8f3e54 Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2677589 Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
cf43371073
commit
185dbf9192
@@ -354,29 +354,56 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "pmu",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_PMU,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 5U,
|
||||
.num_errs = 10U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("falcon_imem_ecc_corrected",
|
||||
GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
|
||||
GPU_CRITERR("pmu_nvriscv_brom_failure",
|
||||
GPU_PMU_NVRISCV_BROM_FAILURE,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("falcon_imem_ecc_uncorrected",
|
||||
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
|
||||
GPU_CRITERR("pmu_access_timeout",
|
||||
GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("falcon_dmem_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
GPU_CRITERR("pmu_mpu_ecc_uncorrected",
|
||||
GPU_PMU_MPU_ECC_UNCORRECTED,
|
||||
INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
|
||||
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
|
||||
GPU_CRITERR("pmu_illegal_access_uncorrected",
|
||||
GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("bar0_error_timeout",
|
||||
GPU_PMU_BAR0_ERROR_TIMEOUT, INJECT_SW,
|
||||
GPU_CRITERR("pmu_imem_ecc_uncorrected",
|
||||
GPU_PMU_IMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("pmu_dcls_uncorrected",
|
||||
GPU_PMU_DCLS_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("pmu_dmem_ecc_uncorrected",
|
||||
GPU_PMU_DMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("pmu_wdt_uncorrected",
|
||||
GPU_PMU_WDT_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("pmu_reg_ecc_uncorrected",
|
||||
GPU_PMU_REG_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("pmu_bar0_error_timeout",
|
||||
GPU_PMU_BAR0_ERROR_TIMEOUT,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
},
|
||||
@@ -624,6 +651,127 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
#endif
|
||||
},
|
||||
},
|
||||
{
|
||||
.name = "gsp_acr",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_GSP_ACR,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 12U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_CRITERR("gsp_acr_nvriscv_brom_failure",
|
||||
GPU_GSP_ACR_NVRISCV_BROM_FAILURE,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_emem_ecc_uncorrected",
|
||||
GPU_GSP_ACR_EMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_reg_access_timeout_uncorrected",
|
||||
GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED,
|
||||
INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_illegal_access_uncorrected",
|
||||
GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_imem_ecc_uncorrected",
|
||||
GPU_GSP_ACR_IMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_dcls_uncorrected",
|
||||
GPU_GSP_ACR_DCLS_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_dmem_ecc_uncorrected",
|
||||
GPU_GSP_ACR_DMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_wdt_uncorrected",
|
||||
GPU_GSP_ACR_WDT_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_reg_ecc_uncorrected",
|
||||
GPU_GSP_ACR_REG_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_fecs_pkc_lssig_failure",
|
||||
GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_gpccs_pkc_lssig_failure",
|
||||
GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_acr_lspmu_pkc_lssig_failure",
|
||||
GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
},
|
||||
},
|
||||
{
|
||||
.name = "gsp_sched",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_GSP_SCHED,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 9U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_CRITERR("gsp_sched_nvriscv_brom_failure",
|
||||
GPU_GSP_SCHED_NVRISCV_BROM_FAILURE,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_emem_ecc_uncorrected",
|
||||
GPU_GSP_SCHED_EMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_reg_access_timeout_uncorrected",
|
||||
GPU_GSP_SCHED_REG_ACCESS_TIMEOUT_UNCORRECTED,
|
||||
INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_illegal_access_uncorrected",
|
||||
GPU_GSP_SCHED_ILLEGAL_ACCESS_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_imem_ecc_uncorrected",
|
||||
GPU_GSP_SCHED_IMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_dcls_uncorrected",
|
||||
GPU_GSP_SCHED_DCLS_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_dmem_ecc_uncorrected",
|
||||
GPU_GSP_SCHED_DMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_wdt_uncorrected",
|
||||
GPU_GSP_SCHED_WDT_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("gpu_gsp_sched_reg_ecc_uncorrected",
|
||||
GPU_GSP_SCHED_REG_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
u32 size_of_ga10b_lut = sizeof(ga10b_err_lut) /
|
||||
|
||||
@@ -141,32 +141,21 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
|
||||
|
||||
if ((ecc_status &
|
||||
pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
|
||||
GPU_PMU_FALCON_IMEM_ECC_CORRECTED);
|
||||
nvgpu_err(g, "falcon imem ecc error corrected. "
|
||||
"ecc_addr(0x%x)", ecc_addr);
|
||||
}
|
||||
if ((ecc_status &
|
||||
pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
|
||||
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED);
|
||||
GPU_PMU_IMEM_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "falcon imem ecc error uncorrected. "
|
||||
"ecc_addr(0x%x)", ecc_addr);
|
||||
ret = -EFAULT;
|
||||
}
|
||||
if ((ecc_status &
|
||||
pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
|
||||
nvgpu_err(g, "falcon dmem ecc error corrected");
|
||||
/* This error is not expected to occur in gv11b and hence,
|
||||
* this scenario is considered as a fatal error.
|
||||
*/
|
||||
nvgpu_mutex_release(&g->pmu->isr_mutex);
|
||||
BUG();
|
||||
}
|
||||
if ((ecc_status &
|
||||
pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
|
||||
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED);
|
||||
GPU_PMU_DMEM_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "falcon dmem ecc error uncorrected. "
|
||||
"ecc_addr(0x%x)", ecc_addr);
|
||||
ret = -EFAULT;
|
||||
|
||||
@@ -53,6 +53,8 @@ struct mmu_fault_info;
|
||||
#define NVGPU_ERR_MODULE_HUBMMU (9U)
|
||||
#define NVGPU_ERR_MODULE_PRI (10U)
|
||||
#define NVGPU_ERR_MODULE_CE (11U)
|
||||
#define NVGPU_ERR_MODULE_GSP_ACR (12U)
|
||||
#define NVGPU_ERR_MODULE_GSP_SCHED (13U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
@@ -161,10 +163,55 @@ struct mmu_fault_info;
|
||||
* Macros used to assign unique index to errors reported from the PMU unit.
|
||||
* @{
|
||||
*/
|
||||
#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
|
||||
#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
|
||||
#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
|
||||
#define GPU_PMU_BAR0_ERROR_TIMEOUT (4U)
|
||||
#define GPU_PMU_NVRISCV_BROM_FAILURE (0U)
|
||||
#define GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED (1U)
|
||||
#define GPU_PMU_MPU_ECC_UNCORRECTED (2U)
|
||||
#define GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED (3U)
|
||||
#define GPU_PMU_IMEM_ECC_UNCORRECTED (4U)
|
||||
#define GPU_PMU_DCLS_UNCORRECTED (5U)
|
||||
#define GPU_PMU_DMEM_ECC_UNCORRECTED (6U)
|
||||
#define GPU_PMU_WDT_UNCORRECTED (7U)
|
||||
#define GPU_PMU_REG_ECC_UNCORRECTED (8U)
|
||||
#define GPU_PMU_BAR0_ERROR_TIMEOUT (9U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GSP_ACR
|
||||
* Macros used to assign unique index to errors reported from the GSP ACR unit.
|
||||
* @{
|
||||
*/
|
||||
#define GPU_GSP_ACR_NVRISCV_BROM_FAILURE (0U)
|
||||
#define GPU_GSP_ACR_EMEM_ECC_UNCORRECTED (1U)
|
||||
#define GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED (2U)
|
||||
#define GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED (3U)
|
||||
#define GPU_GSP_ACR_IMEM_ECC_UNCORRECTED (4U)
|
||||
#define GPU_GSP_ACR_DCLS_UNCORRECTED (5U)
|
||||
#define GPU_GSP_ACR_DMEM_ECC_UNCORRECTED (6U)
|
||||
#define GPU_GSP_ACR_WDT_UNCORRECTED (7U)
|
||||
#define GPU_GSP_ACR_REG_ECC_UNCORRECTED (8U)
|
||||
#define GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE (9U)
|
||||
#define GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE (10U)
|
||||
#define GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE (11U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GSP_SCHED
|
||||
* Macros used to assign unique index to errors reported from the GSP SCHED unit.
|
||||
* @{
|
||||
*/
|
||||
#define GPU_GSP_SCHED_NVRISCV_BROM_FAILURE (0U)
|
||||
#define GPU_GSP_SCHED_EMEM_ECC_UNCORRECTED (1U)
|
||||
#define GPU_GSP_SCHED_REG_ACCESS_TIMEOUT_UNCORRECTED (2U)
|
||||
#define GPU_GSP_SCHED_ILLEGAL_ACCESS_UNCORRECTED (3U)
|
||||
#define GPU_GSP_SCHED_IMEM_ECC_UNCORRECTED (4U)
|
||||
#define GPU_GSP_SCHED_DCLS_UNCORRECTED (5U)
|
||||
#define GPU_GSP_SCHED_DMEM_ECC_UNCORRECTED (6U)
|
||||
#define GPU_GSP_SCHED_WDT_UNCORRECTED (7U)
|
||||
#define GPU_GSP_SCHED_REG_ECC_UNCORRECTED (8U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user