gpu: nvgpu: add error ids for pmu, gsp

This patch does the following:
- Adds error IDs for GSP ACR and GSP SCHED.
- Updates error IDs for PMU.
- Removes reporting of DMEM ECC_CORRECTED, since the DMEM RAMs in PWR are
  protected only by a parity mechanism (ref: T23x_UPROC_Safety_IAS).
- Removes reporting of IMEM ECC_CORRECTED, since the IMEM RAMs for PROC in
  PWR are protected only by a parity mechanism (ref: T23x_UPROC_Safety_IAS).

JIRA NVGPU-8094

Change-Id: I127e78b1aa76b552758d1fff5bc7a01b5f8f3e54
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2677589
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Rajesh Devaraj
2022-03-07 14:36:46 +00:00
committed by mobile promotions
parent cf43371073
commit 185dbf9192
3 changed files with 212 additions and 28 deletions

View File

@@ -354,29 +354,56 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "pmu", .name = "pmu",
.hw_unit = (u32)NVGPU_ERR_MODULE_PMU, .hw_unit = (u32)NVGPU_ERR_MODULE_PMU,
.num_instances = 1U, .num_instances = 1U,
.num_errs = 5U, .num_errs = 10U,
.errs = (struct nvgpu_err_desc[]) { .errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("falcon_imem_ecc_corrected", GPU_CRITERR("pmu_nvriscv_brom_failure",
GPU_PMU_FALCON_IMEM_ECC_CORRECTED, GPU_PMU_NVRISCV_BROM_FAILURE,
INJECT_SW, INJECT_SW,
NULL, NULL, NULL, NULL,
NULL, NULL, 0, 0), NULL, NULL, 0, 0),
GPU_CRITERR("falcon_imem_ecc_uncorrected", GPU_CRITERR("pmu_access_timeout",
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED,
INJECT_SW, INJECT_SW,
NULL, NULL, NULL, NULL,
NULL, NULL, 0, 0), NULL, NULL, 0, 0),
GPU_NONCRITERR("falcon_dmem_ecc_corrected", GPU_CRITERR("pmu_mpu_ecc_uncorrected",
0, INJECT_NONE, GPU_PMU_MPU_ECC_UNCORRECTED,
INJECT_NONE,
NULL, NULL, NULL, NULL,
NULL, NULL, 0, 0), NULL, NULL, 0, 0),
GPU_CRITERR("falcon_dmem_ecc_uncorrected", GPU_CRITERR("pmu_illegal_access_uncorrected",
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED,
INJECT_SW, INJECT_SW,
NULL, NULL, NULL, NULL,
NULL, NULL, 0, 0), NULL, NULL, 0, 0),
GPU_CRITERR("bar0_error_timeout", GPU_CRITERR("pmu_imem_ecc_uncorrected",
GPU_PMU_BAR0_ERROR_TIMEOUT, INJECT_SW, GPU_PMU_IMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("pmu_dcls_uncorrected",
GPU_PMU_DCLS_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("pmu_dmem_ecc_uncorrected",
GPU_PMU_DMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("pmu_wdt_uncorrected",
GPU_PMU_WDT_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("pmu_reg_ecc_uncorrected",
GPU_PMU_REG_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("pmu_bar0_error_timeout",
GPU_PMU_BAR0_ERROR_TIMEOUT,
INJECT_SW,
NULL, NULL, NULL, NULL,
NULL, NULL, 0, 0), NULL, NULL, 0, 0),
}, },
@@ -624,6 +651,127 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
#endif #endif
}, },
}, },
{
.name = "gsp_acr",
.hw_unit = (u32)NVGPU_ERR_MODULE_GSP_ACR,
.num_instances = 1U,
.num_errs = 12U,
.errs = (struct nvgpu_err_desc[]) {
GPU_CRITERR("gsp_acr_nvriscv_brom_failure",
GPU_GSP_ACR_NVRISCV_BROM_FAILURE,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_emem_ecc_uncorrected",
GPU_GSP_ACR_EMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_reg_access_timeout_uncorrected",
GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED,
INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_illegal_access_uncorrected",
GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_imem_ecc_uncorrected",
GPU_GSP_ACR_IMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_dcls_uncorrected",
GPU_GSP_ACR_DCLS_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_dmem_ecc_uncorrected",
GPU_GSP_ACR_DMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_wdt_uncorrected",
GPU_GSP_ACR_WDT_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_reg_ecc_uncorrected",
GPU_GSP_ACR_REG_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_fecs_pkc_lssig_failure",
GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_gpccs_pkc_lssig_failure",
GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_acr_lspmu_pkc_lssig_failure",
GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
},
},
{
.name = "gsp_sched",
.hw_unit = (u32)NVGPU_ERR_MODULE_GSP_SCHED,
.num_instances = 1U,
.num_errs = 9U,
.errs = (struct nvgpu_err_desc[]) {
GPU_CRITERR("gsp_sched_nvriscv_brom_failure",
GPU_GSP_SCHED_NVRISCV_BROM_FAILURE,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_emem_ecc_uncorrected",
GPU_GSP_SCHED_EMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_reg_access_timeout_uncorrected",
GPU_GSP_SCHED_REG_ACCESS_TIMEOUT_UNCORRECTED,
INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_illegal_access_uncorrected",
GPU_GSP_SCHED_ILLEGAL_ACCESS_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_imem_ecc_uncorrected",
GPU_GSP_SCHED_IMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_dcls_uncorrected",
GPU_GSP_SCHED_DCLS_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_dmem_ecc_uncorrected",
GPU_GSP_SCHED_DMEM_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_wdt_uncorrected",
GPU_GSP_SCHED_WDT_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("gpu_gsp_sched_reg_ecc_uncorrected",
GPU_GSP_SCHED_REG_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
},
},
}; };
u32 size_of_ga10b_lut = sizeof(ga10b_err_lut) / u32 size_of_ga10b_lut = sizeof(ga10b_err_lut) /

View File

@@ -141,32 +141,21 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_FALCON_IMEM_ECC_CORRECTED);
nvgpu_err(g, "falcon imem ecc error corrected. " nvgpu_err(g, "falcon imem ecc error corrected. "
"ecc_addr(0x%x)", ecc_addr); "ecc_addr(0x%x)", ecc_addr);
} }
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED); GPU_PMU_IMEM_ECC_UNCORRECTED);
nvgpu_err(g, "falcon imem ecc error uncorrected. " nvgpu_err(g, "falcon imem ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr); "ecc_addr(0x%x)", ecc_addr);
ret = -EFAULT; ret = -EFAULT;
} }
if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
nvgpu_err(g, "falcon dmem ecc error corrected");
/* This error is not expected to occur in gv11b and hence,
* this scenario is considered as a fatal error.
*/
nvgpu_mutex_release(&g->pmu->isr_mutex);
BUG();
}
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED); GPU_PMU_DMEM_ECC_UNCORRECTED);
nvgpu_err(g, "falcon dmem ecc error uncorrected. " nvgpu_err(g, "falcon dmem ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr); "ecc_addr(0x%x)", ecc_addr);
ret = -EFAULT; ret = -EFAULT;

View File

@@ -53,6 +53,8 @@ struct mmu_fault_info;
#define NVGPU_ERR_MODULE_HUBMMU (9U) #define NVGPU_ERR_MODULE_HUBMMU (9U)
#define NVGPU_ERR_MODULE_PRI (10U) #define NVGPU_ERR_MODULE_PRI (10U)
#define NVGPU_ERR_MODULE_CE (11U) #define NVGPU_ERR_MODULE_CE (11U)
#define NVGPU_ERR_MODULE_GSP_ACR (12U)
#define NVGPU_ERR_MODULE_GSP_SCHED (13U)
/** /**
* @} * @}
*/ */
@@ -161,10 +163,55 @@ struct mmu_fault_info;
* Macros used to assign unique index to errors reported from the PMU unit. * Macros used to assign unique index to errors reported from the PMU unit.
* @{ * @{
*/ */
#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U) #define GPU_PMU_NVRISCV_BROM_FAILURE (0U)
#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U) #define GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED (1U)
#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U) #define GPU_PMU_MPU_ECC_UNCORRECTED (2U)
#define GPU_PMU_BAR0_ERROR_TIMEOUT (4U) #define GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED (3U)
#define GPU_PMU_IMEM_ECC_UNCORRECTED (4U)
#define GPU_PMU_DCLS_UNCORRECTED (5U)
#define GPU_PMU_DMEM_ECC_UNCORRECTED (6U)
#define GPU_PMU_WDT_UNCORRECTED (7U)
#define GPU_PMU_REG_ECC_UNCORRECTED (8U)
#define GPU_PMU_BAR0_ERROR_TIMEOUT (9U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GSP_ACR
* Macros used to assign unique index to errors reported from the GSP ACR unit.
* @{
*/
#define GPU_GSP_ACR_NVRISCV_BROM_FAILURE (0U)
#define GPU_GSP_ACR_EMEM_ECC_UNCORRECTED (1U)
#define GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED (2U)
#define GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED (3U)
#define GPU_GSP_ACR_IMEM_ECC_UNCORRECTED (4U)
#define GPU_GSP_ACR_DCLS_UNCORRECTED (5U)
#define GPU_GSP_ACR_DMEM_ECC_UNCORRECTED (6U)
#define GPU_GSP_ACR_WDT_UNCORRECTED (7U)
#define GPU_GSP_ACR_REG_ECC_UNCORRECTED (8U)
#define GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE (9U)
#define GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE (10U)
#define GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE (11U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GSP_SCHED
* Macros used to assign unique index to errors reported from the GSP SCHED unit.
* @{
*/
#define GPU_GSP_SCHED_NVRISCV_BROM_FAILURE (0U)
#define GPU_GSP_SCHED_EMEM_ECC_UNCORRECTED (1U)
#define GPU_GSP_SCHED_REG_ACCESS_TIMEOUT_UNCORRECTED (2U)
#define GPU_GSP_SCHED_ILLEGAL_ACCESS_UNCORRECTED (3U)
#define GPU_GSP_SCHED_IMEM_ECC_UNCORRECTED (4U)
#define GPU_GSP_SCHED_DCLS_UNCORRECTED (5U)
#define GPU_GSP_SCHED_DMEM_ECC_UNCORRECTED (6U)
#define GPU_GSP_SCHED_WDT_UNCORRECTED (7U)
#define GPU_GSP_SCHED_REG_ECC_UNCORRECTED (8U)
/** /**
* @} * @}
*/ */