From 185dbf919297c59e61066664e5b343d5e5c58d51 Mon Sep 17 00:00:00 2001 From: Rajesh Devaraj Date: Mon, 7 Mar 2022 14:36:46 +0000 Subject: [PATCH] gpu: nvgpu: add error ids for pmu, gsp This patch does the following: - Adds error IDs for GSP ACR and GSP SCHED. - Updates error IDs for PMU. - Removes reporting of DMEM ECC_CORRECTED since DMEM RAMs in PWR are protected only by a parity mechanism (ref: T23x_UPROC_Safety_IAS) - Removes reporting of IMEM ECC_CORRECTED since IMEM RAMs for PROC in PWR are protected only by a parity mechanism (ref: T23x_UPROC_Safety_IAS) JIRA NVGPU-8094 Change-Id: I127e78b1aa76b552758d1fff5bc7a01b5f8f3e54 Signed-off-by: Rajesh Devaraj Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2677589 Tested-by: mobile promotions Reviewed-by: mobile promotions --- .../gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c | 170 ++++++++++++++++-- drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c | 15 +- drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 55 +++++- 3 files changed, 212 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c index e162ba1f9..a7941b174 100644 --- a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c @@ -354,29 +354,56 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "pmu", .hw_unit = (u32)NVGPU_ERR_MODULE_PMU, .num_instances = 1U, - .num_errs = 5U, + .num_errs = 10U, .errs = (struct nvgpu_err_desc[]) { - GPU_NONCRITERR("falcon_imem_ecc_corrected", - GPU_PMU_FALCON_IMEM_ECC_CORRECTED, + GPU_CRITERR("pmu_nvriscv_brom_failure", + GPU_PMU_NVRISCV_BROM_FAILURE, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_CRITERR("falcon_imem_ecc_uncorrected", - GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, + GPU_CRITERR("pmu_access_timeout", + GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("falcon_dmem_ecc_corrected", - 0, INJECT_NONE, + GPU_CRITERR("pmu_mpu_ecc_uncorrected", + 
GPU_PMU_MPU_ECC_UNCORRECTED, + INJECT_NONE, NULL, NULL, NULL, NULL, 0, 0), - GPU_CRITERR("falcon_dmem_ecc_uncorrected", - GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, + GPU_CRITERR("pmu_illegal_access_uncorrected", + GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_CRITERR("bar0_error_timeout", - GPU_PMU_BAR0_ERROR_TIMEOUT, INJECT_SW, + GPU_CRITERR("pmu_imem_ecc_uncorrected", + GPU_PMU_IMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_dcls_uncorrected", + GPU_PMU_DCLS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_dmem_ecc_uncorrected", + GPU_PMU_DMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_wdt_uncorrected", + GPU_PMU_WDT_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_reg_ecc_uncorrected", + GPU_PMU_REG_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_bar0_error_timeout", + GPU_PMU_BAR0_ERROR_TIMEOUT, + INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), }, @@ -624,6 +651,127 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { #endif }, }, + { + .name = "gsp_acr", + .hw_unit = (u32)NVGPU_ERR_MODULE_GSP_ACR, + .num_instances = 1U, + .num_errs = 12U, + .errs = (struct nvgpu_err_desc[]) { + GPU_CRITERR("gsp_acr_nvriscv_brom_failure", + GPU_GSP_ACR_NVRISCV_BROM_FAILURE, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_emem_ecc_uncorrected", + GPU_GSP_ACR_EMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_reg_access_timeout_uncorrected", + GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_illegal_access_uncorrected", + GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_imem_ecc_uncorrected", + GPU_GSP_ACR_IMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 
0), + GPU_CRITERR("gpu_gsp_acr_dcls_uncorrected", + GPU_GSP_ACR_DCLS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_dmem_ecc_uncorrected", + GPU_GSP_ACR_DMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_wdt_uncorrected", + GPU_GSP_ACR_WDT_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_reg_ecc_uncorrected", + GPU_GSP_ACR_REG_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_fecs_pkc_lssig_failure", + GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_gpccs_pkc_lssig_failure", + GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_acr_lspmu_pkc_lssig_failure", + GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "gsp_sched", + .hw_unit = (u32)NVGPU_ERR_MODULE_GSP_SCHED, + .num_instances = 1U, + .num_errs = 9U, + .errs = (struct nvgpu_err_desc[]) { + GPU_CRITERR("gsp_sched_nvriscv_brom_failure", + GPU_GSP_SCHED_NVRISCV_BROM_FAILURE, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_emem_ecc_uncorrected", + GPU_GSP_SCHED_EMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_reg_access_timeout_uncorrected", + GPU_GSP_SCHED_REG_ACCESS_TIMEOUT_UNCORRECTED, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_illegal_access_uncorrected", + GPU_GSP_SCHED_ILLEGAL_ACCESS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_imem_ecc_uncorrected", + GPU_GSP_SCHED_IMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_dcls_uncorrected", + GPU_GSP_SCHED_DCLS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + 
GPU_CRITERR("gpu_gsp_sched_dmem_ecc_uncorrected", + GPU_GSP_SCHED_DMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_wdt_uncorrected", + GPU_GSP_SCHED_WDT_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("gpu_gsp_sched_reg_ecc_uncorrected", + GPU_GSP_SCHED_REG_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, }; u32 size_of_ga10b_lut = sizeof(ga10b_err_lut) / diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c index 51c16410a..f9dae81a7 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c @@ -141,32 +141,21 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, - GPU_PMU_FALCON_IMEM_ECC_CORRECTED); nvgpu_err(g, "falcon imem ecc error corrected. " "ecc_addr(0x%x)", ecc_addr); } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, - GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED); + GPU_PMU_IMEM_ECC_UNCORRECTED); nvgpu_err(g, "falcon imem ecc error uncorrected. " "ecc_addr(0x%x)", ecc_addr); ret = -EFAULT; } - if ((ecc_status & - pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - nvgpu_err(g, "falcon dmem ecc error corrected"); - /* This error is not expected to occur in gv11b and hence, - * this scenario is considered as a fatal error. - */ - nvgpu_mutex_release(&g->pmu->isr_mutex); - BUG(); - } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, - GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED); + GPU_PMU_DMEM_ECC_UNCORRECTED); nvgpu_err(g, "falcon dmem ecc error uncorrected. 
" "ecc_addr(0x%x)", ecc_addr); ret = -EFAULT; diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 64a8075f3..abac86409 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -53,6 +53,8 @@ struct mmu_fault_info; #define NVGPU_ERR_MODULE_HUBMMU (9U) #define NVGPU_ERR_MODULE_PRI (10U) #define NVGPU_ERR_MODULE_CE (11U) +#define NVGPU_ERR_MODULE_GSP_ACR (12U) +#define NVGPU_ERR_MODULE_GSP_SCHED (13U) /** * @} */ @@ -161,10 +163,55 @@ struct mmu_fault_info; * Macros used to assign unique index to errors reported from the PMU unit. * @{ */ -#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U) -#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U) -#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U) -#define GPU_PMU_BAR0_ERROR_TIMEOUT (4U) +#define GPU_PMU_NVRISCV_BROM_FAILURE (0U) +#define GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED (1U) +#define GPU_PMU_MPU_ECC_UNCORRECTED (2U) +#define GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED (3U) +#define GPU_PMU_IMEM_ECC_UNCORRECTED (4U) +#define GPU_PMU_DCLS_UNCORRECTED (5U) +#define GPU_PMU_DMEM_ECC_UNCORRECTED (6U) +#define GPU_PMU_WDT_UNCORRECTED (7U) +#define GPU_PMU_REG_ECC_UNCORRECTED (8U) +#define GPU_PMU_BAR0_ERROR_TIMEOUT (9U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GSP_ACR + * Macros used to assign unique index to errors reported from the GSP ACR unit. 
+ * @{ + */ +#define GPU_GSP_ACR_NVRISCV_BROM_FAILURE (0U) +#define GPU_GSP_ACR_EMEM_ECC_UNCORRECTED (1U) +#define GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED (2U) +#define GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED (3U) +#define GPU_GSP_ACR_IMEM_ECC_UNCORRECTED (4U) +#define GPU_GSP_ACR_DCLS_UNCORRECTED (5U) +#define GPU_GSP_ACR_DMEM_ECC_UNCORRECTED (6U) +#define GPU_GSP_ACR_WDT_UNCORRECTED (7U) +#define GPU_GSP_ACR_REG_ECC_UNCORRECTED (8U) +#define GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE (9U) +#define GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE (10U) +#define GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE (11U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GSP_SCHED + * Macros used to assign unique index to errors reported from the GSP SCHED unit. + * @{ + */ +#define GPU_GSP_SCHED_NVRISCV_BROM_FAILURE (0U) +#define GPU_GSP_SCHED_EMEM_ECC_UNCORRECTED (1U) +#define GPU_GSP_SCHED_REG_ACCESS_TIMEOUT_UNCORRECTED (2U) +#define GPU_GSP_SCHED_ILLEGAL_ACCESS_UNCORRECTED (3U) +#define GPU_GSP_SCHED_IMEM_ECC_UNCORRECTED (4U) +#define GPU_GSP_SCHED_DCLS_UNCORRECTED (5U) +#define GPU_GSP_SCHED_DMEM_ECC_UNCORRECTED (6U) +#define GPU_GSP_SCHED_WDT_UNCORRECTED (7U) +#define GPU_GSP_SCHED_REG_ECC_UNCORRECTED (8U) /** * @} */