diff --git a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
index 01becd067..9707a2da6 100644
--- a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
+++ b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
@@ -33,10 +33,70 @@
 #include
 #include
 #include
+#include
 
 #include "acr_bootstrap.h"
 #include "acr_priv.h"
 
+/*
+ * Report an ACR firmware boot failure to SDL and log it.
+ *
+ * @error is matched against the ACR_ERROR_* codes. For
+ * ACR_ERROR_LS_SIG_VERIF_FAIL, @error_type is matched against the
+ * FALCON_ID_* values handled below to pick the per-falcon SDL error.
+ * Codes or falcon ids without a case are neither reported nor logged here.
+ */
+static void acr_report_error_to_sdl(struct gk20a *g, u32 error, u32 error_type)
+{
+	switch (error) {
+	case ACR_ERROR_WDT:
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_WDT_UNCORRECTED);
+		nvgpu_err(g, "ACR GSP watchdog timeout");
+		break;
+
+	case ACR_ERROR_REG_ACCESS_FAILURE:
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED);
+		nvgpu_err(g, "ACR register access failure");
+		break;
+
+	case ACR_ERROR_RISCV_EXCEPTION:
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED);
+		nvgpu_err(g, "ACR riscv exception");
+		break;
+
+	case ACR_ERROR_LS_SIG_VERIF_FAIL:
+		switch (error_type) {
+		case FALCON_ID_PMU_NEXT_CORE:
+			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+					GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE);
+			nvgpu_err(g, "LSPMU pkc signature verification failed");
+			break;
+
+		case FALCON_ID_FECS:
+			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+					GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE);
+			nvgpu_err(g, "FECS pkc signature verification failed");
+			break;
+
+		case FALCON_ID_GPCCS:
+			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+					GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE);
+			nvgpu_err(g, "GPCCS pkc signature verification failed");
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
 int nvgpu_acr_wait_for_completion(struct gk20a *g,
 	struct hs_acr *acr_desc, u32 timeout)
 {
@@ -70,8 +130,24 @@ int nvgpu_acr_wait_for_completion(struct gk20a *g, struct hs_acr *acr_desc,
 		}
 	}
 
+	/*
+	 * When engine-falcon is used for ACR bootstrap, validate the integrity
+	 * of falcon IMEM and DMEM.
+	 */
+	if (acr_desc->acr_validate_mem_integrity != NULL) {
+		if (!acr_desc->acr_validate_mem_integrity(g)) {
+			nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
+			completion = -EAGAIN;
+			error_type = ACR_BOOT_FAILED;
+		}
+	}
+
 	data = nvgpu_falcon_mailbox_read(acr_desc->acr_flcn, FALCON_MAILBOX_0);
 	if (data != 0U) {
+		error_type = nvgpu_falcon_mailbox_read(acr_desc->acr_flcn, FALCON_MAILBOX_1);
+		if (nvgpu_is_enabled(g, NVGPU_ACR_NEXT_CORE_ENABLED)) {
+			acr_report_error_to_sdl(g, data, error_type);
+		}
 		nvgpu_err(g, "flcn-%d: HS ucode boot failed, err %x", flcn_id,
 			data);
 		nvgpu_err(g, "flcn-%d: Mailbox-1 : 0x%x", flcn_id,
@@ -87,18 +163,6 @@ int nvgpu_acr_wait_for_completion(struct gk20a *g, struct hs_acr *acr_desc,
 		nvgpu_falcon_mailbox_read(acr_desc->acr_flcn,
 			FALCON_MAILBOX_1));
 
-	/*
-	 * When engine-falcon is used for ACR bootstrap, validate the integrity
-	 * of falcon IMEM and DMEM.
-	 */
-	if (acr_desc->acr_validate_mem_integrity != NULL) {
-		if (!acr_desc->acr_validate_mem_integrity(g)) {
-			nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
-			completion = -EAGAIN;
-			error_type = ACR_BOOT_FAILED;
-		}
-	}
-
 exit:
 
 #ifdef CONFIG_NVGPU_FALCON_NON_FUSA
@@ -392,6 +456,8 @@ int nvgpu_acr_bootstrap_hs_ucode_riscv(struct gk20a *g, struct nvgpu_acr *acr)
 		nvgpu_acr_dbg(g, "RISCV BROM passed");
 		nvgpu_riscv_dump_brom_stats(flcn);
 	} else {
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_NVRISCV_BROM_FAILURE);
 		if (err == -ENOTRECOVERABLE) {
 			nvgpu_err(g, "RISCV BROM Failed");
 		} else {
diff --git a/drivers/gpu/nvgpu/common/acr/acr_priv.h b/drivers/gpu/nvgpu/common/acr/acr_priv.h
index 0af7ab208..a454830d6 100644
--- a/drivers/gpu/nvgpu/common/acr/acr_priv.h
+++ b/drivers/gpu/nvgpu/common/acr/acr_priv.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -109,6 +109,14 @@ struct wpr_carveout_info;
 #define ACR_COMPLETION_TIMEOUT_NON_SILICON_MS 10000U /*in msec */
 #define ACR_COMPLETION_TIMEOUT_SILICON_MS 100 /*in msec */
 
+/*
+ * Error codes the ACR firmware returns for the failure named by each macro.
+ */
+#define ACR_ERROR_WDT 0x66U
+#define ACR_ERROR_REG_ACCESS_FAILURE 0x1BU
+#define ACR_ERROR_RISCV_EXCEPTION 0x84U
+#define ACR_ERROR_LS_SIG_VERIF_FAIL 0x0BU
+
 struct acr_lsf_config {
 	u32 falcon_id;
 	u32 falcon_dma_idx;
diff --git a/drivers/gpu/nvgpu/common/acr/acr_sw_ga10b.c b/drivers/gpu/nvgpu/common/acr/acr_sw_ga10b.c
index 033655ce1..f4dde171b 100644
--- a/drivers/gpu/nvgpu/common/acr/acr_sw_ga10b.c
+++ b/drivers/gpu/nvgpu/common/acr/acr_sw_ga10b.c
@@ -320,7 +320,7 @@ static void ga10b_acr_default_sw_init(struct gk20a *g, struct hs_acr *riscv_hs)
 		nvgpu_pmu_report_bar0_pri_err_status;
 	riscv_hs->acr_engine_bus_err_status =
 		g->ops.pmu.bar0_error_status;
-	riscv_hs->acr_validate_mem_integrity = g->ops.pmu.validate_mem_integrity;
+	riscv_hs->acr_validate_mem_integrity = g->ops.gsp.validate_mem_integrity;
 }
 
 static void ga10b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)
diff --git a/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.c b/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.c
index f073ca843..7f2229db6 100644
--- a/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.c
+++ b/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 #ifdef CONFIG_NVGPU_GSP_SCHEDULER
 #include
 #include
@@ -59,6 +60,72 @@ int ga10b_gsp_engine_reset(struct gk20a *g)
 	return 0;
 }
 
+/*
+ * Report and log every uncorrected ECC error flagged in @ecc_status.
+ *
+ * Checks the IMEM, DMEM, DCLS, REG and EMEM uncorrected-error bits.
+ * Returns 0 when none of them is set, -EFAULT otherwise.
+ */
+static int ga10b_gsp_handle_ecc(struct gk20a *g, u32 ecc_status)
+{
+	int ret = 0;
+
+	if ((ecc_status &
+		pgsp_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_IMEM_ECC_UNCORRECTED);
+		nvgpu_err(g, "imem ecc error uncorrected");
+		ret = -EFAULT;
+	}
+
+	if ((ecc_status &
+		pgsp_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_DMEM_ECC_UNCORRECTED);
+		nvgpu_err(g, "dmem ecc error uncorrected");
+		ret = -EFAULT;
+	}
+
+	if ((ecc_status &
+		pgsp_falcon_ecc_status_uncorrected_err_dcls_m()) != 0U) {
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_DCLS_UNCORRECTED);
+		nvgpu_err(g, "dcls ecc error uncorrected");
+		ret = -EFAULT;
+	}
+
+	if ((ecc_status &
+		pgsp_falcon_ecc_status_uncorrected_err_reg_m()) != 0U) {
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_REG_ECC_UNCORRECTED);
+		nvgpu_err(g, "reg ecc error uncorrected");
+		ret = -EFAULT;
+	}
+
+	if ((ecc_status &
+		pgsp_falcon_ecc_status_uncorrected_err_emem_m()) != 0U) {
+		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
+				GPU_GSP_ACR_EMEM_ECC_UNCORRECTED);
+		nvgpu_err(g, "emem ecc error uncorrected");
+		ret = -EFAULT;
+	}
+
+	return ret;
+}
+
+/*
+ * Read the GSP falcon ECC status register and return true when it flags
+ * no uncorrected error, false otherwise.
+ */
+bool ga10b_gsp_validate_mem_integrity(struct gk20a *g)
+{
+	u32 ecc_status;
+
+	ecc_status = nvgpu_readl(g, pgsp_falcon_ecc_status_r());
+
+	return ga10b_gsp_handle_ecc(g, ecc_status) == 0;
+}
+
 #ifdef CONFIG_NVGPU_GSP_SCHEDULER
 u32 ga10b_gsp_queue_head_r(u32 i)
 {
diff --git a/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.h b/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.h
index 08563bba9..311babfff 100644
--- a/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.h
+++ b/drivers/gpu/nvgpu/hal/gsp/gsp_ga10b.h
@@ -26,6 +26,7 @@
 u32 ga10b_gsp_falcon_base_addr(void);
 u32 ga10b_gsp_falcon2_base_addr(void);
 int ga10b_gsp_engine_reset(struct gk20a *g);
+bool ga10b_gsp_validate_mem_integrity(struct gk20a *g);
 
 #ifdef CONFIG_NVGPU_GSP_SCHEDULER
 void ga10b_gsp_flcn_setup_boot_config(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c
index 816380156..3113e0977 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c
@@ -1295,6 +1295,7 @@ static const struct gops_gsp ga10b_ops_gsp = {
 	.falcon_base_addr = ga10b_gsp_falcon_base_addr,
 	.falcon2_base_addr = ga10b_gsp_falcon2_base_addr,
 	.gsp_reset = ga10b_gsp_engine_reset,
+	.validate_mem_integrity = ga10b_gsp_validate_mem_integrity,
 #ifdef CONFIG_NVGPU_GSP_SCHEDULER
 	/* interrupt */
 	.enable_irq = ga10b_gsp_enable_irq,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/gsp.h b/drivers/gpu/nvgpu/include/nvgpu/gops/gsp.h
index 45a664f21..0a16f18de 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gops/gsp.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops/gsp.h
@@ -34,6 +34,7 @@ struct gops_gsp {
 	u32 (*falcon2_base_addr)(void);
 	void (*falcon_setup_boot_config)(struct gk20a *g);
 	int (*gsp_reset)(struct gk20a *g);
+	bool (*validate_mem_integrity)(struct gk20a *g);
 #ifdef CONFIG_NVGPU_GSP_SCHEDULER
 	u32 (*gsp_get_queue_head)(u32 i);
 	u32 (*gsp_get_queue_head_size)(void);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pgsp_ga10b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pgsp_ga10b.h
index 7339ed7b3..a535ab0fe 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pgsp_ga10b.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pgsp_ga10b.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -61,7 +61,6 @@
 
 #define pgsp_falcon2_gsp_base_r() (0x00111000U)
 #define pgsp_falcon_irqsset_r() (0x00110000U)
-#define pgsp_falcon_irqsclr_r() (0x00110004U)
 #define pgsp_falcon_engine_r() (0x001103c0U)
 #define pgsp_falcon_engine_reset_true_f() (0x1U)
 #define pgsp_falcon_engine_reset_false_f() (0x0U)
@@ -167,4 +166,10 @@
 #define pgsp_falcon_exterrstat_valid_m() (U32(0x1U) << 31U)
 #define pgsp_falcon_exterrstat_valid_v(r) (((r) >> 31U) & 0x1U)
 #define pgsp_falcon_exterrstat_valid_true_v() (0x00000001U)
+#define pgsp_falcon_ecc_status_r() (0x00110878U)
+#define pgsp_falcon_ecc_status_uncorrected_err_imem_m() (U32(0x1U) << 8U)
+#define pgsp_falcon_ecc_status_uncorrected_err_dmem_m() (U32(0x1U) << 9U)
+#define pgsp_falcon_ecc_status_uncorrected_err_emem_m() (U32(0x1U) << 13U)
+#define pgsp_falcon_ecc_status_uncorrected_err_dcls_m() (U32(0x1U) << 11U)
+#define pgsp_falcon_ecc_status_uncorrected_err_reg_m() (U32(0x1U) << 12U)
 #endif