gpu: nvgpu: Add ACR error reporting to SDL

- Add checks for ECC/parity errors in IMEM, DMEM, EMEM, DCLS and REG
for ACR running on the GSP engine.
The EXTIRQ3 external interrupt is set from ACR pointing towards host.
- Add a function to check the error type when ACR or Bootrom execution fails
and report it to SDL with the relevant error code.

This is part of the HSI safety requirements.

Bug 3564039
Jira NVGPU-8108

Change-Id: I65407371f7a1d1ba50a10bdf443ef6b903eeaa36
Signed-off-by: mpoojary <mpoojary@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2678100
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
mpoojary
2022-03-08 11:11:25 +00:00
committed by mobile promotions
parent 358f62a9d7
commit c1a995403a
8 changed files with 148 additions and 16 deletions

View File

@@ -33,10 +33,62 @@
#include <nvgpu/soc.h>
#include <nvgpu/riscv.h>
#include <nvgpu/io.h>
#include <nvgpu/nvgpu_err.h>
#include "acr_bootstrap.h"
#include "acr_priv.h"
/*
 * Translate an ACR failure into an SDL error report plus an error log.
 *
 * @g:          GPU driver context handed to the reporting/logging helpers.
 * @error:      ACR error code, compared against the ACR_ERROR_* values.
 * @error_type: only examined when @error is ACR_ERROR_LS_SIG_VERIF_FAIL,
 *              where it is compared against FALCON_ID_* values to select
 *              which falcon's signature failure is reported.
 *
 * Error codes (and, for signature failures, falcon ids) that are not listed
 * below produce neither an SDL report nor a log message from this function.
 */
static void acr_report_error_to_sdl(struct gk20a *g, u32 error, u32 error_type)
{
	if (error == ACR_ERROR_WDT) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_WDT_UNCORRECTED);
		nvgpu_err(g, "ACR GSP watchdog timeout");
	} else if (error == ACR_ERROR_REG_ACCESS_FAILURE) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_REG_ACCESS_TIMEOUT_UNCORRECTED);
		nvgpu_err(g, "ACR register access failure");
	} else if (error == ACR_ERROR_RISCV_EXCEPTION) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_ILLEGAL_ACCESS_UNCORRECTED);
		nvgpu_err(g, "ACR riscv exception");
	} else if (error == ACR_ERROR_LS_SIG_VERIF_FAIL) {
		/* error_type selects the falcon whose signature was rejected. */
		if (error_type == FALCON_ID_PMU_NEXT_CORE) {
			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
					GPU_GSP_ACR_LSPMU_PKC_LSSIG_FAILURE);
			nvgpu_err(g, "LSPMU pkc signature verification failed");
		} else if (error_type == FALCON_ID_FECS) {
			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
					GPU_GSP_ACR_FECS_PKC_LSSIG_FAILURE);
			nvgpu_err(g, "FECS pkc signature verification failed");
		} else if (error_type == FALCON_ID_GPCCS) {
			nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
					GPU_GSP_ACR_GPCCS_PKC_LSSIG_FAILURE);
			nvgpu_err(g, "GPCCS pkc signature verification failed");
		} else {
			/* Other falcon ids: nothing is reported from here. */
		}
	} else {
		/* Other error codes: nothing is reported from here. */
	}
}
int nvgpu_acr_wait_for_completion(struct gk20a *g, struct hs_acr *acr_desc,
u32 timeout)
{
@@ -70,8 +122,24 @@ int nvgpu_acr_wait_for_completion(struct gk20a *g, struct hs_acr *acr_desc,
}
}
/*
* When engine-falcon is used for ACR bootstrap, validate the integrity
* of falcon IMEM and DMEM.
*/
if (acr_desc->acr_validate_mem_integrity != NULL) {
if (!acr_desc->acr_validate_mem_integrity(g)) {
nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
completion = -EAGAIN;
error_type = ACR_BOOT_FAILED;
}
}
data = nvgpu_falcon_mailbox_read(acr_desc->acr_flcn, FALCON_MAILBOX_0);
if (data != 0U) {
error_type = nvgpu_falcon_mailbox_read(acr_desc->acr_flcn, FALCON_MAILBOX_1);
if (nvgpu_is_enabled(g, NVGPU_ACR_NEXT_CORE_ENABLED)) {
acr_report_error_to_sdl(g, data, error_type);
}
nvgpu_err(g, "flcn-%d: HS ucode boot failed, err %x", flcn_id,
data);
nvgpu_err(g, "flcn-%d: Mailbox-1 : 0x%x", flcn_id,
@@ -87,18 +155,6 @@ int nvgpu_acr_wait_for_completion(struct gk20a *g, struct hs_acr *acr_desc,
nvgpu_falcon_mailbox_read(acr_desc->acr_flcn,
FALCON_MAILBOX_1));
/*
* When engine-falcon is used for ACR bootstrap, validate the integrity
* of falcon IMEM and DMEM.
*/
if (acr_desc->acr_validate_mem_integrity != NULL) {
if (!acr_desc->acr_validate_mem_integrity(g)) {
nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
completion = -EAGAIN;
error_type = ACR_BOOT_FAILED;
}
}
exit:
#ifdef CONFIG_NVGPU_FALCON_NON_FUSA
@@ -392,6 +448,8 @@ int nvgpu_acr_bootstrap_hs_ucode_riscv(struct gk20a *g, struct nvgpu_acr *acr)
nvgpu_acr_dbg(g, "RISCV BROM passed");
nvgpu_riscv_dump_brom_stats(flcn);
} else {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
GPU_GSP_ACR_NVRISCV_BROM_FAILURE);
if (err == -ENOTRECOVERABLE) {
nvgpu_err(g, "RISCV BROM Failed");
} else {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -109,6 +109,14 @@ struct wpr_carveout_info;
#define ACR_COMPLETION_TIMEOUT_NON_SILICON_MS 10000U /*in msec */
#define ACR_COMPLETION_TIMEOUT_SILICON_MS 100 /*in msec */
/*
 * Error codes returned by the ACR firmware, one per failure condition.
 */
/* Watchdog timeout. */
#define ACR_ERROR_WDT			0x66U
/* Register access failure. */
#define ACR_ERROR_REG_ACCESS_FAILURE	0x1BU
/* RISC-V exception. */
#define ACR_ERROR_RISCV_EXCEPTION	0x84U
/* LS signature verification failure. */
#define ACR_ERROR_LS_SIG_VERIF_FAIL	0x0BU
struct acr_lsf_config {
u32 falcon_id;
u32 falcon_dma_idx;

View File

@@ -320,7 +320,7 @@ static void ga10b_acr_default_sw_init(struct gk20a *g, struct hs_acr *riscv_hs)
nvgpu_pmu_report_bar0_pri_err_status;
riscv_hs->acr_engine_bus_err_status =
g->ops.pmu.bar0_error_status;
riscv_hs->acr_validate_mem_integrity = g->ops.pmu.validate_mem_integrity;
riscv_hs->acr_validate_mem_integrity = g->ops.gsp.validate_mem_integrity;
}
static void ga10b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)

View File

@@ -26,6 +26,7 @@
#include <nvgpu/timers.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/bug.h>
#include <nvgpu/nvgpu_err.h>
#ifdef CONFIG_NVGPU_GSP_SCHEDULER
#include <nvgpu/gsp.h>
#include <nvgpu/string.h>
@@ -59,6 +60,63 @@ int ga10b_gsp_engine_reset(struct gk20a *g)
return 0;
}
/*
 * Inspect a GSP falcon ECC status value and report every uncorrected error
 * bit that is set.
 *
 * @g:          GPU driver context handed to the reporting/logging helpers.
 * @ecc_status: status word to test against the
 *              pgsp_falcon_ecc_status_uncorrected_err_*_m() masks.
 *
 * Each set bit yields one SDL report and one error log, in the order
 * IMEM, DMEM, DCLS, REG, EMEM.
 *
 * Returns 0 when none of the tested bits is set, -EFAULT otherwise.
 */
static int ga10b_gsp_handle_ecc(struct gk20a *g, u32 ecc_status)
{
	const u32 imem_bits = ecc_status &
		pgsp_falcon_ecc_status_uncorrected_err_imem_m();
	const u32 dmem_bits = ecc_status &
		pgsp_falcon_ecc_status_uncorrected_err_dmem_m();
	const u32 dcls_bits = ecc_status &
		pgsp_falcon_ecc_status_uncorrected_err_dcls_m();
	const u32 reg_bits = ecc_status &
		pgsp_falcon_ecc_status_uncorrected_err_reg_m();
	const u32 emem_bits = ecc_status &
		pgsp_falcon_ecc_status_uncorrected_err_emem_m();
	bool fault_seen = false;

	if (imem_bits != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_IMEM_ECC_UNCORRECTED);
		nvgpu_err(g, "imem ecc error uncorrected");
		fault_seen = true;
	}

	if (dmem_bits != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_DMEM_ECC_UNCORRECTED);
		nvgpu_err(g, "dmem ecc error uncorrected");
		fault_seen = true;
	}

	if (dcls_bits != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_DCLS_UNCORRECTED);
		nvgpu_err(g, "dcls ecc error uncorrected");
		fault_seen = true;
	}

	if (reg_bits != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_REG_ECC_UNCORRECTED);
		nvgpu_err(g, "reg ecc error uncorrected");
		fault_seen = true;
	}

	if (emem_bits != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_GSP_ACR,
				GPU_GSP_ACR_EMEM_ECC_UNCORRECTED);
		nvgpu_err(g, "emem ecc error uncorrected");
		fault_seen = true;
	}

	return fault_seen ? -EFAULT : 0;
}
/*
 * Read the GSP falcon ECC status register and hand the value to
 * ga10b_gsp_handle_ecc() for reporting.
 *
 * @g: GPU driver context used for the register read.
 *
 * Returns true when ga10b_gsp_handle_ecc() returns 0, false otherwise.
 */
bool ga10b_gsp_validate_mem_integrity(struct gk20a *g)
{
	const u32 status = nvgpu_readl(g, pgsp_falcon_ecc_status_r());
	const int err = ga10b_gsp_handle_ecc(g, status);

	return err == 0;
}
#ifdef CONFIG_NVGPU_GSP_SCHEDULER
u32 ga10b_gsp_queue_head_r(u32 i)
{

View File

@@ -26,6 +26,7 @@
u32 ga10b_gsp_falcon_base_addr(void);
u32 ga10b_gsp_falcon2_base_addr(void);
int ga10b_gsp_engine_reset(struct gk20a *g);
bool ga10b_gsp_validate_mem_integrity(struct gk20a *g);
#ifdef CONFIG_NVGPU_GSP_SCHEDULER
void ga10b_gsp_flcn_setup_boot_config(struct gk20a *g);

View File

@@ -1295,6 +1295,7 @@ static const struct gops_gsp ga10b_ops_gsp = {
.falcon_base_addr = ga10b_gsp_falcon_base_addr,
.falcon2_base_addr = ga10b_gsp_falcon2_base_addr,
.gsp_reset = ga10b_gsp_engine_reset,
.validate_mem_integrity = ga10b_gsp_validate_mem_integrity,
#ifdef CONFIG_NVGPU_GSP_SCHEDULER
/* interrupt */
.enable_irq = ga10b_gsp_enable_irq,

View File

@@ -34,6 +34,7 @@ struct gops_gsp {
u32 (*falcon2_base_addr)(void);
void (*falcon_setup_boot_config)(struct gk20a *g);
int (*gsp_reset)(struct gk20a *g);
bool (*validate_mem_integrity)(struct gk20a *g);
#ifdef CONFIG_NVGPU_GSP_SCHEDULER
u32 (*gsp_get_queue_head)(u32 i);
u32 (*gsp_get_queue_head_size)(void);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -61,7 +61,6 @@
#define pgsp_falcon2_gsp_base_r() (0x00111000U)
#define pgsp_falcon_irqsset_r() (0x00110000U)
#define pgsp_falcon_irqsclr_r() (0x00110004U)
#define pgsp_falcon_engine_r() (0x001103c0U)
#define pgsp_falcon_engine_reset_true_f() (0x1U)
#define pgsp_falcon_engine_reset_false_f() (0x0U)
@@ -167,4 +166,10 @@
#define pgsp_falcon_exterrstat_valid_m() (U32(0x1U) << 31U)
#define pgsp_falcon_exterrstat_valid_v(r) (((r) >> 31U) & 0x1U)
#define pgsp_falcon_exterrstat_valid_true_v() (0x00000001U)
/*
 * GSP falcon ECC status register offset, followed by the single-bit masks
 * for its uncorrected-error fields, listed in ascending bit position.
 */
#define pgsp_falcon_ecc_status_r()                                 (0x00110878U)
#define pgsp_falcon_ecc_status_uncorrected_err_imem_m()      (U32(0x1U) << 8U)
#define pgsp_falcon_ecc_status_uncorrected_err_dmem_m()      (U32(0x1U) << 9U)
#define pgsp_falcon_ecc_status_uncorrected_err_dcls_m()     (U32(0x1U) << 11U)
#define pgsp_falcon_ecc_status_uncorrected_err_reg_m()      (U32(0x1U) << 12U)
#define pgsp_falcon_ecc_status_uncorrected_err_emem_m()     (U32(0x1U) << 13U)
#endif