gpu: nvgpu: validate PMU I/DMEM integrity at end of HS bootstrap

The HS ucode runs on the PMU with all interrupts disabled, so it is not
able to detect any data corruption introduced in the IMEM or DMEM by bit
flips. To mitigate this issue, validate the integrity of IMEM and DMEM
at the end of HS ucode bootstrap and fail the boot in case of any uncorrected
errors.

Jira NVGPU-3555

Change-Id: Icd9a2bf2c29470629be8524c9b99f90e3036abdc
Signed-off-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2124107
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Antony Clince Alex
2019-05-23 12:43:25 +05:30
committed by mobile promotions
parent 3e5fda3730
commit ce3c2a3c43
7 changed files with 35 additions and 2 deletions

View File

@@ -83,6 +83,17 @@ static int acr_wait_for_completion(struct gk20a *g,
nvgpu_acr_dbg(g, "flcn-%d: sctl reg %x cpuctl reg %x",
flcn_id, sctl, cpuctl);
/*
* When engine-falcon is used for ACR bootstrap, validate the integrity
* of falcon IMEM and DMEM.
*/
if (g->acr->acr.acr_validate_mem_integrity != NULL) {
if (!g->acr->acr.acr_validate_mem_integrity(g)) {
nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
completion = -EAGAIN;
error_type = ACR_BOOT_FAILED;
}
}
exit:
if (completion != 0) {
if (g->acr->acr.report_acr_engine_bus_err_status != NULL) {

View File

@@ -196,6 +196,7 @@ struct hs_acr {
u32 bar0_status, u32 error_type);
int (*acr_engine_bus_err_status)(struct gk20a *g, u32 *bar0_status,
u32 *error_type);
bool (*acr_validate_mem_integrity)(struct gk20a *g);
};
int nvgpu_acr_bootstrap_hs_ucode(struct gk20a *g, struct nvgpu_acr *acr,

View File

@@ -208,6 +208,7 @@ static void gv11b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
nvgpu_pmu_report_bar0_pri_err_status;
hs_acr->acr_engine_bus_err_status =
g->ops.pmu.bar0_error_status;;
hs_acr->acr_validate_mem_integrity = g->ops.pmu.validate_mem_integrity;
}
void nvgpu_gv11b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)

View File

@@ -1001,6 +1001,7 @@ static const struct gpu_ops gv11b_ops = {
.write_dmatrfbase = gp10b_write_dmatrfbase,
/* ISR */
.pmu_enable_irq = gk20a_pmu_enable_irq,
.validate_mem_integrity = gv11b_pmu_validate_mem_integrity,
#ifdef NVGPU_LS_PMU
.get_irqdest = gv11b_pmu_get_irqdest,
.handle_ext_irq = gv11b_pmu_handle_ext_irq,

View File

@@ -236,8 +236,10 @@ int gv11b_pmu_bootstrap(struct gk20a *g, struct nvgpu_pmu *pmu,
return err;
}
static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
{
int ret = 0;
if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_pmu_report_ecc_error(g, 0,
@@ -253,6 +255,7 @@ static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
ret = -EFAULT;
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
@@ -269,7 +272,10 @@ static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
ret = -EFAULT;
}
return ret;
}
static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
@@ -338,7 +344,7 @@ static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
nvgpu_log(g, gpu_dbg_intr,
"pmu ecc interrupt intr1: 0x%x", intr1);
gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr);
(void)gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr);
if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
nvgpu_info(g, "ecc counter overflow!");
@@ -354,6 +360,17 @@ static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
}
/*
 * Check PMU falcon IMEM/DMEM integrity using the falcon ECC registers.
 *
 * Reads the falcon ECC status register and then the ECC address register,
 * and hands both values to gv11b_pmu_correct_ecc().
 *
 * Returns true when gv11b_pmu_correct_ecc() returns 0, false otherwise.
 */
bool gv11b_pmu_validate_mem_integrity(struct gk20a *g)
{
	u32 status;
	u32 addr;
	int err;

	/* Keep the original register read order: status first, then address. */
	status = nvgpu_readl(g, pwr_pmu_falcon_ecc_status_r());
	addr = nvgpu_readl(g, pwr_pmu_falcon_ecc_address_r());

	err = gv11b_pmu_correct_ecc(g, status, addr);

	return err == 0;
}
void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
{
/*

View File

@@ -34,5 +34,6 @@ void gv11b_pmu_setup_elpg(struct gk20a *g);
u32 gv11b_pmu_get_irqdest(struct gk20a *g);
void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0);
void gv11b_setup_apertures(struct gk20a *g);
bool gv11b_pmu_validate_mem_integrity(struct gk20a *g);
#endif /* PMU_GV11B_H */

View File

@@ -1386,6 +1386,7 @@ struct gpu_ops {
void (*set_irqmask)(struct gk20a *g);
u32 (*get_irqdest)(struct gk20a *g);
void (*pmu_enable_irq)(struct nvgpu_pmu *pmu, bool enable);
bool (*validate_mem_integrity)(struct gk20a *g);
void (*handle_ext_irq)(struct gk20a *g, u32 intr);
/* perfmon */
void (*pmu_init_perfmon_counter)(struct gk20a *g);