gpu: nvgpu: validate PMU I/DMEM integrity at end of HS bootstrap

The HS ucode runs on the PMU with all interrupts disabled, so it cannot detect any data corruption introduced in the IMEM or DMEM by bit flips. To mitigate this issue, validate the integrity of IMEM and DMEM at the end of the HS ucode bootstrap and fail the boot in case of any uncorrected errors. Jira NVGPU-3555 Change-Id: Icd9a2bf2c29470629be8524c9b99f90e3036abdc Signed-off-by: Antony Clince Alex <aalex@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2124107 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 10:34:43 +03:00 · 2019-05-23 12:43:25 +05:30
parent 3e5fda3730
commit ce3c2a3c43
7 changed files with 35 additions and 2 deletions
--- a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
+++ b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
@@ -83,6 +83,17 @@ static int acr_wait_for_completion(struct gk20a *g,
 	nvgpu_acr_dbg(g, "flcn-%d: sctl reg %x cpuctl reg %x",
 			flcn_id, sctl, cpuctl);

+	/*
+	 * When engine-falcon is used for ACR bootstrap, validate the integrity
+	 * of falcon IMEM and DMEM.
+	 */
+	if (g->acr->acr.acr_validate_mem_integrity != NULL) {
+		if (!g->acr->acr.acr_validate_mem_integrity(g)) {
+			nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
+			completion = -EAGAIN;
+			error_type = ACR_BOOT_FAILED;
+		}
+	}
 exit:
 	if (completion != 0) {
 		if (g->acr->acr.report_acr_engine_bus_err_status != NULL) {
--- a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.h
+++ b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.h
@@ -196,6 +196,7 @@ struct hs_acr {
 		u32 bar0_status, u32 error_type);
 	int (*acr_engine_bus_err_status)(struct gk20a *g, u32 *bar0_status,
 		u32 *error_type);
+	bool (*acr_validate_mem_integrity)(struct gk20a *g);
 };

 int nvgpu_acr_bootstrap_hs_ucode(struct gk20a *g, struct nvgpu_acr *acr,
--- a/drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c
+++ b/drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c
@@ -208,6 +208,7 @@ static void gv11b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
 		nvgpu_pmu_report_bar0_pri_err_status;
 	hs_acr->acr_engine_bus_err_status =
 		g->ops.pmu.bar0_error_status;;
+	hs_acr->acr_validate_mem_integrity = g->ops.pmu.validate_mem_integrity;
 }

 void nvgpu_gv11b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -1001,6 +1001,7 @@ static const struct gpu_ops gv11b_ops = {
 		.write_dmatrfbase = gp10b_write_dmatrfbase,
 		/* ISR */
 		.pmu_enable_irq = gk20a_pmu_enable_irq,
+		.validate_mem_integrity = gv11b_pmu_validate_mem_integrity,
 #ifdef NVGPU_LS_PMU
 		.get_irqdest = gv11b_pmu_get_irqdest,
 		.handle_ext_irq = gv11b_pmu_handle_ext_irq,
--- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c
@@ -236,8 +236,10 @@ int gv11b_pmu_bootstrap(struct gk20a *g, struct nvgpu_pmu *pmu,
 	return err;
 }

-static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
+static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
 {
+	int ret = 0;
+
 	if ((ecc_status &
 	     pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
 		nvgpu_pmu_report_ecc_error(g, 0,
@@ -253,6 +255,7 @@ static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
 			ecc_addr,
 			g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
 		nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
+		ret = -EFAULT;
 	}
 	if ((ecc_status &
 	     pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
@@ -269,7 +272,10 @@ static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
 			ecc_addr,
 			g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
 		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
+		ret = -EFAULT;
 	}
+
+	return ret;
 }

 static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
@@ -338,7 +344,7 @@ static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
 	nvgpu_log(g, gpu_dbg_intr,
 		"pmu ecc interrupt intr1: 0x%x", intr1);

-	gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr);
+	(void)gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr);

 	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
 		nvgpu_info(g, "ecc counter overflow!");
@@ -354,6 +360,17 @@ static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
 		g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
 }

+bool gv11b_pmu_validate_mem_integrity(struct gk20a *g)
+{
+	u32 ecc_status, ecc_addr;
+
+	ecc_status = nvgpu_readl(g, pwr_pmu_falcon_ecc_status_r());
+	ecc_addr = nvgpu_readl(g, pwr_pmu_falcon_ecc_address_r());
+
+	return ((gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr) == 0) ? true :
+			false);
+}
+
 void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
 {
 	/*
--- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h
@@ -34,5 +34,6 @@ void gv11b_pmu_setup_elpg(struct gk20a *g);
 u32 gv11b_pmu_get_irqdest(struct gk20a *g);
 void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0);
 void gv11b_setup_apertures(struct gk20a *g);
+bool gv11b_pmu_validate_mem_integrity(struct gk20a *g);

 #endif /* PMU_GV11B_H */
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -1386,6 +1386,7 @@ struct gpu_ops {
 		void (*set_irqmask)(struct gk20a *g);
 		u32 (*get_irqdest)(struct gk20a *g);
 		void (*pmu_enable_irq)(struct nvgpu_pmu *pmu, bool enable);
+		bool (*validate_mem_integrity)(struct gk20a *g);
 		void (*handle_ext_irq)(struct gk20a *g, u32 intr);
 		/* perfmon */
 		void (*pmu_init_perfmon_counter)(struct gk20a *g);