From ce3c2a3c43463715d940daddb475bd7ea156ba50 Mon Sep 17 00:00:00 2001
From: Antony Clince Alex <aalex@nvidia.com>
Date: Thu, 23 May 2019 12:43:25 +0530
Subject: [PATCH] gpu: nvgpu: validate PMU I/DMEM integrity at end of HS
 bootstrap

The HS ucode runs on the PMU with all interrupts disabled, so it is not
able to detect data corruption introduced in IMEM or DMEM by bit flips.
To mitigate this, validate the integrity of IMEM and DMEM at the end of
the HS ucode bootstrap and fail the boot in case of any uncorrected
errors.

Jira NVGPU-3555

Change-Id: Icd9a2bf2c29470629be8524c9b99f90e3036abdc
Signed-off-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2124107
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/acr/acr_bootstrap.c | 11 ++++++++++
 drivers/gpu/nvgpu/common/acr/acr_bootstrap.h |  1 +
 drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c  |  1 +
 drivers/gpu/nvgpu/hal/init/hal_gv11b.c       |  1 +
 drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c        | 21 ++++++++++++++++++--
 drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h        |  1 +
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h      |  1 +
 7 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
index afdb522f3..35ebb6afd 100644
--- a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
+++ b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c
@@ -83,6 +83,17 @@ static int acr_wait_for_completion(struct gk20a *g,
 	nvgpu_acr_dbg(g, "flcn-%d: sctl reg %x cpuctl reg %x",
 			flcn_id, sctl, cpuctl);
 
+	/*
+	 * When engine-falcon is used for ACR bootstrap, validate the integrity
+	 * of falcon IMEM and DMEM.
+	 */
+	if (g->acr->acr.acr_validate_mem_integrity != NULL) {
+		if (!g->acr->acr.acr_validate_mem_integrity(g)) {
+			nvgpu_err(g, "flcn-%d: memcheck failed", flcn_id);
+			completion = -EAGAIN;
+			error_type = ACR_BOOT_FAILED;
+		}
+	}
 exit:
 	if (completion != 0) {
 		if (g->acr->acr.report_acr_engine_bus_err_status != NULL) {
diff --git a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.h b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.h
index f39290ea5..a577ef23a 100644
--- a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.h
+++ b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.h
@@ -196,6 +196,7 @@ struct hs_acr {
 		u32 bar0_status, u32 error_type);
 	int (*acr_engine_bus_err_status)(struct gk20a *g, u32 *bar0_status,
 		u32 *error_type);
+	bool (*acr_validate_mem_integrity)(struct gk20a *g);
 };
 
 int nvgpu_acr_bootstrap_hs_ucode(struct gk20a *g, struct nvgpu_acr *acr,
diff --git a/drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c b/drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c
index f2e65487f..e7c67ba43 100644
--- a/drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c
+++ b/drivers/gpu/nvgpu/common/acr/acr_sw_gv11b.c
@@ -208,6 +208,7 @@ static void gv11b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
 		nvgpu_pmu_report_bar0_pri_err_status;
 	hs_acr->acr_engine_bus_err_status =
 		g->ops.pmu.bar0_error_status;;
+	hs_acr->acr_validate_mem_integrity = g->ops.pmu.validate_mem_integrity;
 }
 
 void nvgpu_gv11b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
index 9bf517d05..97784fad8 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -1001,6 +1001,7 @@ static const struct gpu_ops gv11b_ops = {
 		.write_dmatrfbase = gp10b_write_dmatrfbase,
 		/* ISR */
 		.pmu_enable_irq = gk20a_pmu_enable_irq,
+		.validate_mem_integrity = gv11b_pmu_validate_mem_integrity,
 #ifdef NVGPU_LS_PMU
 		.get_irqdest = gv11b_pmu_get_irqdest,
 		.handle_ext_irq = gv11b_pmu_handle_ext_irq,
diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c
index 3e9892104..ec1126654 100644
--- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c
@@ -236,8 +236,10 @@ int gv11b_pmu_bootstrap(struct gk20a *g, struct nvgpu_pmu *pmu,
 	return err;
 }
 
-static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
+static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
 {
+	int ret = 0;
+
 	if ((ecc_status &
 	     pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
 		nvgpu_pmu_report_ecc_error(g, 0,
@@ -253,6 +255,7 @@ static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
 			ecc_addr,
 			g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
 		nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
+		ret = -EFAULT;
 	}
 	if ((ecc_status &
 	     pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
@@ -269,7 +272,10 @@ static void gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
 			ecc_addr,
 			g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
 		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
+		ret = -EFAULT;
 	}
+
+	return ret;
 }
 
 static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
@@ -338,7 +344,7 @@ static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
 	nvgpu_log(g, gpu_dbg_intr,
 		"pmu ecc interrupt intr1: 0x%x", intr1);
 
-	gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr);
+	(void)gv11b_pmu_correct_ecc(g, ecc_status, ecc_addr);
 
 	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
 		nvgpu_info(g, "ecc counter overflow!");
@@ -354,6 +360,17 @@ static void gv11b_pmu_handle_ecc_irq(struct gk20a *g)
 		g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
 }
 
+bool gv11b_pmu_validate_mem_integrity(struct gk20a *g)
+{
+	/* Read the PMU falcon ECC status first, then the ECC address. */
+	u32 status = nvgpu_readl(g, pwr_pmu_falcon_ecc_status_r());
+	u32 addr = nvgpu_readl(g, pwr_pmu_falcon_ecc_address_r());
+	int err = gv11b_pmu_correct_ecc(g, status, addr);
+
+	/* true only if no uncorrected IMEM/DMEM error (-EFAULT) was seen. */
+	return (err == 0);
+}
+
 void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
 {
 	/*
diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h
index b938fa2e5..8cf52d7dd 100644
--- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h
@@ -34,5 +34,6 @@ void gv11b_pmu_setup_elpg(struct gk20a *g);
 u32 gv11b_pmu_get_irqdest(struct gk20a *g);
 void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0);
 void gv11b_setup_apertures(struct gk20a *g);
+bool gv11b_pmu_validate_mem_integrity(struct gk20a *g);
 
 #endif /* PMU_GV11B_H */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 180c26e0b..de089f288 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -1386,6 +1386,7 @@ struct gpu_ops {
 		void (*set_irqmask)(struct gk20a *g);
 		u32 (*get_irqdest)(struct gk20a *g);
 		void (*pmu_enable_irq)(struct nvgpu_pmu *pmu, bool enable);
+		bool (*validate_mem_integrity)(struct gk20a *g);
 		void (*handle_ext_irq)(struct gk20a *g, u32 intr);
 		/* perfmon */
 		void (*pmu_init_perfmon_counter)(struct gk20a *g);