gpu: nvgpu: PMU NVRISCV engine HSI support

The HSIs listed below are handled by the PMU ISR handler;
each of them triggers an interrupt from its individual unit when an issue occurs.

-Add ECC check for IMEM, DMEM, DCLS, REG, and MPU as per
 HSI req
-Add MEMERR check for GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED
 PMU HSI id
-Add IOPMP check for GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED
 PMU HSI id
-Add WDT check for GPU_PMU_WDT_UNCORRECTED PMU HSI id

Bug 3491596
Bug 3366818

Change-Id: I751d653e447017ac62a2459da2c6bb9da506f438
Signed-off-by: mkumbar <mkumbar@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2686566
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
mkumbar
2022-03-24 19:32:44 +05:30
committed by mobile promotions
parent 8cce8dea70
commit 0b9dc3dbc3
4 changed files with 94 additions and 3 deletions

View File

@@ -1352,7 +1352,7 @@ static const struct gops_pmu ga10b_ops_pmu = {
.get_irqdest = gv11b_pmu_get_irqdest,
.get_irqmask = ga10b_pmu_get_irqmask,
.pmu_isr = gk20a_pmu_isr,
.handle_ext_irq = gv11b_pmu_handle_ext_irq,
.handle_ext_irq = ga10b_pmu_handle_ext_irq,
#ifdef CONFIG_NVGPU_LS_PMU
.get_inst_block_config = ga10b_pmu_get_inst_block_config,
/* Init */

View File

@@ -429,3 +429,87 @@ void ga10b_pmu_enable_irq(struct nvgpu_pmu *pmu, bool enable)
gv11b_pmu_enable_irq(pmu, enable);
}
}
static int ga10b_pmu_handle_ecc(struct gk20a *g)
{
int ret = 0;
u32 ecc_status = 0;
ecc_status = nvgpu_readl(g, pwr_pmu_falcon_ecc_status_r());
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_IMEM_ECC_UNCORRECTED);
nvgpu_err(g, "imem ecc error uncorrected ");
ret = -EFAULT;
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_DMEM_ECC_UNCORRECTED);
nvgpu_err(g, "dmem ecc error uncorrected");
ret = -EFAULT;
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_dcls_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_DCLS_UNCORRECTED);
nvgpu_err(g, "dcls ecc error uncorrected");
ret = -EFAULT;
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_reg_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_REG_ECC_UNCORRECTED);
nvgpu_err(g, "reg ecc error uncorrected");
ret = -EFAULT;
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_mpu_ram_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_MPU_ECC_UNCORRECTED);
nvgpu_err(g, "mpu ecc error uncorrected");
ret = -EFAULT;
}
if (ret != 0) {
nvgpu_err(g, "ecc_addr(0x%x)",
nvgpu_readl(g, pwr_pmu_falcon_ecc_address_r()));
}
return ret;
}
/*
 * ga10b_pmu_handle_ext_irq - service the PMU NVRISCV HSI extended interrupts.
 *
 * Decodes the pending bits in @intr0 (a pwr_falcon_irqstat value) and, for
 * each flagged HSI condition, reports the corresponding error id to SDL and
 * logs it:
 *  - ECC/parity  -> per-unit ids via ga10b_pmu_handle_ecc()
 *  - MEMERR      -> GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED
 *  - IOPMP       -> GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED
 *  - WDT         -> GPU_PMU_WDT_UNCORRECTED
 */
void ga10b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
{
	/* handle the ECC interrupt */
	if ((intr0 & pwr_falcon_irqstat_ext_ecc_parity_true_f()) != 0U) {
		/*
		 * Return value explicitly discarded (MISRA C:2012 17.7):
		 * the helper already reports each ECC error to SDL and
		 * logs it, and this void ISR path has no status to
		 * propagate.
		 */
		(void) ga10b_pmu_handle_ecc(g);
	}

	/* handle the MEMERR interrupt */
	if ((intr0 & pwr_falcon_irqstat_memerr_true_f()) != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
				GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED);
		nvgpu_err(g, "memerr/access timeout error uncorrected");
	}

	/* handle the IOPMP interrupt */
	if ((intr0 & pwr_falcon_irqstat_iopmp_true_f()) != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
				GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED);
		nvgpu_err(g, "iopmp/illegal access error uncorrected");
	}

	/* handle the WDT interrupt */
	if ((intr0 & pwr_falcon_irqstat_wdt_true_f()) != 0U) {
		nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU,
				GPU_PMU_WDT_UNCORRECTED);
		nvgpu_err(g, "wdt error uncorrected");
	}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -57,4 +57,5 @@ void ga10b_pmu_handle_swgen1_irq(struct gk20a *g, u32 intr);
bool ga10b_pmu_is_interrupted(struct nvgpu_pmu *pmu);
#endif
void ga10b_pmu_enable_irq(struct nvgpu_pmu *pmu, bool enable);
void ga10b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0);
#endif /* NVGPU_PMU_GA10B_H */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -64,11 +64,14 @@
#define pwr_falcon_irqsset_swgen0_set_f() (0x40U)
#define pwr_falcon_irqsclr_r() (0x0010a004U)
#define pwr_falcon_irqstat_r() (0x0010a008U)
#define pwr_falcon_irqstat_wdt_true_f() (0x2U)
#define pwr_falcon_irqstat_halt_true_f() (0x10U)
#define pwr_falcon_irqstat_exterr_true_f() (0x20U)
#define pwr_falcon_irqstat_swgen0_true_f() (0x40U)
#define pwr_falcon_irqstat_ext_ecc_parity_true_f() (0x400U)
#define pwr_falcon_irqstat_swgen1_true_f() (0x80U)
#define pwr_falcon_irqstat_memerr_true_f() (0x40000U)
#define pwr_falcon_irqstat_iopmp_true_f() (0x800000U)
#define pwr_pmu_ecc_intr_status_r() (0x0010abfcU)
#define pwr_pmu_ecc_intr_status_corrected_m() (U32(0x1U) << 0U)
#define pwr_pmu_ecc_intr_status_uncorrected_m() (U32(0x1U) << 1U)
@@ -233,6 +236,9 @@
#define pwr_pmu_falcon_ecc_status_corrected_err_dmem_m() (U32(0x1U) << 1U)
#define pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m() (U32(0x1U) << 8U)
#define pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m() (U32(0x1U) << 9U)
#define pwr_pmu_falcon_ecc_status_uncorrected_err_mpu_ram_m() (U32(0x1U) << 10U)
#define pwr_pmu_falcon_ecc_status_uncorrected_err_dcls_m() (U32(0x1U) << 11U)
#define pwr_pmu_falcon_ecc_status_uncorrected_err_reg_m() (U32(0x1U) << 12U)
#define pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m()\
(U32(0x1U) << 16U)
#define pwr_pmu_falcon_ecc_status_uncorrected_err_total_counter_overflow_m()\