From 0b9dc3dbc39e033d0f9e9cc3f4ac393c48a38598 Mon Sep 17 00:00:00 2001 From: mkumbar Date: Thu, 24 Mar 2022 19:32:44 +0530 Subject: [PATCH] gpu: nvgpu: PMU NVRISCV engine HSI support The HSI errors listed below are handled by the PMU ISR handler; each of them triggers an interrupt from its individual unit when the issue occurs. -Add ECC check for IMEM, DMEM, DCLS, REG, and MPU as per HSI req -Add MEMERR check for GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED PMU HSI id -Add IOPMP check for GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED PMU HSI id -Add WDT check for GPU_PMU_WDT_UNCORRECTED PMU HSI id Bug 3491596 Bug 3366818 Change-Id: I751d653e447017ac62a2459da2c6bb9da506f438 Signed-off-by: mkumbar Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2686566 Tested-by: mobile promotions Reviewed-by: mobile promotions --- drivers/gpu/nvgpu/hal/init/hal_ga10b.c | 2 +- drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.c | 84 +++++++++++++++++++ drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.h | 3 +- .../include/nvgpu/hw/ga10b/hw_pwr_ga10b.h | 8 +- 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c index d1df42833..83b6aa5ce 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c @@ -1352,7 +1352,7 @@ static const struct gops_pmu ga10b_ops_pmu = { .get_irqdest = gv11b_pmu_get_irqdest, .get_irqmask = ga10b_pmu_get_irqmask, .pmu_isr = gk20a_pmu_isr, - .handle_ext_irq = gv11b_pmu_handle_ext_irq, + .handle_ext_irq = ga10b_pmu_handle_ext_irq, #ifdef CONFIG_NVGPU_LS_PMU .get_inst_block_config = ga10b_pmu_get_inst_block_config, /* Init */ diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.c b/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.c index 418de2a20..69713394d 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.c +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.c @@ -429,3 +429,87 @@ void ga10b_pmu_enable_irq(struct nvgpu_pmu *pmu, bool enable) gv11b_pmu_enable_irq(pmu, enable); } } + +static int ga10b_pmu_handle_ecc(struct gk20a 
*g) +{ + int ret = 0; + u32 ecc_status = 0; + + ecc_status = nvgpu_readl(g, pwr_pmu_falcon_ecc_status_r()); + + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_IMEM_ECC_UNCORRECTED); + nvgpu_err(g, "imem ecc error uncorrected "); + ret = -EFAULT; + } + + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_DMEM_ECC_UNCORRECTED); + nvgpu_err(g, "dmem ecc error uncorrected"); + ret = -EFAULT; + } + + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_dcls_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_DCLS_UNCORRECTED); + nvgpu_err(g, "dcls ecc error uncorrected"); + ret = -EFAULT; + } + + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_reg_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_REG_ECC_UNCORRECTED); + nvgpu_err(g, "reg ecc error uncorrected"); + ret = -EFAULT; + } + + if ((ecc_status & + pwr_pmu_falcon_ecc_status_uncorrected_err_mpu_ram_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_MPU_ECC_UNCORRECTED); + nvgpu_err(g, "mpu ecc error uncorrected"); + ret = -EFAULT; + } + + if (ret != 0) { + nvgpu_err(g, "ecc_addr(0x%x)", + nvgpu_readl(g, pwr_pmu_falcon_ecc_address_r())); + } + + return ret; +} + +void ga10b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) +{ + /* handle the ECC interrupt */ + if ((intr0 & pwr_falcon_irqstat_ext_ecc_parity_true_f()) != 0U) { + ga10b_pmu_handle_ecc(g); + } + + /* handle the MEMERR interrupt */ + if ((intr0 & pwr_falcon_irqstat_memerr_true_f()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED); + nvgpu_err(g, "memerr/access timeout error uncorrected"); + } + + /* handle the IOPMP interrupt */ + if ((intr0 & pwr_falcon_irqstat_iopmp_true_f()) != 0U) { + nvgpu_report_err_to_sdl(g, 
NVGPU_ERR_MODULE_PMU, + GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED); + nvgpu_err(g, "iopmp/illegal access error uncorrected"); + } + + /* handle the WDT interrupt */ + if ((intr0 & pwr_falcon_irqstat_wdt_true_f()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PMU, + GPU_PMU_WDT_UNCORRECTED); + nvgpu_err(g, "wdt error uncorrected"); + } +} diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.h b/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.h index 246123a2d..723f31dae 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.h +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_ga10b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -57,4 +57,5 @@ void ga10b_pmu_handle_swgen1_irq(struct gk20a *g, u32 intr); bool ga10b_pmu_is_interrupted(struct nvgpu_pmu *pmu); #endif void ga10b_pmu_enable_irq(struct nvgpu_pmu *pmu, bool enable); +void ga10b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0); #endif /* NVGPU_PMU_GA10B_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pwr_ga10b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pwr_ga10b.h index ff573664a..4559a6c91 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pwr_ga10b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/ga10b/hw_pwr_ga10b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -64,11 +64,14 @@ #define pwr_falcon_irqsset_swgen0_set_f() (0x40U) #define pwr_falcon_irqsclr_r() (0x0010a004U) #define pwr_falcon_irqstat_r() (0x0010a008U) +#define pwr_falcon_irqstat_wdt_true_f() (0x2U) #define pwr_falcon_irqstat_halt_true_f() (0x10U) #define pwr_falcon_irqstat_exterr_true_f() (0x20U) #define pwr_falcon_irqstat_swgen0_true_f() (0x40U) #define pwr_falcon_irqstat_ext_ecc_parity_true_f() (0x400U) #define pwr_falcon_irqstat_swgen1_true_f() (0x80U) +#define pwr_falcon_irqstat_memerr_true_f() (0x40000U) +#define pwr_falcon_irqstat_iopmp_true_f() (0x800000U) #define pwr_pmu_ecc_intr_status_r() (0x0010abfcU) #define pwr_pmu_ecc_intr_status_corrected_m() (U32(0x1U) << 0U) #define pwr_pmu_ecc_intr_status_uncorrected_m() (U32(0x1U) << 1U) @@ -233,6 +236,9 @@ #define pwr_pmu_falcon_ecc_status_corrected_err_dmem_m() (U32(0x1U) << 1U) #define pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m() (U32(0x1U) << 8U) #define pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m() (U32(0x1U) << 9U) +#define pwr_pmu_falcon_ecc_status_uncorrected_err_mpu_ram_m() (U32(0x1U) << 10U) +#define pwr_pmu_falcon_ecc_status_uncorrected_err_dcls_m() (U32(0x1U) << 11U) +#define pwr_pmu_falcon_ecc_status_uncorrected_err_reg_m() (U32(0x1U) << 12U) #define pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m()\ (U32(0x1U) << 16U) #define pwr_pmu_falcon_ecc_status_uncorrected_err_total_counter_overflow_m()\