diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig index 7dba61a32..073318176 100644 --- a/drivers/gpu/nvgpu/Kconfig +++ b/drivers/gpu/nvgpu/Kconfig @@ -143,6 +143,13 @@ config NVGPU_SUPPORT_CDE help Enable support for extraction of comptags for CDE. +config NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + bool "Support ECC error reporting for Linux" + depends on TEGRA_SAFETY + default y + help + Enable support for ECC error reporting for Linux. + config NVGPU_USE_TEGRA_ALLOC_FD bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem" depends on GK20A && GK20A_VIDMEM diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 472bf32c7..d5ceecb6a 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -98,6 +98,8 @@ nvgpu-y += \ os/linux/ltc.o \ os/linux/vpr.o +nvgpu-$(CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING) += os/linux/sdl.o + nvgpu-$(CONFIG_GK20A_VIDMEM) += \ os/linux/dmabuf_vidmem.o diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index c3068b76c..1a1171691 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -1,7 +1,7 @@ /* * GK20A Graphics * - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -525,6 +526,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount) struct gk20a *g = container_of(refcount, struct gk20a, refcount); +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + nvgpu_deinit_ecc_reporting(g); +#endif + nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!"); gk20a_ce_destroy(g); diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index a7a804d28..110819a9e 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -1,7 +1,7 @@ /* * GV11b GPU GR * - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -37,6 +37,7 @@ #include #include #include +#include #include "gk20a/gr_gk20a.h" #include "gk20a/dbg_gpu_gk20a.h" @@ -61,6 +62,8 @@ #include #include +#define SHIFT_8_BITS 8U + #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100 /* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */ @@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += l1_tag_corrected_err_count_delta; + + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << SHIFT_8_BITS) | tpc, + GPU_SM_L1_TAG_ECC_CORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, 0); @@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } 
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += l1_tag_uncorrected_err_count_delta; + + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << SHIFT_8_BITS) | tpc, + GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, 0); @@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += lrf_uncorrected_err_count_delta; + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << SHIFT_8_BITS) | tpc, + GPU_SM_LRF_ECC_UNCORRECTED, 0, + g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, 0); @@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += cbu_uncorrected_err_count_delta; + + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << SHIFT_8_BITS) | tpc, + GPU_SM_CBU_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, 0); @@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += l1_data_uncorrected_err_count_delta; + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, + (gpc << SHIFT_8_BITS) | tpc, + GPU_SM_L1_DATA_ECC_UNCORRECTED, + 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, 0); @@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) if (ecc_status & gr_fecs_falcon_ecc_status_corrected_err_imem_m()) { + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, + 
GPU_FECS_FALCON_IMEM_ECC_CORRECTED, + ecc_addr, + g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if (ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) { + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, + GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } @@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) } if (ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) { + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, + GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); } diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c index 5e586ec21..336258a71 100644 --- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c @@ -1,7 +1,7 @@ /* * GV11B PMU * - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -29,6 +29,7 @@ #include #include #include +#include #include "gk20a/pmu_gk20a.h" #include "gp10b/pmu_gp10b.h" @@ -354,10 +355,18 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) "pmu ecc interrupt intr1: 0x%x", intr1); if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) { + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, + GPU_PMU_FALCON_IMEM_ECC_CORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) { + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, + GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } @@ -366,6 +375,10 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) "dmem ecc error corrected"); } if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) { + nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, + GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, + ecc_addr, + g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); } diff --git a/drivers/gpu/nvgpu/include/nvgpu/bug.h b/drivers/gpu/nvgpu/include/nvgpu/bug.h index 3d139b757..82d641bd4 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/bug.h +++ b/drivers/gpu/nvgpu/include/nvgpu/bug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,6 +24,24 @@ #ifdef __KERNEL__ #include +/* + * Define an assert macro that code within nvgpu can use. 
+ * + * The goal of this macro is for debugging but what that means varies from OS + * to OS. On Linux we don't want to BUG() for general driver misbehaving. BUG() + * is a very heavy handed tool - in fact there's probably nowhere within the + * nvgpu core code where it makes sense to use a BUG() when running under Linux. + * + * However, on QNX (and POSIX) BUG() will just kill the current process. This + * means we can use it for handling bugs in nvgpu. + * + * As a result this macro varies depending on platform. + */ +#define nvgpu_assert(cond) ((void) WARN_ON(!(cond))) +#define nvgpu_do_assert_print(g, fmt, arg...) \ + do { \ + nvgpu_err(g, fmt, ##arg); \ + } while (false) #elif defined(__NVGPU_POSIX__) #include #else diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h index 70a167620..2bcca335e 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -80,6 +80,7 @@ void __nvgpu_log_dbg(struct gk20a *g, u64 log_mask, #define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */ #define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */ #define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */ +#define gpu_dbg_ecc BIT(27) /* Print ECC Info Logs. */ #define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */ /** diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h new file mode 100644 index 000000000..0595fafb8 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_NVGPU_ERR_H +#define NVGPU_NVGPU_ERR_H + +/** + * @file + * + * Define indices for HW units and errors. Define structures used to carry error + * information. Declare prototype for APIs that are used to report GPU HW errors + * to the Safety_Services framework. + */ + +#include +#include + +struct gk20a; + +/** + * @defgroup INDICES_FOR_GPU_HW_UNITS + * Macros used to assign unique index to GPU HW units. + * @{ + */ +#define NVGPU_ERR_MODULE_SM (0U) +#define NVGPU_ERR_MODULE_FECS (1U) +#define NVGPU_ERR_MODULE_PMU (2U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM + * Macros used to assign unique index to errors reported from the SM unit. 
+ * @{ + */ +#define GPU_SM_L1_TAG_ECC_CORRECTED (0U) +#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U) +#define GPU_SM_CBU_ECC_UNCORRECTED (3U) +#define GPU_SM_LRF_ECC_UNCORRECTED (5U) +#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U) +#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U) +#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U) +#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U) +#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U) +#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U) +#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS + * Macros used to assign unique index to errors reported from the FECS unit. + * @{ + */ +#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U) +#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U) +#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS + * Macros used to assign unique index to errors reported from the GPCCS unit. + * @{ + */ +#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U) +#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U) +#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU + * Macros used to assign unique index to errors reported from the MMU unit. + * @{ + */ +#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U) +#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC + * Macros used to assign unique index to errors reported from the GCC unit. + * @{ + */ +#define GPU_GCC_L15_ECC_UNCORRECTED (1U) +/** + * @} + */ + + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU + * Macros used to assign unique index to errors reported from the PMU unit. 
+ * @{ + */ +#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U) +#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U) +#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC + * Macros used to assign unique index to errors reported from the LTC unit. + * @{ + */ +#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U) +#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U) +#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U) +#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U) +/** + * @} + */ + +/** + * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU + * Macros used to assign unique index to errors reported from the HUBMMU unit. + * @{ + */ +#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U) +#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U) +#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U) +#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U) +#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U) + + +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING +/** + * @} + */ + +/** + * nvgpu_err_desc structure holds fields which describe an error along with + * function callback which can be used to inject the error. + */ +struct nvgpu_err_desc { + /** String representation of error. */ + const char *name; + + /** Flag to classify an error as critical or non-critical. */ + bool is_critical; + + /** + * Error Threshold: once this threshold value is reached, then the + * corresponding error counter will be reset to 0 and the error will be + * propagated to Safety_Services. + */ + int err_threshold; + + /** + * Total number of times an error has occurred (since its last reset). + */ + nvgpu_atomic_t err_count; + + /** Error ID. */ + u8 error_id; +}; + +/** + * gpu_err_header structure holds fields which are required to identify the + * version of header, sub-error type, sub-unit id, error address and time stamp. + */ +struct gpu_err_header { + /** Version of GPU error header. */ + struct { + /** Major version number. 
*/ + u16 major; + /** Minor version number. */ + u16 minor; + } version; + + /** Sub error type corresponding to the error that is being reported. */ + u32 sub_err_type; + + /** ID of the sub-unit in a HW unit which encountered an error. */ + u64 sub_unit_id; + + /** Location of the error. */ + u64 address; + + /** Timestamp in nano seconds. */ + u64 timestamp_ns; +}; + +struct gpu_ecc_error_info { + struct gpu_err_header header; + + /** Number of ECC errors. */ + u64 err_cnt; +}; + +/** + * nvgpu_err_hw_module structure holds fields which describe the h/w modules + * error reporting capabilities. + */ +struct nvgpu_err_hw_module { + /** String representation of a given HW unit. */ + const char *name; + + /** HW unit ID. */ + u32 hw_unit; + + /** Total number of errors reported from a given HW unit. */ + u32 num_errs; + + u32 base_ecc_service_id; + + /** Used to get error description from look-up table. */ + struct nvgpu_err_desc *errs; +}; + +struct nvgpu_ecc_reporting_ops { + void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count); +}; + +struct nvgpu_ecc_reporting { + struct nvgpu_spinlock lock; + /* This flag is protected by the above spinlock */ + bool ecc_reporting_service_enabled; + const struct nvgpu_ecc_reporting_ops *ops; +}; + + /** + * This macro is used to initialize the members of nvgpu_err_desc struct. + */ +#define GPU_ERR(err, critical, id, threshold, ecount) \ +{ \ + .name = (err), \ + .is_critical = (critical), \ + .error_id = (id), \ + .err_threshold = (threshold), \ + .err_count = NVGPU_ATOMIC_INIT(ecount), \ +} + +/** + * This macro is used to initialize critical errors. + */ +#define GPU_CRITERR(err, id, threshold, ecount) \ + GPU_ERR(err, true, id, threshold, ecount) + +/** + * This macro is used to initialize non-critical errors. 
+ */ +#define GPU_NONCRITERR(err, id, threshold, ecount) \ + GPU_ERR(err, false, id, threshold, ecount) + +/** + * @brief GPU HW errors need to be reported to Safety_Services via SDL unit. + * This function provides an interface to report ECC errors to SDL unit. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * - List of valid HW unit IDs + * - NVGPU_ERR_MODULE_SM + * - NVGPU_ERR_MODULE_FECS + * - NVGPU_ERR_MODULE_GPCCS + * - NVGPU_ERR_MODULE_MMU + * - NVGPU_ERR_MODULE_GCC + * - NVGPU_ERR_MODULE_PMU + * - NVGPU_ERR_MODULE_LTC + * - NVGPU_ERR_MODULE_HUBMMU + * @param inst [in] - Instance ID. + * - In case of multiple instances of the same HW + * unit (e.g., there are multiple instances of + * SM), it is used to identify the instance + * that encountered a fault. + * @param err_id [in] - Error index. + * - For SM: + * - Min: GPU_SM_L1_TAG_ECC_CORRECTED + * - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED + * - For FECS: + * - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED + * - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED + * - For GPCCS: + * - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED + * - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED + * - For MMU: + * - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED + * - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED + * - For GCC: + * - Min: GPU_GCC_L15_ECC_UNCORRECTED + * - Max: GPU_GCC_L15_ECC_UNCORRECTED + * - For PMU: + * - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED + * - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED + * - For LTC: + * - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED + * - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED + * - For HUBMMU: + * - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED + * - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED + * @param err_addr [in] - Error address. + * - This is the location at which correctable or + * uncorrectable error has occurred. + * @param err_count [in] - Error count. + * + * - Checks whether SDL is supported in the current GPU platform. If SDL is not + * supported, it simply returns. 
+ * - Validates both \a hw_unit and \a err_id indices. In case of a failure, + * invokes #nvgpu_sdl_handle_report_failure() api. + * - Gets the current time of a clock. In case of a failure, invokes + * #nvgpu_sdl_handle_report_failure() api. + * - Gets error description from internal look-up table using \a hw_unit and + * \a err_id indices. + * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id, + * criticality of the error, \a inst, \a err_addr, \a err_count, error + * description, and size of the error packet. + * - Performs compile-time assert check to ensure that the size of the error + * packet does not exceed the maximum allowable size specified in + * #MAX_ERR_MSG_SIZE. + * + * @return None + */ +void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count); + +void nvgpu_init_ecc_reporting(struct gk20a *g); +void nvgpu_enable_ecc_reporting(struct gk20a *g); +void nvgpu_disable_ecc_reporting(struct gk20a *g); +void nvgpu_deinit_ecc_reporting(struct gk20a *g); + +#else + +static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count) { + +} + +#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */ + +#endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file diff --git a/drivers/gpu/nvgpu/os/linux/ecc_linux.h b/drivers/gpu/nvgpu/os/linux/ecc_linux.h new file mode 100644 index 000000000..7e0f650bb --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/ecc_linux.h @@ -0,0 +1,49 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_OS_ECC_LINUX_H +#define NVGPU_OS_ECC_LINUX_H + +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + +#include +#include +#include +#include + +#include + +struct nvgpu_ecc_reporting_linux { + struct nvgpu_ecc_reporting common; + client_param_t priv; +}; + +static inline struct nvgpu_ecc_reporting_linux *get_ecc_reporting_linux( + struct nvgpu_ecc_reporting *ecc_report) +{ + return container_of(ecc_report, struct nvgpu_ecc_reporting_linux, common); +} + +#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */ + +#endif \ No newline at end of file diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index 807df2cad..fdbab46d6 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -1,7 +1,7 @@ /* * GK20A Graphics * - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -49,6 +49,7 @@ #include #include #include +#include #include "platform_gk20a.h" #include "sysfs.h" @@ -355,6 +356,10 @@ int gk20a_pm_finalize_poweron(struct device *dev) gk20a_init_cde_support(l); #endif +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + nvgpu_enable_ecc_reporting(g); +#endif + err = gk20a_sched_ctrl_init(g); if (err) { nvgpu_err(g, "failed to init sched control"); @@ -364,9 +369,14 @@ int gk20a_pm_finalize_poweron(struct device *dev) g->sw_ready = true; done: - if (err) + if (err) { g->power_on = false; +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + nvgpu_disable_ecc_reporting(g); +#endif + } + nvgpu_mutex_release(&g->power_lock); return err; } @@ -433,6 +443,10 @@ static int gk20a_pm_prepare_poweroff(struct device *dev) /* Stop CPU from accessing the GPU registers. */ gk20a_lockout_registers(g); +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + nvgpu_disable_ecc_reporting(g); +#endif + nvgpu_hide_usermode_for_poweroff(g); nvgpu_mutex_release(&g->power_lock); return 0; @@ -1382,6 +1396,10 @@ static int gk20a_probe(struct platform_device *dev) goto return_err; } +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + nvgpu_init_ecc_reporting(gk20a); +#endif + gk20a->nvgpu_reboot_nb.notifier_call = nvgpu_kernel_shutdown_notification; err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb); diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h index 25c6c03a3..adcfdb2fa 100644 --- a/drivers/gpu/nvgpu/os/linux/os_linux.h +++ b/drivers/gpu/nvgpu/os/linux/os_linux.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -25,6 +25,7 @@ #include "cde.h" #include "sched.h" +#include "ecc_linux.h" struct nvgpu_os_linux_ops { struct { @@ -134,6 +135,10 @@ struct nvgpu_os_linux { u64 regs_bus_addr; +#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING + struct nvgpu_ecc_reporting_linux ecc_reporting_linux; +#endif + struct nvgpu_os_linux_ops ops; #ifdef CONFIG_DEBUG_FS diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c new file mode 100644 index 000000000..c4dccdc62 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/sdl.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include + +#include "ecc_linux.h" +#include "os_linux.h" +#include "module.h" + +/* This look-up table initializes the list of hw units and their errors. + * It also specifies the error injection mechanism supported, for each error. + * In case of hw error injection support, this initialization will be overridden + * by the values provided from the hal layers of corresponding hw units. 
+ */ +static struct nvgpu_err_hw_module gv11b_err_lut[] = { + { + .name = "sm", + .hw_unit = (u32)NVGPU_ERR_MODULE_SM, + .num_errs = 21U, + .base_ecc_service_id = + NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("l1_tag_ecc_corrected", + GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0), + GPU_CRITERR("l1_tag_ecc_uncorrected", + GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0), + GPU_CRITERR("cbu_ecc_uncorrected", + GPU_SM_CBU_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0), + GPU_CRITERR("lrf_ecc_uncorrected", + GPU_SM_LRF_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0), + GPU_CRITERR("l1_data_ecc_uncorrected", + GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0), + GPU_CRITERR("icache_l0_data_ecc_uncorrected", + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0), + GPU_CRITERR("icache_l1_data_ecc_uncorrected", + GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0), + GPU_CRITERR("icache_l0_predecode_ecc_uncorrected", + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0), + GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected", + GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0), + GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected", + GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0), + GPU_CRITERR("machine_check_error", 0, 0, 0), + GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0), + GPU_CRITERR("icache_l1_predecode_ecc_uncorrected", + GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0), + }, + }, + { + .name = "fecs", + .hw_unit = (u32)NVGPU_ERR_MODULE_FECS, + .num_errs = 4U, + .base_ecc_service_id = + NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED, + .errs 
= (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("falcon_imem_ecc_corrected", + GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0), + GPU_CRITERR("falcon_imem_ecc_uncorrected", + GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), + GPU_CRITERR("falcon_dmem_ecc_uncorrected", + GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), + }, + }, + { + .name = "pmu", + .hw_unit = NVGPU_ERR_MODULE_PMU, + .num_errs = 4U, + .base_ecc_service_id = + NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("falcon_imem_ecc_corrected", + GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0), + GPU_CRITERR("falcon_imem_ecc_uncorrected", + GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), + GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), /* NOTE(review): sibling entries pass a GPU_PMU_FALCON_* id as second argument, this one passes 0 (same pattern in the preceding table entry) - confirm intended */ + GPU_CRITERR("falcon_dmem_ecc_uncorrected", + GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), + }, + }, +}; +/* + * Reset an error message header: version is set to 1.0 and sub_err_type, + * sub_unit_id, address and timestamp_ns are zeroed. + */ +static void nvgpu_init_err_msg_header(struct gpu_err_header *header) +{ + header->version.major = (u16)1U; + header->version.minor = (u16)0U; + header->sub_err_type = 0U; + header->sub_unit_id = 0UL; + header->address = 0UL; + header->timestamp_ns = 0UL; +} +/* + * Reset an ECC error packet: the header is reset through + * nvgpu_init_err_msg_header() and err_cnt is zeroed. + */ +static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info) +{ + nvgpu_init_err_msg_header(&err_info->header); + err_info->err_cnt = 0UL; +} +/* + * Reporter installed through ecc_enable_report_ops: builds an ECC error + * packet and submits it with l1ss_submit_rq() once the occurrence counter + * of the matching descriptor has reached err_threshold + 1. + * + * @g: GPU instance, used for logging and nvgpu_quiesce(). + * @hw_unit: index into gv11b_err_lut. + * @inst: copied into header.sub_unit_id of the packet. + * @err_id: index into errs[] of the selected module; also added to + * base_ecc_service_id to form the service id. + * @err_addr: copied into header.address of the packet. + * @err_count: copied into err_cnt of the packet and logged. + * + * Returns nothing. An out-of-range hw_unit or err_id drops the report; + * the local err value is never propagated to the caller. + * NOTE(review): only the err_id failure is logged, the hw_unit failure + * is silent - confirm intended. + */ +static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count) +{ + int err = 0; + u32 s_id = 0; + u8 err_status = 0; + u8 err_info_size = 0; + u64 timestamp = 0ULL; + int err_threshold_counter = 0; + struct gpu_ecc_error_info err_pkt; + struct nvgpu_err_desc *err_desc = NULL; + struct nvgpu_err_hw_module *hw_module = NULL; + nv_guard_request_t req; + /* Start from a zeroed request and a reset error packet. */ + memset(&req, 0, sizeof(req)); + nvgpu_init_ecc_err_msg(&err_pkt); + if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) { + err = -EINVAL; + goto done; + } + /* hw_unit indexes gv11b_err_lut; err_id is checked against that module. */ + hw_module = &gv11b_err_lut[hw_unit]; + if (err_id >= hw_module->num_errs) { + 
nvgpu_err(g, "invalid err_id (%u) for hw module (%u)", + err_id, hw_module->hw_unit); + err = -EINVAL; + goto done; + } + err_desc = &hw_module->errs[err_id]; + timestamp = (u64)nvgpu_current_time_ns(); + /* Populate the packet from the caller-supplied values. */ + err_pkt.header.timestamp_ns = timestamp; + err_pkt.header.sub_unit_id = inst; + err_pkt.header.address = err_addr; + err_pkt.err_cnt = err_count; + err_info_size = sizeof(err_pkt); /* NOTE(review): stored in a u8 - assumes sizeof(err_pkt) fits, TODO confirm */ + /* Service id is the module base id plus the error index. */ + s_id = hw_module->base_ecc_service_id + err_id; + /* The status sent to L1SS depends only on the is_critical flag. */ + if (err_desc->is_critical) { + err_status = NVGUARD_ERROR_DETECTED; + } else { + err_status = NVGUARD_NO_ERROR; + } + /* Count this occurrence, then reset the counter to 0 only if it equals err_threshold + 1; the cmpxchg result is the value it observed. */ + nvgpu_atomic_inc(&err_desc->err_count); + err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count, + err_desc->err_threshold + 1, 0); + /* Any other observed value means no reset happened: skip the report. */ + if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) { + goto done; + } + + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu", + hw_module->name, err_desc->name, err_count); + /* Build the service status notification that carries the packet bytes. */ + req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION; + req.srv_status.srv_id = (nv_guard_service_id_t)s_id; + req.srv_status.status = err_status; + req.srv_status.timestamp = timestamp; + req.srv_status.error_info_size = err_info_size; + memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size); /* NOTE(review): assumes error_info can hold err_info_size bytes - confirm */ + + /* + * l1ss_submit_rq may fail due to kmalloc failures but may pass in + * subsequent calls + */ + err = l1ss_submit_rq(&req, true); + if (err != 0) { + nvgpu_err(g, "Error returned from L1SS submit %d", err); + } + /* Critical errors call nvgpu_quiesce() whether or not the submit succeeded. */ + if (err_desc->is_critical) { + nvgpu_quiesce(g); + } + +done: + return; +} +/* + * Reporter installed through default_disabled_ecc_report_ops: ignores all + * arguments except g and only emits a debug log. + */ +static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count) { + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty"); +} +/* Ops table whose reporter only logs; installed at init and on disable. */ +const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = { + .report_ecc_err = nvgpu_report_ecc_error_empty, +}; +/* Ops table whose reporter submits errors to L1SS. */ +const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = { + .report_ecc_err = nvgpu_report_ecc_error_linux, +}; +/* + * L1SS client callback, installed as priv.cli_callback by + * nvgpu_init_ecc_reporting(). + * + * @param: L1SS_READY installs ecc_enable_report_ops and sets the enabled + * flag (if not already set); L1SS_NOT_READY installs + * default_disabled_ecc_report_ops and clears the flag (if set). + * @data: the struct gk20a pointer stored in priv.data. + * + * Returns 0 on success, -ENODEV when data is NULL or gk20a_get() returns + * NULL, and -EINVAL for any other param value. + */ +static int 
nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data) +{ + struct gk20a *g = (struct gk20a *)data; + struct nvgpu_os_linux *l = NULL; + struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL; + int err = 0; + /* Ensure we have a valid gk20a struct before proceeding */ + if ((g == NULL) || (gk20a_get(g) == NULL)) { + return -ENODEV; + } + + l = nvgpu_os_linux_from_gk20a(g); + ecc_reporting_linux = &l->ecc_reporting_linux; + /* The enabled flag and the ops pointer are switched together under the lock. */ + nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock); + if (param == L1SS_READY) { + if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) { + ecc_reporting_linux->common.ecc_reporting_service_enabled = true; + ecc_reporting_linux->common.ops = &ecc_enable_report_ops; + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); + } + } else if (param == L1SS_NOT_READY) { + if (ecc_reporting_linux->common.ecc_reporting_service_enabled) { + ecc_reporting_linux->common.ecc_reporting_service_enabled = false; + ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops; + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); + } + } else { + err = -EINVAL; + } + nvgpu_spinlock_release(&ecc_reporting_linux->common.lock); + /* Pairs with the gk20a_get() call at the top of this function. */ + gk20a_put(g); + + return err; +} +/* + * Set up ECC error reporting for @g: initialise the lock, fill in the L1SS + * client descriptor (masked group id, callback, gk20a pointer), install the + * disabled ops and register the client with l1ss_register_client(). + * The enabled flag is set from the registration result (0 or 1). + */ +void nvgpu_init_ecc_reporting(struct gk20a *g) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; + int err = 0; + /* This will invoke the registration API */ + nvgpu_spinlock_init(&ecc_report_linux->common.lock); + ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK); + ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback; + ecc_report_linux->priv.data = g; + ecc_report_linux->common.ops = &default_disabled_ecc_report_ops; + + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init"); + + /* + * err == 0 indicates service is available but not active yet. + * err == 1 indicates service is available and active + * error for other cases. 
+ */ + err = l1ss_register_client(&ecc_report_linux->priv); + if (err == 0) { + ecc_report_linux->common.ecc_reporting_service_enabled = false; + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success"); + } else if (err == 1) { + ecc_report_linux->common.ecc_reporting_service_enabled = true; + /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting + * called as part of gk20a_busy() + */ + } else { + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err); /* NOTE(review): the enabled flag is not written on this path - presumably it relies on zeroed storage, confirm */ + } +} +/* + * Tear down ECC error reporting for @g. When the enabled flag is set, the + * flag is cleared, the L1SS client is deregistered by priv.id and the whole + * reporting structure is zeroed. + * NOTE(review): nothing is deregistered when the flag is clear, so a client + * registered while the service was inactive (register returned 0 and no + * L1SS_READY callback arrived) appears to stay registered - confirm. + * NOTE(review): the memset also clears common.lock and common.ops, and + * nvgpu_report_ecc_err() dereferences ops - confirm no report can follow. + * NOTE(review): the flag is read and written here without common.lock. + */ +void nvgpu_deinit_ecc_reporting(struct gk20a *g) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; + + if (ecc_report_linux->common.ecc_reporting_service_enabled) { + ecc_report_linux->common.ecc_reporting_service_enabled = false; + l1ss_deregister_client(ecc_report_linux->priv.id); + memset(ecc_report_linux, 0, sizeof(*ecc_report_linux)); + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success"); + } + +} +/* + * Install ecc_enable_report_ops for @g, under common.lock, but only when the + * enabled flag is set; otherwise the current ops are left in place. + */ +void nvgpu_enable_ecc_reporting(struct gk20a *g) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; + struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; + + nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); + if (error_reporting->ecc_reporting_service_enabled) { + error_reporting->ops = &ecc_enable_report_ops; + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); + } + nvgpu_spinlock_release(&ecc_report_linux->common.lock); +} +/* + * Unconditionally install default_disabled_ecc_report_ops for @g under + * common.lock. The enabled flag itself is not modified. + */ +void nvgpu_disable_ecc_reporting(struct gk20a *g) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; + struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; + + nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); + error_reporting->ops = &default_disabled_ecc_report_ops; + nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); + 
nvgpu_spinlock_release(&ecc_report_linux->common.lock); +} +/* + * Entry point used by the hardware exception handlers to report an ECC + * error. The currently installed report_ecc_err handler is read under + * common.lock and then called with all arguments forwarded unchanged. + */ +void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; + struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; + void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count); + /* Only the pointer read is done under the lock. */ + nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); + report_ecc_err_func = error_reporting->ops->report_ecc_err; + nvgpu_spinlock_release(&ecc_report_linux->common.lock); + /* The handler runs after the lock has been released. */ + report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count); +}