From e89553fe6287845b486e429333c86598588b63e0 Mon Sep 17 00:00:00 2001 From: Debarshi Dutta Date: Mon, 27 Jun 2022 16:39:08 +0530 Subject: [PATCH] gpu: nvgpu: add error reporting support for L4T Add error reporting support for T194's L1SS safety services for linux. Used GA10B's LUT for GV11B. The error ids for T194 are different compared to GA10B. This is handled by creating a separate table mapping existing error ids to match GV11B. Ids that are not used by GV11B are set to U32_MAX to indicate the driver to not send them to the l1ss driver. Bug 200588528 Signed-off-by: Debarshi Dutta Change-Id: I10a267942df77458c3deee0aad1179955490aa74 Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2736772 Tested-by: mobile promotions Reviewed-by: mobile promotions --- arch/nvgpu-common.yaml | 3 +- arch/nvgpu-hal-new.yaml | 4 +- arch/nvgpu-linux.yaml | 3 +- drivers/gpu/nvgpu/Makefile | 8 + drivers/gpu/nvgpu/Makefile.linux.configs | 5 + drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h | 36 ++ .../nvgpu/hal/cic/mon/init_gv11b_non_fusa.c | 39 ++ drivers/gpu/nvgpu/hal/init/hal_gv11b.c | 16 + .../nvgpu/include/nvgpu/l1ss_err_reporting.h | 38 ++ .../gpu/nvgpu/os/linux/cic/l1ss_report_err.c | 399 ++++++++++++++++++ drivers/gpu/nvgpu/os/linux/module.c | 9 + drivers/gpu/nvgpu/os/linux/os_linux.h | 4 + 12 files changed, 561 insertions(+), 3 deletions(-) create mode 100644 drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h create mode 100644 drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c create mode 100644 drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h create mode 100644 drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml index 932a71008..72133bb32 100644 --- a/arch/nvgpu-common.yaml +++ b/arch/nvgpu-common.yaml @@ -15,7 +15,8 @@ nvgpu: sources: [ include/nvgpu/gk20a.h, include/nvgpu/nvgpu_common.h, include/nvgpu/cov_whitelist.h, - include/nvgpu/static_analysis.h ] + include/nvgpu/static_analysis.h, + 
include/nvgpu/l1ss_err_reporting.h ] bios: safe: yes diff --git a/arch/nvgpu-hal-new.yaml b/arch/nvgpu-hal-new.yaml index b4ab29661..950c139d5 100644 --- a/arch/nvgpu-hal-new.yaml +++ b/arch/nvgpu-hal-new.yaml @@ -1027,7 +1027,9 @@ tpc: cic: safe: yes owner: Tejal K - sources: [ hal/cic/mon/init_ga10b_fusa.c, + sources: [ hal/cic/mon/init_gv11b_non_fusa.c, + hal/cic/mon/cic_gv11b.h, + hal/cic/mon/init_ga10b_fusa.c, hal/cic/mon/lut_ga10b_fusa.c, hal/cic/mon/cic_ga10b.h ] diff --git a/arch/nvgpu-linux.yaml b/arch/nvgpu-linux.yaml index 2087ac487..44b3c193c 100644 --- a/arch/nvgpu-linux.yaml +++ b/arch/nvgpu-linux.yaml @@ -241,7 +241,8 @@ vm: os/linux/nvgpu_ivm.c ] cic: - sources: [ os/linux/cic/cic_report_err.c ] + sources: [ os/linux/cic/cic_report_err.c, + os/linux/cic/l1ss_report_err.c ] # Group all the Linux headers for now. headers: diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index f2d16148a..c3ce19ab9 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -61,6 +61,10 @@ ifeq ($(CONFIG_NVGPU_IVM_BUILD),y) ccflags-y += -DCONFIG_NVGPU_IVM_BUILD endif +ifeq ($(CONFIG_TEGRA_L1SS_SUPPORT),y) +ccflags-y += -DCONFIG_TEGRA_L1SS_SUPPORT +endif + ccflags-y += -DCONFIG_NVGPU_DETERMINISTIC_CHANNELS ccflags-y += -DCONFIG_NVGPU_STATIC_POWERGATE ccflags-y += -DCONFIG_NVGPU_ACR_LEGACY @@ -606,6 +610,10 @@ nvgpu-$(CONFIG_NVGPU_SUPPORT_CDE) += \ os/linux/debug_cde.o endif +nvgpu-$(CONFIG_TEGRA_L1SS_SUPPORT) += \ + os/linux/cic/l1ss_report_err.o \ + hal/cic/mon/init_gv11b_non_fusa.o \ + nvgpu-y += \ common/mm/allocators/nvgpu_allocator.o \ common/mm/allocators/bitmap_allocator.o \ diff --git a/drivers/gpu/nvgpu/Makefile.linux.configs b/drivers/gpu/nvgpu/Makefile.linux.configs index 7be955f4c..bdeae5544 100644 --- a/drivers/gpu/nvgpu/Makefile.linux.configs +++ b/drivers/gpu/nvgpu/Makefile.linux.configs @@ -112,6 +112,11 @@ ifdef CONFIG_TEGRA_EPL CONFIG_NVGPU_ENABLE_MISC_EC := y endif +CONFIG_TEGRA_L1SS_SUPPORT := n +ifdef 
CONFIG_TEGRA_SAFETY +CONFIG_TEGRA_L1SS_SUPPORT := y +endif + CONFIG_NVGPU_NVMEM_FUSE := n CONFIG_NVGPU_NVMAP_NEXT := y diff --git a/drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h b/drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h new file mode 100644 index 000000000..923a84581 --- /dev/null +++ b/drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef CIC_GV11B_H +#define CIC_GV11B_H + +#include + +struct gk20a; +struct nvgpu_cic_mon; + +extern struct nvgpu_err_hw_module gv11b_err_lut[]; +extern u32 size_of_gv11b_lut; + +int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon); + +#endif /* CIC_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c new file mode 100644 index 000000000..33e9bdb7e --- /dev/null +++ b/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include + +#include "common/cic/mon/cic_mon_priv.h" +#include "cic_ga10b.h" +#include "cic_gv11b.h" + +int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon) +{ + if (cic_mon == NULL) { + nvgpu_err(g, "Invalid CIC reference pointer."); + return -EINVAL; + } + + cic_mon->err_lut = ga10b_err_lut; /* GV11B deliberately reuses the GA10B LUT (and its size below) */ + cic_mon->num_hw_modules = size_of_ga10b_lut; + return 0; +} diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 3f36740c1..b87eb5792 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -49,6 +49,8 @@ #include #endif +#include + #include "hal/mm/mm_gp10b.h" #include "hal/mm/mm_gv11b.h" #include "hal/mm/cache/flush_gk20a.h" @@ -199,6 +201,10 @@ #include "hal/tpc/tpc_gv11b.h" #endif +#ifdef CONFIG_TEGRA_L1SS_SUPPORT +#include "hal/cic/mon/cic_gv11b.h" +#endif + #include "hal_gv11b.h" #include "hal_gv11b_litter.h" @@ -1488,6 +1494,13 @@ static const struct gops_grmgr gv11b_ops_grmgr = { .init_gr_manager = nvgpu_init_gr_manager, }; +#ifdef CONFIG_TEGRA_L1SS_SUPPORT +static const struct gops_cic_mon gv11b_ops_cic_mon = { + .init = gv11b_cic_mon_init, + .report_err = nvgpu_l1ss_report_err +}; +#endif + int gv11b_init_hal(struct gk20a *g) { struct gpu_ops *gops = &g->ops; @@ -1587,6 +1600,9 @@ int gv11b_init_hal(struct gk20a *g) gops->gpc_pg = gv11b_ops_gpc_pg; #endif gops->grmgr = gv11b_ops_grmgr; +#ifdef CONFIG_TEGRA_L1SS_SUPPORT + gops->cic_mon = gv11b_ops_cic_mon; +#endif gops->chip_init_gpu_characteristics = gv11b_init_gpu_characteristics; gops->get_litter_value = gv11b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; diff --git a/drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h b/drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h new file mode 100644 index 000000000..8af23e167 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h @@ -0,0 +1,38 @@ + +/* + * + * Copyright (c) 2022, NVIDIA CORPORATION.
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#ifndef NVGPU_L1SS_ECC_H +#define NVGPU_L1SS_ECC_H + +#include + +struct gk20a; +struct nvgpu_l1ss_ecc_reporting; + +#ifdef CONFIG_TEGRA_L1SS_SUPPORT +int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id); +void nvgpu_l1ss_deinit_reporting(struct gk20a *g); +void nvgpu_l1ss_init_reporting(struct gk20a *g); +#endif + +#endif /* NVGPU_L1SS_ECC_H */ diff --git a/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c b/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c new file mode 100644 index 000000000..52026cf85 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2022, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "os/linux/os_linux.h" + +#define NVGPU_ERR_INVALID U32_MAX /* table placeholder: ids mapped to it are not submitted to L1SS (-EOPNOTSUPP) */ + +struct nvgpu_l1ss_ecc_reporting { + struct gk20a *g; + client_param_t priv; + bool service_enabled; + /* protects service enabled */ + struct nvgpu_spinlock lock; +}; + +struct nvgpu_l1ss_error_id_mappings { + u32 num_errs; /* exclusive upper bound applied to the error index before lookup */ + u32 *error_id_mappings; /* NVGUARD service id per error index, or NVGPU_ERR_INVALID */ +}; + +static struct nvgpu_l1ss_error_id_mappings mappings[] = { /* indexed by HW unit id */ + { +/* *************** SERVICE ID for IGPU_HOST*************** */ + .num_errs = 16, /* NOTE(review): 17 initializers follow; index 16 fails the bounds check (-EINVAL) before its NVGPU_ERR_INVALID slot is read -- confirm intent */ + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_BIND_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_SCHED_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CHSW_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_MEMOP_TIMEOUT_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_LB_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_SQUASH_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_FECS_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_TIMEOUT_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_TIMEOUT_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_EXTRA_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_GPFIFO_PB_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_METHOD_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_SIGNATURE_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_HCE_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CTXSW_TIMEOUT_ERROR, + NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_FB_FLUSH_TIMEOUT_ERROR, + NVGPU_ERR_INVALID, + }, + }, + { +/* *************** SERVICE ID for IGPU_SM*************** */ + .num_errs = 11, /* NOTE(review): 12 initializers follow; index 11 fails the bounds check (-EINVAL) before its NVGPU_ERR_INVALID slot is read -- confirm intent */ +
.error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_CBU_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_LRF_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_L1_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L1_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_SM_SWERR_MACHINE_CHECK_ERROR, + NVGPU_ERR_INVALID, + }, + }, + { +/* *************** SERVICE ID for IGPU_FECS*************** */ + .num_errs = 7, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED, + NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_DMEM_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_WATCHDOG_TIMEOUT, + NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_CRC_MISMATCH, + NVGUARD_SERVICE_IGPU_FECS_SWERR_FAULT_DURING_CTXSW, + NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_INIT_ERROR, + }, + }, + { +/* *************** SERVICE ID for IGPU_GPCCS*************** */ + .num_errs = 3, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_CORRECTED, + NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_DMEM_ECC_UNCORRECTED, + }, + }, + { +/* *************** SERVICE ID for IGPU_MMU*************** */ + .num_errs = 2, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_SA_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_FA_DATA_ECC_UNCORRECTED, + }, + }, + { +/* *************** SERVICE ID for IGPU_GCC*************** */ + .num_errs = 1, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_GCC_SWERR_L15_ECC_UNCORRECTED +
}, + }, + { +/* *************** SERVICE ID for IGPU_PMU*************** */ + .num_errs = 10, + .error_id_mappings = (u32 []) { + NVGPU_ERR_INVALID, + NVGPU_ERR_INVALID, + NVGPU_ERR_INVALID, + NVGPU_ERR_INVALID, + NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_UNCORRECTED, + NVGPU_ERR_INVALID, + NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_DMEM_ECC_UNCORRECTED, + NVGPU_ERR_INVALID, + NVGPU_ERR_INVALID, + NVGUARD_SERVICE_IGPU_PMU_SWERR_BAR0_ERROR_TIMEOUT, + }, + }, + { +/* *************** SERVICE ID for IGPU_PGRAPH*************** */ + .num_errs = 21, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_FE_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MEMFMT_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_PD_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SCC_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_DS_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SSYNC_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MME_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SKED_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MPC_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR, + NVGPU_ERR_INVALID, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION, + NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION, + NVGPU_ERR_INVALID, + }, + }, + { +/* *************** SERVICE ID for IGPU_LTC*************** */ + .num_errs = 4, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_CORRECTED, + NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_TSTG_ECC_UNCORRECTED, + NVGPU_ERR_INVALID, + }, + }, + { +/* *************** SERVICE ID for
IGPU_HUBMMU*************** */ + .num_errs = 9, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_L2TLB_SA_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_TLB_SA_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PTE_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PDE0_DATA_ECC_UNCORRECTED, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR, + NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR, + }, + }, + { +/* *************** SERVICE ID for IGPU_PRI*************** */ + .num_errs = 2, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_PRI_SWERR_TIMEOUT_ERROR, + NVGUARD_SERVICE_IGPU_PRI_SWERR_ACCESS_VIOLATION, + }, + }, + { +/* *************** SERVICE ID for IGPU_CE*************** */ + .num_errs = 5, + .error_id_mappings = (u32 []) { + NVGUARD_SERVICE_IGPU_CE_SWERR_LAUNCH_ERROR, + NVGUARD_SERVICE_IGPU_CE_SWERR_METHOD_BUFFER_FAULT, + NVGPU_ERR_INVALID, + NVGPU_ERR_INVALID, + NVGUARD_SERVICE_IGPU_CE_SWERR_INVALID_CONFIG, + }, + }, +}; + +static int nvgpu_l1ss_report_error_linux(struct gk20a *g, u32 hw_unit_id, u32 err_id, + bool is_critical) +{ + int err = 0; + u32 nv_service_id = 0; + u8 err_status = 0; + u64 timestamp = (u64)nvgpu_current_time_ns(); + nv_guard_request_t req; + + if (hw_unit_id >= sizeof(mappings)) { + nvgpu_err(g, "Error Id H/W index out of bounds\n"); + return -EINVAL; + } else if (err_id >= mappings[hw_unit_id].num_errs) { + nvgpu_err(g, "Error Id index out of bounds\n"); + return -EINVAL; + } + + memset(&req, 0, sizeof(req)); + + nv_service_id = mappings[hw_unit_id].error_id_mappings[err_id]; + + if (nv_service_id == NVGPU_ERR_INVALID) { + /* error id not supported */ + return -EOPNOTSUPP; + } + + if (is_critical) + err_status = NVGUARD_ERROR_DETECTED; + else + err_status = NVGUARD_NO_ERROR; + + req.srv_id_cmd = 
NVGUARD_SERVICESTATUS_NOTIFICATION; + req.srv_status.srv_id = (nv_guard_service_id_t)nv_service_id; + req.srv_status.status = err_status; + req.srv_status.timestamp = timestamp; + + /* + * l1ss_submit_rq may fail due to kmalloc failures but may pass in + * subsequent calls + */ + err = l1ss_submit_rq(&req, true); + if (err != 0) + nvgpu_err(g, "Error returned from L1SS submit %d", err); + + return err; +} + +static int nvgpu_l1ss_report_error_empty(struct gk20a *g, + u32 hw_unit_id, u32 err_id, bool is_critical) +{ + nvgpu_log(g, gpu_dbg_info, "ECC reporting is empty"); + return -EOPNOTSUPP; +} + +static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data) +{ + struct gk20a *g = (struct gk20a *)data; + struct nvgpu_os_linux *l = NULL; + struct nvgpu_l1ss_ecc_reporting *l1ss_linux_ecc_reporting = NULL; + int err = 0; + /* Ensure we have a valid gk20a struct before proceeding */ + if ((g == NULL) || (nvgpu_get(g) == NULL)) + return -ENODEV; + + l = nvgpu_os_linux_from_gk20a(g); + l1ss_linux_ecc_reporting = l->l1ss_linux_ecc_reporting; + + nvgpu_spinlock_acquire(&l1ss_linux_ecc_reporting->lock); + if (param == L1SS_READY) { + if (!l1ss_linux_ecc_reporting->service_enabled) { + l1ss_linux_ecc_reporting->service_enabled = true; + nvgpu_log(g, gpu_dbg_info, "ECC reporting is enabled"); + } + } else if (param == L1SS_NOT_READY) { + if (l1ss_linux_ecc_reporting->service_enabled) { + l1ss_linux_ecc_reporting->service_enabled = false; + nvgpu_log(g, gpu_dbg_info, "ECC reporting is disabled"); + } + } else { + err = -EINVAL; + } + nvgpu_spinlock_release(&l1ss_linux_ecc_reporting->lock); + + nvgpu_put(g); + + return err; +} + +void nvgpu_l1ss_init_reporting(struct gk20a *g) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = NULL; + int err = 0; + + l->l1ss_linux_ecc_reporting = nvgpu_kzalloc(g, sizeof(*l->l1ss_linux_ecc_reporting)); + if (l->l1ss_linux_ecc_reporting == NULL) { + nvgpu_err(g, "unable 
to allocate memory for l1ss safety services"); + return; + } + + ecc_report_linux = l->l1ss_linux_ecc_reporting; + + /* This will invoke the registration API */ + nvgpu_spinlock_init(&ecc_report_linux->lock); + ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK); + ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback; + ecc_report_linux->priv.data = g; + + nvgpu_log(g, gpu_dbg_info, "ECC reporting Init (L1SS)"); + + /* + * err == 0 indicates service is available but not active yet. + * err == 1 indicates service is available and active + * error for other cases. + */ + err = l1ss_register_client(&ecc_report_linux->priv); + if (err == 0) { + nvgpu_spinlock_acquire(&ecc_report_linux->lock); + ecc_report_linux->service_enabled = false; + nvgpu_spinlock_release(&ecc_report_linux->lock); + nvgpu_log(g, gpu_dbg_info, "ECC reporting init success"); + } else if (err == 1) { + nvgpu_spinlock_acquire(&ecc_report_linux->lock); + ecc_report_linux->service_enabled = true; + nvgpu_spinlock_release(&ecc_report_linux->lock); + nvgpu_log(g, gpu_dbg_info, "ECC reporting init started"); + } else { + nvgpu_log(g, gpu_dbg_info, "ECC reporting init failure %d", err); + } +} + +void nvgpu_l1ss_deinit_reporting(struct gk20a *g) +{ + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting; + + if (ecc_report_linux == NULL) + return; + + nvgpu_spinlock_acquire(&ecc_report_linux->lock); + if (ecc_report_linux->service_enabled) { + ecc_report_linux->service_enabled = false; + } + nvgpu_spinlock_release(&ecc_report_linux->lock); + + (void)l1ss_deregister_client(ecc_report_linux->priv.id); + nvgpu_log(g, gpu_dbg_info, "ECC reporting de-init success"); + + nvgpu_kfree(g, ecc_report_linux); + l->l1ss_linux_ecc_reporting = NULL; +} + +int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id) +{ + int ret = 0; + bool service_enabled; + + /* - HW_unit_id (4-bits: bit-0 to 3), + * - 
Error_id (5-bits: bit-4 to 8), + * - Corrected/Uncorrected error (1-bit: bit-9), + * - Remaining 22-bits are unused. + */ + + u32 hw_unit = (err_id & HW_UNIT_ID_MASK); + u32 error_id = ((err_id >> ERR_ID_FIELD_SHIFT) & ERR_ID_MASK); + bool is_critical = ((err_id & (1 << CORRECTED_BIT_FIELD_SHIFT)) != 0U) ? true : false; + + struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); + struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting; + + nvgpu_log(g, gpu_dbg_info, "hw_unit = %u, error_id = %u, is_critical = %d", + hw_unit, error_id, is_critical); + + nvgpu_spinlock_acquire(&ecc_report_linux->lock); + service_enabled = ecc_report_linux->service_enabled; + nvgpu_spinlock_release(&ecc_report_linux->lock); + + if (service_enabled) { + ret = nvgpu_l1ss_report_error_linux(g, hw_unit, error_id, is_critical); + } else { + ret = nvgpu_l1ss_report_error_empty(g, hw_unit, err_id, is_critical); + } + + return ret; +} diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index 163ab1f74..415c2e6ff 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -70,6 +70,7 @@ #include #include #include +#include #include "platform_gk20a.h" #include "sysfs.h" @@ -1016,6 +1017,10 @@ void gk20a_remove_support(struct gk20a *g) struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); struct sim_nvgpu_linux *sim_linux; +#ifdef CONFIG_TEGRA_L1SS_SUPPORT + nvgpu_l1ss_deinit_reporting(g); +#endif + #if NVGPU_VPR_RESIZE_SUPPORTED if (nvgpu_is_enabled(g, NVGPU_SUPPORT_VPR)) { tegra_unregister_idle_unidle(gk20a_do_idle); @@ -1865,6 +1870,10 @@ static int gk20a_probe(struct platform_device *dev) if (err) goto return_err; +#ifdef CONFIG_TEGRA_L1SS_SUPPORT + nvgpu_l1ss_init_reporting(gk20a); +#endif + nvgpu_mutex_init(&l->dmabuf_priv_list_lock); nvgpu_init_list_node(&l->dmabuf_priv_list); diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h index a57b39930..eb277f6a4 
100644 --- a/drivers/gpu/nvgpu/os/linux/os_linux.h +++ b/drivers/gpu/nvgpu/os/linux/os_linux.h @@ -108,6 +108,10 @@ struct nvgpu_os_linux { struct nvgpu_os_linux_ops ops; +#ifdef CONFIG_TEGRA_L1SS_SUPPORT + struct nvgpu_l1ss_ecc_reporting *l1ss_linux_ecc_reporting; +#endif + struct notifier_block nvgpu_reboot_nb; #ifdef CONFIG_DEBUG_FS