diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml
index 932a71008..72133bb32 100644
--- a/arch/nvgpu-common.yaml
+++ b/arch/nvgpu-common.yaml
@@ -15,7 +15,8 @@ nvgpu:
   sources: [ include/nvgpu/gk20a.h,
              include/nvgpu/nvgpu_common.h,
              include/nvgpu/cov_whitelist.h,
-             include/nvgpu/static_analysis.h ]
+             include/nvgpu/static_analysis.h,
+             include/nvgpu/l1ss_err_reporting.h ]
 
 bios:
   safe: yes
diff --git a/arch/nvgpu-hal-new.yaml b/arch/nvgpu-hal-new.yaml
index b4ab29661..950c139d5 100644
--- a/arch/nvgpu-hal-new.yaml
+++ b/arch/nvgpu-hal-new.yaml
@@ -1027,7 +1027,9 @@ tpc:
 cic:
   safe: yes
   owner: Tejal K
-  sources: [ hal/cic/mon/init_ga10b_fusa.c,
+  sources: [ hal/cic/mon/init_gv11b_non_fusa.c,
+             hal/cic/mon/cic_gv11b.h,
+             hal/cic/mon/init_ga10b_fusa.c,
              hal/cic/mon/lut_ga10b_fusa.c,
              hal/cic/mon/cic_ga10b.h ]
 
diff --git a/arch/nvgpu-linux.yaml b/arch/nvgpu-linux.yaml
index 2087ac487..44b3c193c 100644
--- a/arch/nvgpu-linux.yaml
+++ b/arch/nvgpu-linux.yaml
@@ -241,7 +241,8 @@ vm:
              os/linux/nvgpu_ivm.c ]
 
 cic:
-  sources: [ os/linux/cic/cic_report_err.c ]
+  sources: [ os/linux/cic/cic_report_err.c,
+             os/linux/cic/l1ss_report_err.c ]
 
 # Group all the Linux headers for now.
 headers:
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index f2d16148a..c3ce19ab9 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -61,6 +61,10 @@ ifeq ($(CONFIG_NVGPU_IVM_BUILD),y)
 ccflags-y += -DCONFIG_NVGPU_IVM_BUILD
 endif
 
+ifeq ($(CONFIG_TEGRA_L1SS_SUPPORT),y)
+ccflags-y += -DCONFIG_TEGRA_L1SS_SUPPORT
+endif
+
 ccflags-y += -DCONFIG_NVGPU_DETERMINISTIC_CHANNELS
 ccflags-y += -DCONFIG_NVGPU_STATIC_POWERGATE
 ccflags-y += -DCONFIG_NVGPU_ACR_LEGACY
@@ -606,6 +610,10 @@ nvgpu-$(CONFIG_NVGPU_SUPPORT_CDE) += \
 	os/linux/debug_cde.o
 endif
 
+nvgpu-$(CONFIG_TEGRA_L1SS_SUPPORT) += \
+	os/linux/cic/l1ss_report_err.o \
+	hal/cic/mon/init_gv11b_non_fusa.o
+
 nvgpu-y += \
 	common/mm/allocators/nvgpu_allocator.o \
 	common/mm/allocators/bitmap_allocator.o \
diff --git a/drivers/gpu/nvgpu/Makefile.linux.configs b/drivers/gpu/nvgpu/Makefile.linux.configs
index 7be955f4c..bdeae5544 100644
--- a/drivers/gpu/nvgpu/Makefile.linux.configs
+++ b/drivers/gpu/nvgpu/Makefile.linux.configs
@@ -112,6 +112,11 @@ ifdef CONFIG_TEGRA_EPL
 CONFIG_NVGPU_ENABLE_MISC_EC := y
 endif
 
+CONFIG_TEGRA_L1SS_SUPPORT := n
+ifdef CONFIG_TEGRA_SAFETY
+CONFIG_TEGRA_L1SS_SUPPORT := y
+endif
+
 CONFIG_NVGPU_NVMEM_FUSE := n
 CONFIG_NVGPU_NVMAP_NEXT := y
 
diff --git a/drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h b/drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h
new file mode 100644
index 000000000..923a84581
--- /dev/null
+++ b/drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CIC_GV11B_H
+#define CIC_GV11B_H
+
+#include
+
+struct gk20a;
+struct nvgpu_cic_mon;
+
+/*
+ * NOTE(review): gv11b_cic_mon_init() currently installs ga10b_err_lut and
+ * size_of_ga10b_lut; confirm where these gv11b symbols are defined and used.
+ */
+extern struct nvgpu_err_hw_module gv11b_err_lut[];
+extern u32 size_of_gv11b_lut;
+
+/*
+ * Install the CIC monitor error lookup table in cic_mon.
+ * Returns 0 on success or -EINVAL if cic_mon is NULL.
+ */
+int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon);
+
+#endif /* CIC_GV11B_H */
diff --git a/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c
new file mode 100644
index 000000000..33e9bdb7e
--- /dev/null
+++ b/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+
+#include "common/cic/mon/cic_mon_priv.h"
+#include "cic_ga10b.h"
+#include "cic_gv11b.h"
+
+int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon)
+{
+	if (cic_mon == NULL) {
+		nvgpu_err(g, "Invalid CIC reference pointer.");
+		return -EINVAL;
+	}
+
+	cic_mon->err_lut = ga10b_err_lut;
+	cic_mon->num_hw_modules = size_of_ga10b_lut;
+	return 0;
+}
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
index 3f36740c1..b87eb5792 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -49,6 +49,8 @@
 #include
 #endif
 
+#include
+
 #include "hal/mm/mm_gp10b.h"
 #include "hal/mm/mm_gv11b.h"
 #include "hal/mm/cache/flush_gk20a.h"
@@ -199,6 +201,10 @@
 #include "hal/tpc/tpc_gv11b.h"
 #endif
 
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+#include "hal/cic/mon/cic_gv11b.h"
+#endif
+
 #include "hal_gv11b.h"
 #include "hal_gv11b_litter.h"
 
@@ -1488,6 +1494,13 @@ static const struct gops_grmgr gv11b_ops_grmgr = {
 	.init_gr_manager = nvgpu_init_gr_manager,
 };
 
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+static const struct gops_cic_mon gv11b_ops_cic_mon = {
+	.init = gv11b_cic_mon_init,
+	.report_err = nvgpu_l1ss_report_err
+};
+#endif
+
 int gv11b_init_hal(struct gk20a *g)
 {
 	struct gpu_ops *gops = &g->ops;
@@ -1587,6 +1600,9 @@ int gv11b_init_hal(struct gk20a *g)
 	gops->gpc_pg = gv11b_ops_gpc_pg;
 #endif
 	gops->grmgr = gv11b_ops_grmgr;
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+	gops->cic_mon = gv11b_ops_cic_mon;
+#endif
 	gops->chip_init_gpu_characteristics = gv11b_init_gpu_characteristics;
 	gops->get_litter_value = gv11b_get_litter_value;
 	gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h b/drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h
new file mode 100644
index 000000000..8af23e167
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h
@@ -0,0 +1,41 @@
+
+/*
+ *
+ * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef NVGPU_L1SS_ECC_H
+#define NVGPU_L1SS_ECC_H
+
+#include
+
+struct gk20a;
+struct nvgpu_l1ss_ecc_reporting;
+
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+/* Report the error encoded in err_id to L1SS; returns 0 on success, non-zero otherwise. */
+int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id);
+/* Deregister from L1SS and free the reporting state allocated by init. */
+void nvgpu_l1ss_deinit_reporting(struct gk20a *g);
+/* Allocate the reporting state and register with L1SS. */
+void nvgpu_l1ss_init_reporting(struct gk20a *g);
+#endif
+
+#endif /* NVGPU_L1SS_ECC_H */
diff --git a/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c b/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c
new file mode 100644
index 000000000..52026cf85
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2022, NVIDIA Corporation.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "os/linux/os_linux.h"
+
+/* Table entry for an error index that has no L1SS service id. */
+#define NVGPU_ERR_INVALID U32_MAX
+
+/* Per-GPU registration state for the L1SS client. */
+struct nvgpu_l1ss_ecc_reporting {
+	struct gk20a *g;
+	client_param_t priv;
+	bool service_enabled;
+	/* protects service enabled */
+	struct nvgpu_spinlock lock;
+};
+
+/*
+ * Lookup table for one HW unit: error_id_mappings[] is indexed by the error
+ * index, and only indices below num_errs are accepted by the report path.
+ */
+struct nvgpu_l1ss_error_id_mappings {
+	u32 num_errs;
+	u32 *error_id_mappings;
+};
+
+/* Indexed by HW unit id (low bits of the err_id given to nvgpu_l1ss_report_err). */
+static struct nvgpu_l1ss_error_id_mappings mappings[] = {
+	{
+/* *************** SERVICE ID for IGPU_HOST*************** */
+		.num_errs = 16,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_BIND_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_SCHED_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CHSW_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_MEMOP_TIMEOUT_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_LB_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_SQUASH_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_FECS_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_TIMEOUT_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_TIMEOUT_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_EXTRA_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_GPFIFO_PB_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_METHOD_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_SIGNATURE_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_HCE_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CTXSW_TIMEOUT_ERROR,
+			NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_FB_FLUSH_TIMEOUT_ERROR,
+			NVGPU_ERR_INVALID,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_SM*************** */
+		.num_errs = 11,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_CBU_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_LRF_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_L1_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L1_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_SM_SWERR_MACHINE_CHECK_ERROR,
+			NVGPU_ERR_INVALID,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_FECS*************** */
+		.num_errs = 7,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_WATCHDOG_TIMEOUT,
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_CRC_MISMATCH,
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_FAULT_DURING_CTXSW,
+			NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_INIT_ERROR,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_GPCCS*************** */
+		.num_errs = 3,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_CORRECTED,
+			NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_MMU*************** */
+		.num_errs = 2,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_SA_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_FA_DATA_ECC_UNCORRECTED,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_GCC*************** */
+		.num_errs = 1,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_GCC_SWERR_L15_ECC_UNCORRECTED
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_PMU*************** */
+		.num_errs = 10,
+		.error_id_mappings = (u32 []) {
+			NVGPU_ERR_INVALID,
+			NVGPU_ERR_INVALID,
+			NVGPU_ERR_INVALID,
+			NVGPU_ERR_INVALID,
+			NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
+			NVGPU_ERR_INVALID,
+			NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
+			NVGPU_ERR_INVALID,
+			NVGPU_ERR_INVALID,
+			NVGUARD_SERVICE_IGPU_PMU_SWERR_BAR0_ERROR_TIMEOUT,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_PGRAPH*************** */
+		.num_errs = 21,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_FE_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MEMFMT_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_PD_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SCC_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_DS_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SSYNC_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MME_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SKED_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MPC_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
+			NVGPU_ERR_INVALID,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
+			NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
+			NVGPU_ERR_INVALID,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_LTC*************** */
+		.num_errs = 4,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_CORRECTED,
+			NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_TSTG_ECC_UNCORRECTED,
+			NVGPU_ERR_INVALID,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_HUBMMU*************** */
+		.num_errs = 9,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_L2TLB_SA_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_TLB_SA_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PTE_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PDE0_DATA_ECC_UNCORRECTED,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
+			NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_PRI*************** */
+		.num_errs = 2,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_PRI_SWERR_TIMEOUT_ERROR,
+			NVGUARD_SERVICE_IGPU_PRI_SWERR_ACCESS_VIOLATION,
+		},
+	},
+	{
+/* *************** SERVICE ID for IGPU_CE*************** */
+		.num_errs = 5,
+		.error_id_mappings = (u32 []) {
+			NVGUARD_SERVICE_IGPU_CE_SWERR_LAUNCH_ERROR,
+			NVGUARD_SERVICE_IGPU_CE_SWERR_METHOD_BUFFER_FAULT,
+			NVGPU_ERR_INVALID,
+			NVGPU_ERR_INVALID,
+			NVGUARD_SERVICE_IGPU_CE_SWERR_INVALID_CONFIG,
+		},
+	},
+};
+
+/*
+ * Translate (hw_unit_id, err_id) to an L1SS service id and submit a service
+ * status notification for it.
+ *
+ * Returns -EINVAL when either index is outside the lookup tables,
+ * -EOPNOTSUPP when the table entry is NVGPU_ERR_INVALID, and otherwise the
+ * value returned by l1ss_submit_rq().
+ */
+static int nvgpu_l1ss_report_error_linux(struct gk20a *g, u32 hw_unit_id, u32 err_id,
+		bool is_critical)
+{
+	int err = 0;
+	u32 nv_service_id = 0;
+	u8 err_status = 0;
+	u64 timestamp = (u64)nvgpu_current_time_ns();
+	nv_guard_request_t req;
+
+	/* Bound by the element count; sizeof(mappings) alone is a byte count. */
+	if (hw_unit_id >= (sizeof(mappings) / sizeof(mappings[0]))) {
+		nvgpu_err(g, "Error Id H/W index out of bounds\n");
+		return -EINVAL;
+	} else if (err_id >= mappings[hw_unit_id].num_errs) {
+		nvgpu_err(g, "Error Id index out of bounds\n");
+		return -EINVAL;
+	}
+
+	memset(&req, 0, sizeof(req));
+
+	nv_service_id = mappings[hw_unit_id].error_id_mappings[err_id];
+
+	if (nv_service_id == NVGPU_ERR_INVALID) {
+		/* error id not supported */
+		return -EOPNOTSUPP;
+	}
+
+	if (is_critical)
+		err_status = NVGUARD_ERROR_DETECTED;
+	else
+		err_status = NVGUARD_NO_ERROR;
+
+	req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
+	req.srv_status.srv_id = (nv_guard_service_id_t)nv_service_id;
+	req.srv_status.status = err_status;
+	req.srv_status.timestamp = timestamp;
+
+	/*
+	 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
+	 * subsequent calls
+	 */
+	err = l1ss_submit_rq(&req, true);
+	if (err != 0)
+		nvgpu_err(g, "Error returned from L1SS submit %d", err);
+
+	return err;
+}
+
+/* Used while the L1SS service is not enabled: logs and reports nothing. */
+static int nvgpu_l1ss_report_error_empty(struct gk20a *g,
+		u32 hw_unit_id, u32 err_id, bool is_critical)
+{
+	nvgpu_log(g, gpu_dbg_info, "ECC reporting is empty");
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Client callback handed to l1ss_register_client(): records in
+ * service_enabled whether L1SS reported L1SS_READY or L1SS_NOT_READY.
+ * Returns -ENODEV when no GPU reference can be taken and -EINVAL for any
+ * other param value.
+ */
+static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
+{
+	struct gk20a *g = (struct gk20a *)data;
+	struct nvgpu_os_linux *l = NULL;
+	struct nvgpu_l1ss_ecc_reporting *l1ss_linux_ecc_reporting = NULL;
+	int err = 0;
+	/* Ensure we have a valid gk20a struct before proceeding */
+	if ((g == NULL) || (nvgpu_get(g) == NULL))
+		return -ENODEV;
+
+	l = nvgpu_os_linux_from_gk20a(g);
+	l1ss_linux_ecc_reporting = l->l1ss_linux_ecc_reporting;
+
+	nvgpu_spinlock_acquire(&l1ss_linux_ecc_reporting->lock);
+	if (param == L1SS_READY) {
+		if (!l1ss_linux_ecc_reporting->service_enabled) {
+			l1ss_linux_ecc_reporting->service_enabled = true;
+			nvgpu_log(g, gpu_dbg_info, "ECC reporting is enabled");
+		}
+	} else if (param == L1SS_NOT_READY) {
+		if (l1ss_linux_ecc_reporting->service_enabled) {
+			l1ss_linux_ecc_reporting->service_enabled = false;
+			nvgpu_log(g, gpu_dbg_info, "ECC reporting is disabled");
+		}
+	} else {
+		err = -EINVAL;
+	}
+	nvgpu_spinlock_release(&l1ss_linux_ecc_reporting->lock);
+
+	nvgpu_put(g);
+
+	return err;
+}
+
+/*
+ * Allocate the reporting state and register this GPU as an L1SS client.
+ * If the allocation fails the state pointer stays NULL and reporting is
+ * unavailable; nvgpu_l1ss_report_err() then returns -EOPNOTSUPP.
+ */
+void nvgpu_l1ss_init_reporting(struct gk20a *g)
+{
+	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+	struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = NULL;
+	int err = 0;
+
+	l->l1ss_linux_ecc_reporting = nvgpu_kzalloc(g, sizeof(*l->l1ss_linux_ecc_reporting));
+	if (l->l1ss_linux_ecc_reporting == NULL) {
+		nvgpu_err(g, "unable to allocate memory for l1ss safety services");
+		return;
+	}
+
+	ecc_report_linux = l->l1ss_linux_ecc_reporting;
+
+	/* This will invoke the registration API */
+	nvgpu_spinlock_init(&ecc_report_linux->lock);
+	ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
+	ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
+	ecc_report_linux->priv.data = g;
+
+	nvgpu_log(g, gpu_dbg_info, "ECC reporting Init (L1SS)");
+
+	/*
+	 * err == 0 indicates service is available but not active yet.
+	 * err == 1 indicates service is available and active
+	 * error for other cases.
+	 */
+	err = l1ss_register_client(&ecc_report_linux->priv);
+	if (err == 0) {
+		nvgpu_spinlock_acquire(&ecc_report_linux->lock);
+		ecc_report_linux->service_enabled = false;
+		nvgpu_spinlock_release(&ecc_report_linux->lock);
+		nvgpu_log(g, gpu_dbg_info, "ECC reporting init success");
+	} else if (err == 1) {
+		nvgpu_spinlock_acquire(&ecc_report_linux->lock);
+		ecc_report_linux->service_enabled = true;
+		nvgpu_spinlock_release(&ecc_report_linux->lock);
+		nvgpu_log(g, gpu_dbg_info, "ECC reporting init started");
+	} else {
+		nvgpu_err(g, "ECC reporting init failure %d", err);
+	}
+}
+
+/* Deregister from L1SS and free the state; a no-op if init never allocated it. */
+void nvgpu_l1ss_deinit_reporting(struct gk20a *g)
+{
+	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+	struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting;
+
+	if (ecc_report_linux == NULL)
+		return;
+
+	nvgpu_spinlock_acquire(&ecc_report_linux->lock);
+	if (ecc_report_linux->service_enabled) {
+		ecc_report_linux->service_enabled = false;
+	}
+	nvgpu_spinlock_release(&ecc_report_linux->lock);
+
+	(void)l1ss_deregister_client(ecc_report_linux->priv.id);
+	nvgpu_log(g, gpu_dbg_info, "ECC reporting de-init success");
+
+	nvgpu_kfree(g, ecc_report_linux);
+	l->l1ss_linux_ecc_reporting = NULL;
+}
+
+/*
+ * Decode err_id and forward it to L1SS when the service is enabled.
+ *
+ * Returns -EOPNOTSUPP when reporting was never initialised, has been torn
+ * down, or the service is not enabled; otherwise the result of the L1SS
+ * report path.
+ */
+int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id)
+{
+	int ret = 0;
+	bool service_enabled;
+
+	/* - HW_unit_id (4-bits: bit-0 to 3),
+	 * - Error_id (5-bits: bit-4 to 8),
+	 * - Corrected/Uncorrected error (1-bit: bit-9),
+	 * - Remaining 22-bits are unused.
+	 */
+
+	u32 hw_unit = (err_id & HW_UNIT_ID_MASK);
+	u32 error_id = ((err_id >> ERR_ID_FIELD_SHIFT) & ERR_ID_MASK);
+	bool is_critical = ((err_id & (1U << CORRECTED_BIT_FIELD_SHIFT)) != 0U);
+
+	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+	struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting;
+
+	nvgpu_log(g, gpu_dbg_info, "hw_unit = %u, error_id = %u, is_critical = %d",
+			hw_unit, error_id, is_critical);
+
+	/*
+	 * The state is NULL if init failed to allocate it or deinit already
+	 * ran; treat that like a disabled service instead of dereferencing it.
+	 */
+	if (ecc_report_linux == NULL)
+		return nvgpu_l1ss_report_error_empty(g, hw_unit, error_id, is_critical);
+
+	nvgpu_spinlock_acquire(&ecc_report_linux->lock);
+	service_enabled = ecc_report_linux->service_enabled;
+	nvgpu_spinlock_release(&ecc_report_linux->lock);
+
+	if (service_enabled) {
+		ret = nvgpu_l1ss_report_error_linux(g, hw_unit, error_id, is_critical);
+	} else {
+		ret = nvgpu_l1ss_report_error_empty(g, hw_unit, error_id, is_critical);
+	}
+
+	return ret;
+}
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c
index 163ab1f74..415c2e6ff 100644
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -70,6 +70,7 @@
 #include
 #include
 #include
+#include
 
 #include "platform_gk20a.h"
 #include "sysfs.h"
@@ -1016,6 +1017,10 @@ void gk20a_remove_support(struct gk20a *g)
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
 	struct sim_nvgpu_linux *sim_linux;
 
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+	nvgpu_l1ss_deinit_reporting(g);
+#endif
+
 #if NVGPU_VPR_RESIZE_SUPPORTED
 	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_VPR)) {
 		tegra_unregister_idle_unidle(gk20a_do_idle);
@@ -1865,6 +1870,10 @@ static int gk20a_probe(struct platform_device *dev)
 	if (err)
 		goto return_err;
 
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+	nvgpu_l1ss_init_reporting(gk20a);
+#endif
+
 	nvgpu_mutex_init(&l->dmabuf_priv_list_lock);
 	nvgpu_init_list_node(&l->dmabuf_priv_list);
 
diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h
index a57b39930..eb277f6a4 100644
--- a/drivers/gpu/nvgpu/os/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/os/linux/os_linux.h
@@ -108,6 +108,10 @@ struct nvgpu_os_linux {
 
 	struct nvgpu_os_linux_ops ops;
 
+#ifdef CONFIG_TEGRA_L1SS_SUPPORT
+	struct nvgpu_l1ss_ecc_reporting *l1ss_linux_ecc_reporting;
+#endif
+
 	struct notifier_block nvgpu_reboot_nb;
 
 #ifdef CONFIG_DEBUG_FS