From ddbd2da4a9fc4416da616a734fcd3922b2333158 Mon Sep 17 00:00:00 2001 From: Kishan Date: Tue, 7 Mar 2023 11:34:57 +0000 Subject: [PATCH] gpu: nvgpu: Fix gv11b LUT for safe jetpack product The current ga10b LUT used in gv11b is tailor-made for auto safety, wherein non-ECC errors are treated as fatal and accordingly quiesce is triggered. Recovery is also not supported. Jetson industrial expects recovery in scenarios where it can be supported. Replaced ga10b automotive safety based LUT with gv11b safe jetpack specific LUT. With this LUT, error criticality is consistent across rel-32 and rel-35. The supported behaviour is: 1. Corrected ECC error: we report it as a non-fatal error and only convey the error to L1SS. 2. Uncorrected ECC error: we report it as a fatal error and hence trigger quiesce. 3. Non-ECC error: we report it as non-fatal and let nvgpu perform recovery if it exists. Bug 3920935 Change-Id: Iaa64aa91d6dd84b21c4d0c4684ead498e398698a Signed-off-by: Kishan Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2866975 Reviewed-by: Tejal Kudav Reviewed-by: Sagar Kamble Reviewed-by: Bibek Basu GVS: Gerrit_Virtual_Submit --- arch/nvgpu-hal-new.yaml | 3 +- drivers/gpu/nvgpu/Makefile | 1 + .../nvgpu/hal/cic/mon/init_gv11b_non_fusa.c | 7 +- .../nvgpu/hal/cic/mon/lut_gv11b_non_fusa.c | 601 ++++++++++++++++++ .../gpu/nvgpu/os/linux/cic/l1ss_report_err.c | 5 +- 5 files changed, 611 insertions(+), 6 deletions(-) create mode 100644 drivers/gpu/nvgpu/hal/cic/mon/lut_gv11b_non_fusa.c diff --git a/arch/nvgpu-hal-new.yaml b/arch/nvgpu-hal-new.yaml index 950c139d5..69650ba6a 100644 --- a/arch/nvgpu-hal-new.yaml +++ b/arch/nvgpu-hal-new.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. All Rights Reserved. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All Rights Reserved. # # HAL units. These are the units that have access to HW. 
# @@ -1031,6 +1031,7 @@ cic: hal/cic/mon/cic_gv11b.h, hal/cic/mon/init_ga10b_fusa.c, hal/cic/mon/lut_ga10b_fusa.c, + hal/cic/mon/lut_gv11b_non_fusa.c, hal/cic/mon/cic_ga10b.h ] grmgr: diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 4a92eb891..3dd89823b 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -617,6 +617,7 @@ endif nvgpu-$(CONFIG_TEGRA_L1SS_SUPPORT) += \ os/linux/cic/l1ss_report_err.o \ hal/cic/mon/init_gv11b_non_fusa.o \ + hal/cic/mon/lut_gv11b_non_fusa.o \ nvgpu-y += \ common/mm/allocators/nvgpu_allocator.o \ diff --git a/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c index 33e9bdb7e..30db2c68d 100644 --- a/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c +++ b/drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,7 +23,6 @@ #include #include "common/cic/mon/cic_mon_priv.h" -#include "cic_ga10b.h" #include "cic_gv11b.h" int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon) @@ -33,7 +32,7 @@ int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon) return -EINVAL; } - cic_mon->err_lut = ga10b_err_lut; - cic_mon->num_hw_modules = size_of_ga10b_lut; + cic_mon->err_lut = gv11b_err_lut; + cic_mon->num_hw_modules = size_of_gv11b_lut; return 0; } diff --git a/drivers/gpu/nvgpu/hal/cic/mon/lut_gv11b_non_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/lut_gv11b_non_fusa.c new file mode 100644 index 000000000..6beba7d94 --- /dev/null +++ b/drivers/gpu/nvgpu/hal/cic/mon/lut_gv11b_non_fusa.c @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include "common/cic/mon/cic_mon_priv.h" +#include "cic_gv11b.h" + +/* + * A flag to enable/disable hw error injection. + */ +#ifdef CONFIG_NVGPU_INJECT_HWERR +#define INJECT_TYPE (INJECT_HW) +#else +#define INJECT_TYPE (INJECT_SW) +#endif + +/* This look-up table initializes the list of hw units and their errors. + * It also specifies the error injection mechanism supported, for each error. + * In case of hw error injection support, this initialization will be overridden + * by the values provided from the hal layers of corresponding hw units. 
+ */ +struct nvgpu_err_hw_module gv11b_err_lut[] = { + { + .name = "host", + .hw_unit = (u32)NVGPU_ERR_MODULE_HOST, + .num_instances = 1U, + .num_errs = 16U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("pfifo_bind_error", + GPU_HOST_PFIFO_BIND_ERROR, INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pfifo_sched_error", + GPU_HOST_PFIFO_SCHED_ERROR, INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pfifo_chsw_error", + GPU_HOST_PFIFO_CHSW_ERROR, INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pfifo_memop_error", + GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pfifo_lb_error", + GPU_HOST_PFIFO_LB_ERROR, INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbus_squash_error", + GPU_HOST_PBUS_SQUASH_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbus_fecs_error", + GPU_HOST_PBUS_FECS_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbus_timeout_error", + GPU_HOST_PBUS_TIMEOUT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbdma_timeout_error", + GPU_HOST_PBDMA_TIMEOUT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbdma_extra_error", + GPU_HOST_PBDMA_EXTRA_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbdma_gpfifo_pb_error", + GPU_HOST_PBDMA_GPFIFO_PB_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbdma_method_error", + GPU_HOST_PBDMA_METHOD_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbdma_signature_error", + GPU_HOST_PBDMA_SIGNATURE_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pbdma_hce_error", + GPU_HOST_PBDMA_HCE_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pfifo_ctxsw_timeout", + GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pfifo_fb_flush_timeout", + 
GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "sm", + .hw_unit = (u32)NVGPU_ERR_MODULE_SM, + .num_instances = 8U, + .num_errs = 12U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("l1_tag_ecc_corrected", + GPU_SM_L1_TAG_ECC_CORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("l1_tag_ecc_uncorrected", + GPU_SM_L1_TAG_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("cbu_ecc_uncorrected", + GPU_SM_CBU_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("lrf_ecc_uncorrected", + GPU_SM_LRF_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("l1_data_ecc_uncorrected", + GPU_SM_L1_DATA_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("icache_l0_data_ecc_uncorrected", + GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("icache_l1_data_ecc_uncorrected", + GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("icache_l0_predecode_ecc_uncorrected", + GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected", + GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected", + GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("machine_check_error", + GPU_SM_MACHINE_CHECK_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("rams_urf_ecc_uncorrected", + GPU_SM_RAMS_URF_ECC_UNCORRECTED, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "fecs", + .hw_unit = (u32)NVGPU_ERR_MODULE_FECS, + .num_instances = 1U, + .num_errs = 7U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("falcon_imem_ecc_corrected", + 
GPU_FECS_FALCON_IMEM_ECC_CORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("falcon_imem_ecc_uncorrected", + GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("falcon_dmem_ecc_uncorrected", + GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ctxsw_watchdog_timeout", + GPU_FECS_CTXSW_WATCHDOG_TIMEOUT, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ctxsw_crc_mismatch", + GPU_FECS_CTXSW_CRC_MISMATCH, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("fault_during_ctxsw", + GPU_FECS_FAULT_DURING_CTXSW, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ctxsw_init_error", + GPU_FECS_CTXSW_INIT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "gpccs", + .hw_unit = (u32)NVGPU_ERR_MODULE_GPCCS, + .num_instances = 1U, + .num_errs = 3U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("falcon_imem_ecc_corrected", + GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("falcon_imem_ecc_uncorrected", + GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("falcon_dmem_ecc_uncorrected", + GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "mmu", + .hw_unit = (u32)NVGPU_ERR_MODULE_MMU, + .num_instances = 1U, + .num_errs = 2U, + .errs = (struct nvgpu_err_desc[]) { + GPU_CRITERR("l1tlb_sa_data_ecc_uncorrected", + GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("l1tlb_fa_data_ecc_uncorrected", + GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "gcc", + .hw_unit = (u32)NVGPU_ERR_MODULE_GCC, + .num_instances = 1U, + .num_errs = 1U, + .errs = (struct nvgpu_err_desc[]) { + GPU_CRITERR("l15_ecc_uncorrected", 
+ GPU_GCC_L15_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "pmu", + .hw_unit = (u32)NVGPU_ERR_MODULE_PMU, + .num_instances = 1U, + .num_errs = 10U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("pmu_nvriscv_brom_failure", + GPU_PMU_NVRISCV_BROM_FAILURE, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pmu_access_timeout", + GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_mpu_ecc_uncorrected", + GPU_PMU_MPU_ECC_UNCORRECTED, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_illegal_access_uncorrected", + GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_imem_ecc_uncorrected", + GPU_PMU_IMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_dcls_uncorrected", + GPU_PMU_DCLS_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_dmem_ecc_uncorrected", + GPU_PMU_DMEM_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_wdt_uncorrected", + GPU_PMU_WDT_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("pmu_reg_ecc_uncorrected", + GPU_PMU_REG_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pmu_bar0_error_timeout", + GPU_PMU_BAR0_ERROR_TIMEOUT, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "pgraph", + .hw_unit = (u32)NVGPU_ERR_MODULE_PGRAPH, + .num_instances = 1U, + .num_errs = 21U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("fe_exception", + GPU_PGRAPH_FE_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("memfmt_exception", + GPU_PGRAPH_MEMFMT_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pd_exception", + GPU_PGRAPH_PD_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("scc_exception", + GPU_PGRAPH_SCC_EXCEPTION, 
+ INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ds_exception", + GPU_PGRAPH_DS_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ssync_exception", + GPU_PGRAPH_SSYNC_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("mme_exception", + GPU_PGRAPH_MME_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("sked_exception", + GPU_PGRAPH_SKED_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("be_crop_exception", + GPU_PGRAPH_BE_CROP_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("be_zrop_exception", + GPU_PGRAPH_BE_ZROP_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("mpc_exception", + GPU_PGRAPH_MPC_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("illegal_notify_error", + GPU_PGRAPH_ILLEGAL_NOTIFY_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("illegal_method_error", + GPU_PGRAPH_ILLEGAL_METHOD_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("illegal_class_error", + GPU_PGRAPH_ILLEGAL_CLASS_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("class_error", + GPU_PGRAPH_CLASS_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("gpc_gfx_prop_exception", + GPU_PGRAPH_GPC_GFX_PROP_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("gpc_gfx_zcull_exception", + GPU_PGRAPH_GPC_GFX_ZCULL_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("gpc_gfx_setup_exception", + GPU_PGRAPH_GPC_GFX_SETUP_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("gpc_gfx_pes_exception", + GPU_PGRAPH_GPC_GFX_PES_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("gpc_gfx_tpc_pe_exception", + GPU_PGRAPH_GPC_GFX_TPC_PE_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("mme_fe1_exception", + 
GPU_PGRAPH_MME_FE1_EXCEPTION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "ltc", + .hw_unit = (u32)NVGPU_ERR_MODULE_LTC, + .num_instances = 1U, + .num_errs = 4U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("cache_dstg_ecc_corrected", + GPU_LTC_CACHE_DSTG_ECC_CORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("cache_dstg_ecc_uncorrected", + GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("cache_tstg_ecc_uncorrected", + GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("cache_rstg_cbc_ecc_uncorrected", + GPU_LTC_CACHE_RSTG_CBC_ECC_UNCORRECTED, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "hubmmu", + .hw_unit = (u32)NVGPU_ERR_MODULE_HUBMMU, + .num_instances = 1U, + .num_errs = 9U, + .errs = (struct nvgpu_err_desc[]) { + GPU_CRITERR("hubmmu_l2tlb_sa_data_ecc_uncorrected", + GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_tlb_sa_data_ecc_uncorrected", + GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_pte_data_ecc_uncorrected", + GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, + INJECT_TYPE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_pde0_data_ecc_uncorrected", + GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("hubmmu_page_fault_other_fault_notify_error", + GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("hubmmu_page_fault_nonreplayable_fault_overflow_error", + GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("hubmmu_page_fault_replayable_fault_overflow_error", + GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + 
GPU_NONCRITERR("hubmmu_page_fault_replayable_fault_notify_error", + GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("hubmmu_page_fault_nonreplayable_fault_notify_error", + GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "pri", + .hw_unit = (u32)NVGPU_ERR_MODULE_PRI, + .num_instances = 1U, + .num_errs = 2U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("pri_timeout_error", + GPU_PRI_TIMEOUT_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("pri_access_violation", + GPU_PRI_ACCESS_VIOLATION, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + }, + }, + { + .name = "ce", + .hw_unit = (u32)NVGPU_ERR_MODULE_CE, + .num_instances = 1U, + .num_errs = 5U, + .errs = (struct nvgpu_err_desc[]) { + GPU_NONCRITERR("ce_launch_error", + GPU_CE_LAUNCH_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), +#ifdef CONFIG_NVGPU_NON_FUSA + GPU_NONCRITERR("ce_method_buffer_fault", + GPU_CE_METHOD_BUFFER_FAULT, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ce_fbuf_crc_fail", + GPU_CE_FBUF_CRC_FAIL, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ce_fbuf_magic_chk_fail", + GPU_CE_FBUF_MAGIC_CHK_FAIL, + INJECT_NONE, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_NONCRITERR("ce_invalid_config", + GPU_CE_INVALID_CONFIG, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), +#endif + }, + }, +}; + +u32 size_of_gv11b_lut = sizeof(gv11b_err_lut) / + sizeof(struct nvgpu_err_hw_module); diff --git a/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c b/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c index 52026cf85..8a1ac37c5 100644 --- a/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c +++ b/drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA Corporation. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA Corporation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -256,6 +256,9 @@ static int nvgpu_l1ss_report_error_linux(struct gk20a *g, u32 hw_unit_id, u32 er if (err != 0) nvgpu_err(g, "Error returned from L1SS submit %d", err); + if (is_critical) + nvgpu_sw_quiesce(g); + return err; }