gpu: nvgpu: Fix gv11b LUT for safe jetpack product

The ga10b LUT currently used for gv11b is tailor-made for automotive
safety, wherein non-ECC errors are treated as fatal and quiesce is
triggered accordingly. Recovery is also not supported.
Jetson industrial expects recovery in scenarios where it can
be supported.
Replace the ga10b automotive-safety-based LUT with a gv11b
safe-jetpack-specific LUT. With this LUT, error criticality
is consistent across rel-32 and rel-35.
The supported behaviour is:
1. Corrected ECC error: reported as a non-fatal error; the error
   is only conveyed to L1SS.
2. Uncorrected ECC error: reported as a fatal error, which
   triggers quiesce.
3. Non-ECC error: reported as non-fatal; nvgpu performs recovery
   if it exists.

Bug 3920935

Change-Id: Iaa64aa91d6dd84b21c4d0c4684ead498e398698a
Signed-off-by: Kishan <kpalankar@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2866975
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
This commit is contained in:
Kishan
2023-03-07 11:34:57 +00:00
committed by mobile promotions
parent 90e7747074
commit ddbd2da4a9
5 changed files with 611 additions and 6 deletions

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION. All Rights Reserved.
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All Rights Reserved.
#
# HAL units. These are the units that have access to HW.
#
@@ -1031,6 +1031,7 @@ cic:
hal/cic/mon/cic_gv11b.h,
hal/cic/mon/init_ga10b_fusa.c,
hal/cic/mon/lut_ga10b_fusa.c,
hal/cic/mon/lut_gv11b_non_fusa.c,
hal/cic/mon/cic_ga10b.h ]
grmgr:

View File

@@ -617,6 +617,7 @@ endif
nvgpu-$(CONFIG_TEGRA_L1SS_SUPPORT) += \
os/linux/cic/l1ss_report_err.o \
hal/cic/mon/init_gv11b_non_fusa.o \
hal/cic/mon/lut_gv11b_non_fusa.o \
nvgpu-y += \
common/mm/allocators/nvgpu_allocator.o \

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,7 +23,6 @@
#include <nvgpu/log.h>
#include "common/cic/mon/cic_mon_priv.h"
#include "cic_ga10b.h"
#include "cic_gv11b.h"
int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon)
@@ -33,7 +32,7 @@ int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon)
return -EINVAL;
}
cic_mon->err_lut = ga10b_err_lut;
cic_mon->num_hw_modules = size_of_ga10b_lut;
cic_mon->err_lut = gv11b_err_lut;
cic_mon->num_hw_modules = size_of_gv11b_lut;
return 0;
}

View File

@@ -0,0 +1,601 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/static_analysis.h>
#include "common/cic/mon/cic_mon_priv.h"
#include "cic_gv11b.h"
/*
 * Injection mechanism named by the LUT entries that use INJECT_TYPE:
 * INJECT_HW when CONFIG_NVGPU_INJECT_HWERR is defined, INJECT_SW otherwise.
 */
#if defined(CONFIG_NVGPU_INJECT_HWERR)
#define INJECT_TYPE (INJECT_HW)
#else
#define INJECT_TYPE (INJECT_SW)
#endif
/*
 * This look-up table initializes the list of hw units and their errors.
 * It also specifies the error injection mechanism supported, for each error.
 * In case of hw error injection support, this initialization will be overridden
 * by the values provided from the hal layers of corresponding hw units.
 *
 * Every module's .num_errs must equal the number of descriptors present in
 * its .errs array, including when descriptors are conditionally compiled.
 *
 * NOTE(review): consumers presumably index modules by hw unit id and
 * descriptors by error id, which would make entry order significant -
 * confirm against the cic_mon lookup code before reordering anything.
 */
struct nvgpu_err_hw_module gv11b_err_lut[] = {
	{
		.name = "host",
		.hw_unit = (u32)NVGPU_ERR_MODULE_HOST,
		.num_instances = 1U,
		.num_errs = 16U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("pfifo_bind_error",
					GPU_HOST_PFIFO_BIND_ERROR, INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pfifo_sched_error",
					GPU_HOST_PFIFO_SCHED_ERROR, INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pfifo_chsw_error",
					GPU_HOST_PFIFO_CHSW_ERROR, INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pfifo_memop_error",
					GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pfifo_lb_error",
					GPU_HOST_PFIFO_LB_ERROR, INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbus_squash_error",
					GPU_HOST_PBUS_SQUASH_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbus_fecs_error",
					GPU_HOST_PBUS_FECS_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbus_timeout_error",
					GPU_HOST_PBUS_TIMEOUT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbdma_timeout_error",
					GPU_HOST_PBDMA_TIMEOUT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbdma_extra_error",
					GPU_HOST_PBDMA_EXTRA_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbdma_gpfifo_pb_error",
					GPU_HOST_PBDMA_GPFIFO_PB_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbdma_method_error",
					GPU_HOST_PBDMA_METHOD_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbdma_signature_error",
					GPU_HOST_PBDMA_SIGNATURE_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pbdma_hce_error",
					GPU_HOST_PBDMA_HCE_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pfifo_ctxsw_timeout",
					GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pfifo_fb_flush_timeout",
					GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "sm",
		.hw_unit = (u32)NVGPU_ERR_MODULE_SM,
		.num_instances = 8U,
		.num_errs = 12U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("l1_tag_ecc_corrected",
					GPU_SM_L1_TAG_ECC_CORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("l1_tag_ecc_uncorrected",
					GPU_SM_L1_TAG_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("cbu_ecc_uncorrected",
					GPU_SM_CBU_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("lrf_ecc_uncorrected",
					GPU_SM_LRF_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("l1_data_ecc_uncorrected",
					GPU_SM_L1_DATA_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("icache_l0_data_ecc_uncorrected",
					GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("icache_l1_data_ecc_uncorrected",
					GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
					GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
					GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
					GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("machine_check_error",
					GPU_SM_MACHINE_CHECK_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("rams_urf_ecc_uncorrected",
					GPU_SM_RAMS_URF_ECC_UNCORRECTED,
					INJECT_NONE,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "fecs",
		.hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
		.num_instances = 1U,
		.num_errs = 7U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("falcon_imem_ecc_corrected",
					GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("falcon_imem_ecc_uncorrected",
					GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("falcon_dmem_ecc_uncorrected",
					GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ctxsw_watchdog_timeout",
					GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ctxsw_crc_mismatch",
					GPU_FECS_CTXSW_CRC_MISMATCH,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("fault_during_ctxsw",
					GPU_FECS_FAULT_DURING_CTXSW,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ctxsw_init_error",
					GPU_FECS_CTXSW_INIT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "gpccs",
		.hw_unit = (u32)NVGPU_ERR_MODULE_GPCCS,
		.num_instances = 1U,
		.num_errs = 3U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("falcon_imem_ecc_corrected",
					GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("falcon_imem_ecc_uncorrected",
					GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("falcon_dmem_ecc_uncorrected",
					GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "mmu",
		.hw_unit = (u32)NVGPU_ERR_MODULE_MMU,
		.num_instances = 1U,
		.num_errs = 2U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_CRITERR("l1tlb_sa_data_ecc_uncorrected",
					GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("l1tlb_fa_data_ecc_uncorrected",
					GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "gcc",
		.hw_unit = (u32)NVGPU_ERR_MODULE_GCC,
		.num_instances = 1U,
		.num_errs = 1U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_CRITERR("l15_ecc_uncorrected",
					GPU_GCC_L15_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "pmu",
		.hw_unit = (u32)NVGPU_ERR_MODULE_PMU,
		.num_instances = 1U,
		.num_errs = 10U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("pmu_nvriscv_brom_failure",
					GPU_PMU_NVRISCV_BROM_FAILURE,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pmu_access_timeout",
					GPU_PMU_ACCESS_TIMEOUT_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_mpu_ecc_uncorrected",
					GPU_PMU_MPU_ECC_UNCORRECTED,
					INJECT_NONE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_illegal_access_uncorrected",
					GPU_PMU_ILLEGAL_ACCESS_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_imem_ecc_uncorrected",
					GPU_PMU_IMEM_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_dcls_uncorrected",
					GPU_PMU_DCLS_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_dmem_ecc_uncorrected",
					GPU_PMU_DMEM_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_wdt_uncorrected",
					GPU_PMU_WDT_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("pmu_reg_ecc_uncorrected",
					GPU_PMU_REG_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pmu_bar0_error_timeout",
					GPU_PMU_BAR0_ERROR_TIMEOUT,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "pgraph",
		.hw_unit = (u32)NVGPU_ERR_MODULE_PGRAPH,
		.num_instances = 1U,
		.num_errs = 21U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("fe_exception",
					GPU_PGRAPH_FE_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("memfmt_exception",
					GPU_PGRAPH_MEMFMT_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pd_exception",
					GPU_PGRAPH_PD_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("scc_exception",
					GPU_PGRAPH_SCC_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ds_exception",
					GPU_PGRAPH_DS_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ssync_exception",
					GPU_PGRAPH_SSYNC_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("mme_exception",
					GPU_PGRAPH_MME_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("sked_exception",
					GPU_PGRAPH_SKED_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("be_crop_exception",
					GPU_PGRAPH_BE_CROP_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("be_zrop_exception",
					GPU_PGRAPH_BE_ZROP_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("mpc_exception",
					GPU_PGRAPH_MPC_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("illegal_notify_error",
					GPU_PGRAPH_ILLEGAL_NOTIFY_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("illegal_method_error",
					GPU_PGRAPH_ILLEGAL_METHOD_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("illegal_class_error",
					GPU_PGRAPH_ILLEGAL_CLASS_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("class_error",
					GPU_PGRAPH_CLASS_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("gpc_gfx_prop_exception",
					GPU_PGRAPH_GPC_GFX_PROP_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("gpc_gfx_zcull_exception",
					GPU_PGRAPH_GPC_GFX_ZCULL_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("gpc_gfx_setup_exception",
					GPU_PGRAPH_GPC_GFX_SETUP_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("gpc_gfx_pes_exception",
					GPU_PGRAPH_GPC_GFX_PES_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("gpc_gfx_tpc_pe_exception",
					GPU_PGRAPH_GPC_GFX_TPC_PE_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("mme_fe1_exception",
					GPU_PGRAPH_MME_FE1_EXCEPTION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "ltc",
		.hw_unit = (u32)NVGPU_ERR_MODULE_LTC,
		.num_instances = 1U,
		.num_errs = 4U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("cache_dstg_ecc_corrected",
					GPU_LTC_CACHE_DSTG_ECC_CORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("cache_dstg_ecc_uncorrected",
					GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("cache_tstg_ecc_uncorrected",
					GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("cache_rstg_cbc_ecc_uncorrected",
					GPU_LTC_CACHE_RSTG_CBC_ECC_UNCORRECTED,
					INJECT_NONE,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "hubmmu",
		.hw_unit = (u32)NVGPU_ERR_MODULE_HUBMMU,
		.num_instances = 1U,
		.num_errs = 9U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_CRITERR("hubmmu_l2tlb_sa_data_ecc_uncorrected",
					GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("hubmmu_tlb_sa_data_ecc_uncorrected",
					GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("hubmmu_pte_data_ecc_uncorrected",
					GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
					INJECT_TYPE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_CRITERR("hubmmu_pde0_data_ecc_uncorrected",
					GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("hubmmu_page_fault_other_fault_notify_error",
					GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("hubmmu_page_fault_nonreplayable_fault_overflow_error",
					GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("hubmmu_page_fault_replayable_fault_overflow_error",
					GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("hubmmu_page_fault_replayable_fault_notify_error",
					GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("hubmmu_page_fault_nonreplayable_fault_notify_error",
					GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "pri",
		.hw_unit = (u32)NVGPU_ERR_MODULE_PRI,
		.num_instances = 1U,
		.num_errs = 2U,
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("pri_timeout_error",
					GPU_PRI_TIMEOUT_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("pri_access_violation",
					GPU_PRI_ACCESS_VIOLATION,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
		},
	},
	{
		.name = "ce",
		.hw_unit = (u32)NVGPU_ERR_MODULE_CE,
		.num_instances = 1U,
		/*
		 * Keep num_errs in step with the descriptors actually compiled
		 * in below: four of the five exist only when
		 * CONFIG_NVGPU_NON_FUSA is defined. Advertising 5 with a
		 * one-element array would let a consumer that trusts num_errs
		 * read past the end of .errs.
		 */
#ifdef CONFIG_NVGPU_NON_FUSA
		.num_errs = 5U,
#else
		.num_errs = 1U,
#endif
		.errs = (struct nvgpu_err_desc[]) {
			GPU_NONCRITERR("ce_launch_error",
					GPU_CE_LAUNCH_ERROR,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
#ifdef CONFIG_NVGPU_NON_FUSA
			GPU_NONCRITERR("ce_method_buffer_fault",
					GPU_CE_METHOD_BUFFER_FAULT,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ce_fbuf_crc_fail",
					GPU_CE_FBUF_CRC_FAIL,
					INJECT_NONE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ce_fbuf_magic_chk_fail",
					GPU_CE_FBUF_MAGIC_CHK_FAIL,
					INJECT_NONE,
					NULL, NULL,
					NULL, NULL, 0, 0),
			GPU_NONCRITERR("ce_invalid_config",
					GPU_CE_INVALID_CONFIG,
					INJECT_SW,
					NULL, NULL,
					NULL, NULL, 0, 0),
#endif
		},
	},
};
/* Number of hw module entries in gv11b_err_lut. */
u32 size_of_gv11b_lut = sizeof(gv11b_err_lut) / sizeof(gv11b_err_lut[0]);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA Corporation. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -256,6 +256,9 @@ static int nvgpu_l1ss_report_error_linux(struct gk20a *g, u32 hw_unit_id, u32 er
if (err != 0)
nvgpu_err(g, "Error returned from L1SS submit %d", err);
if (is_critical)
nvgpu_sw_quiesce(g);
return err;
}