mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: add error reporting support for L4T
Add error reporting support for T194's L1SS safety services on Linux. GA10B's LUT is used for GV11B. The error IDs for T194 differ from those of GA10B; this is handled by a separate table that maps the existing error IDs to their GV11B equivalents. IDs that are not used by GV11B are set to U32_MAX to tell the driver not to send them to the L1SS driver. Bug 200588528 Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Change-Id: I10a267942df77458c3deee0aad1179955490aa74 Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2736772 Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
28ddb0996f
commit
e89553fe62
@@ -15,7 +15,8 @@ nvgpu:
|
|||||||
sources: [ include/nvgpu/gk20a.h,
|
sources: [ include/nvgpu/gk20a.h,
|
||||||
include/nvgpu/nvgpu_common.h,
|
include/nvgpu/nvgpu_common.h,
|
||||||
include/nvgpu/cov_whitelist.h,
|
include/nvgpu/cov_whitelist.h,
|
||||||
include/nvgpu/static_analysis.h ]
|
include/nvgpu/static_analysis.h,
|
||||||
|
include/nvgpu/l1ss_err_reporting.h ]
|
||||||
|
|
||||||
bios:
|
bios:
|
||||||
safe: yes
|
safe: yes
|
||||||
|
|||||||
@@ -1027,7 +1027,9 @@ tpc:
|
|||||||
cic:
|
cic:
|
||||||
safe: yes
|
safe: yes
|
||||||
owner: Tejal K
|
owner: Tejal K
|
||||||
sources: [ hal/cic/mon/init_ga10b_fusa.c,
|
sources: [ hal/cic/mon/init_gv11b_non_fusa.c,
|
||||||
|
hal/cic/mon/cic_gv11b.h,
|
||||||
|
hal/cic/mon/init_ga10b_fusa.c,
|
||||||
hal/cic/mon/lut_ga10b_fusa.c,
|
hal/cic/mon/lut_ga10b_fusa.c,
|
||||||
hal/cic/mon/cic_ga10b.h ]
|
hal/cic/mon/cic_ga10b.h ]
|
||||||
|
|
||||||
|
|||||||
@@ -241,7 +241,8 @@ vm:
|
|||||||
os/linux/nvgpu_ivm.c ]
|
os/linux/nvgpu_ivm.c ]
|
||||||
|
|
||||||
cic:
|
cic:
|
||||||
sources: [ os/linux/cic/cic_report_err.c ]
|
sources: [ os/linux/cic/cic_report_err.c,
|
||||||
|
os/linux/cic/l1ss_report_err.c ]
|
||||||
|
|
||||||
# Group all the Linux headers for now.
|
# Group all the Linux headers for now.
|
||||||
headers:
|
headers:
|
||||||
|
|||||||
@@ -61,6 +61,10 @@ ifeq ($(CONFIG_NVGPU_IVM_BUILD),y)
|
|||||||
ccflags-y += -DCONFIG_NVGPU_IVM_BUILD
|
ccflags-y += -DCONFIG_NVGPU_IVM_BUILD
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CONFIG_TEGRA_L1SS_SUPPORT),y)
|
||||||
|
ccflags-y += -DCONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
endif
|
||||||
|
|
||||||
ccflags-y += -DCONFIG_NVGPU_DETERMINISTIC_CHANNELS
|
ccflags-y += -DCONFIG_NVGPU_DETERMINISTIC_CHANNELS
|
||||||
ccflags-y += -DCONFIG_NVGPU_STATIC_POWERGATE
|
ccflags-y += -DCONFIG_NVGPU_STATIC_POWERGATE
|
||||||
ccflags-y += -DCONFIG_NVGPU_ACR_LEGACY
|
ccflags-y += -DCONFIG_NVGPU_ACR_LEGACY
|
||||||
@@ -606,6 +610,10 @@ nvgpu-$(CONFIG_NVGPU_SUPPORT_CDE) += \
|
|||||||
os/linux/debug_cde.o
|
os/linux/debug_cde.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
nvgpu-$(CONFIG_TEGRA_L1SS_SUPPORT) += \
|
||||||
|
os/linux/cic/l1ss_report_err.o \
|
||||||
|
hal/cic/mon/init_gv11b_non_fusa.o \
|
||||||
|
|
||||||
nvgpu-y += \
|
nvgpu-y += \
|
||||||
common/mm/allocators/nvgpu_allocator.o \
|
common/mm/allocators/nvgpu_allocator.o \
|
||||||
common/mm/allocators/bitmap_allocator.o \
|
common/mm/allocators/bitmap_allocator.o \
|
||||||
|
|||||||
@@ -112,6 +112,11 @@ ifdef CONFIG_TEGRA_EPL
|
|||||||
CONFIG_NVGPU_ENABLE_MISC_EC := y
|
CONFIG_NVGPU_ENABLE_MISC_EC := y
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
CONFIG_TEGRA_L1SS_SUPPORT := n
|
||||||
|
ifdef CONFIG_TEGRA_SAFETY
|
||||||
|
CONFIG_TEGRA_L1SS_SUPPORT := y
|
||||||
|
endif
|
||||||
|
|
||||||
CONFIG_NVGPU_NVMEM_FUSE := n
|
CONFIG_NVGPU_NVMEM_FUSE := n
|
||||||
|
|
||||||
CONFIG_NVGPU_NVMAP_NEXT := y
|
CONFIG_NVGPU_NVMAP_NEXT := y
|
||||||
|
|||||||
36
drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h
Normal file
36
drivers/gpu/nvgpu/hal/cic/mon/cic_gv11b.h
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CIC_GV11B_H
|
||||||
|
#define CIC_GV11B_H
|
||||||
|
|
||||||
|
#include <nvgpu/nvgpu_err_info.h>
|
||||||
|
|
||||||
|
struct gk20a;
|
||||||
|
struct nvgpu_cic_mon;
|
||||||
|
|
||||||
|
/*
 * GV11B error lookup table and its element count.
 * NOTE(review): no definition of these two symbols is visible in this change;
 * gv11b_cic_mon_init() installs the GA10B table instead - confirm they are
 * defined somewhere or drop the declarations.
 */
extern struct nvgpu_err_hw_module gv11b_err_lut[];
extern u32 size_of_gv11b_lut;

/*
 * Install the error lookup table into cic_mon for GV11B.
 * Returns 0 on success and -EINVAL when cic_mon is NULL.
 */
int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon);
|
||||||
|
|
||||||
|
#endif /* CIC_GV11B_H */
|
||||||
39
drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c
Normal file
39
drivers/gpu/nvgpu/hal/cic/mon/init_gv11b_non_fusa.c
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <nvgpu/log.h>
|
||||||
|
|
||||||
|
#include "common/cic/mon/cic_mon_priv.h"
|
||||||
|
#include "cic_ga10b.h"
|
||||||
|
#include "cic_gv11b.h"
|
||||||
|
|
||||||
|
int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon)
|
||||||
|
{
|
||||||
|
if (cic_mon == NULL) {
|
||||||
|
nvgpu_err(g, "Invalid CIC reference pointer.");
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
cic_mon->err_lut = ga10b_err_lut;
|
||||||
|
cic_mon->num_hw_modules = size_of_ga10b_lut;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@@ -49,6 +49,8 @@
|
|||||||
#include <nvgpu/pmu/pmu_pg.h>
|
#include <nvgpu/pmu/pmu_pg.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <nvgpu/l1ss_err_reporting.h>
|
||||||
|
|
||||||
#include "hal/mm/mm_gp10b.h"
|
#include "hal/mm/mm_gp10b.h"
|
||||||
#include "hal/mm/mm_gv11b.h"
|
#include "hal/mm/mm_gv11b.h"
|
||||||
#include "hal/mm/cache/flush_gk20a.h"
|
#include "hal/mm/cache/flush_gk20a.h"
|
||||||
@@ -199,6 +201,10 @@
|
|||||||
#include "hal/tpc/tpc_gv11b.h"
|
#include "hal/tpc/tpc_gv11b.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
#include "hal/cic/mon/cic_gv11b.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "hal_gv11b.h"
|
#include "hal_gv11b.h"
|
||||||
#include "hal_gv11b_litter.h"
|
#include "hal_gv11b_litter.h"
|
||||||
|
|
||||||
@@ -1488,6 +1494,13 @@ static const struct gops_grmgr gv11b_ops_grmgr = {
|
|||||||
.init_gr_manager = nvgpu_init_gr_manager,
|
.init_gr_manager = nvgpu_init_gr_manager,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
/*
 * CIC monitor HAL for GV11B: table setup goes through gv11b_cic_mon_init()
 * and error reports are forwarded to the L1SS reporting path.
 */
static const struct gops_cic_mon gv11b_ops_cic_mon = {
	.init = gv11b_cic_mon_init,
	.report_err = nvgpu_l1ss_report_err
};
|
||||||
|
#endif
|
||||||
|
|
||||||
int gv11b_init_hal(struct gk20a *g)
|
int gv11b_init_hal(struct gk20a *g)
|
||||||
{
|
{
|
||||||
struct gpu_ops *gops = &g->ops;
|
struct gpu_ops *gops = &g->ops;
|
||||||
@@ -1587,6 +1600,9 @@ int gv11b_init_hal(struct gk20a *g)
|
|||||||
gops->gpc_pg = gv11b_ops_gpc_pg;
|
gops->gpc_pg = gv11b_ops_gpc_pg;
|
||||||
#endif
|
#endif
|
||||||
gops->grmgr = gv11b_ops_grmgr;
|
gops->grmgr = gv11b_ops_grmgr;
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
gops->cic_mon = gv11b_ops_cic_mon;
|
||||||
|
#endif
|
||||||
gops->chip_init_gpu_characteristics = gv11b_init_gpu_characteristics;
|
gops->chip_init_gpu_characteristics = gv11b_init_gpu_characteristics;
|
||||||
gops->get_litter_value = gv11b_get_litter_value;
|
gops->get_litter_value = gv11b_get_litter_value;
|
||||||
gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;
|
gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;
|
||||||
|
|||||||
38
drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h
Normal file
38
drivers/gpu/nvgpu/include/nvgpu/l1ss_err_reporting.h
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
#ifndef NVGPU_L1SS_ECC_H
|
||||||
|
#define NVGPU_L1SS_ECC_H
|
||||||
|
|
||||||
|
#include <nvgpu/types.h>
|
||||||
|
|
||||||
|
struct gk20a;
|
||||||
|
struct nvgpu_l1ss_ecc_reporting;
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id);
|
||||||
|
void nvgpu_l1ss_deinit_reporting(struct gk20a *g);
|
||||||
|
void nvgpu_l1ss_init_reporting(struct gk20a *g);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* NVGPU_L1SS_ECC_H */
|
||||||
399
drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c
Normal file
399
drivers/gpu/nvgpu/os/linux/cic/l1ss_report_err.c
Normal file
@@ -0,0 +1,399 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2022, NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms and conditions of the GNU General Public License,
|
||||||
|
* version 2, as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||||
|
* more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/tegra_l1ss_kernel_interface.h>
|
||||||
|
#include <linux/tegra_l1ss_ioctl.h>
|
||||||
|
#include <linux/tegra_nv_guard_service_id.h>
|
||||||
|
#include <linux/tegra_nv_guard_group_id.h>
|
||||||
|
|
||||||
|
#include <nvgpu/gk20a.h>
|
||||||
|
#include <nvgpu/lock.h>
|
||||||
|
#include <nvgpu/timers.h>
|
||||||
|
#include <nvgpu/cic_mon.h>
|
||||||
|
#include <nvgpu/log.h>
|
||||||
|
#include <nvgpu/nvgpu_init.h>
|
||||||
|
#include <nvgpu/l1ss_err_reporting.h>
|
||||||
|
|
||||||
|
#include "os/linux/os_linux.h"
|
||||||
|
|
||||||
|
#define NVGPU_ERR_INVALID U32_MAX
|
||||||
|
|
||||||
|
/* Per-GPU bookkeeping for reporting errors to the L1SS safety service. */
struct nvgpu_l1ss_ecc_reporting {
	/* NOTE(review): never assigned in the visible code - confirm it is needed. */
	struct gk20a *g;
	/* L1SS client registration parameters: group id, callback, callback data. */
	client_param_t priv;
	/* true while the L1SS service is ready and reports may be submitted */
	bool service_enabled;
	/* protects service enabled */
	struct nvgpu_spinlock lock;
};
|
||||||
|
|
||||||
|
struct nvgpu_l1ss_error_id_mappings {
|
||||||
|
u32 num_errs;
|
||||||
|
u32 *error_id_mappings;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct nvgpu_l1ss_error_id_mappings mappings[] = {
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_HOST*************** */
|
||||||
|
.num_errs = 16,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_BIND_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_SCHED_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CHSW_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_MEMOP_TIMEOUT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_LB_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_SQUASH_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_FECS_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_TIMEOUT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_TIMEOUT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_EXTRA_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_GPFIFO_PB_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_METHOD_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_SIGNATURE_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_HCE_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CTXSW_TIMEOUT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_FB_FLUSH_TIMEOUT_ERROR,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_SM*************** */
|
||||||
|
.num_errs = 11,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_CBU_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_LRF_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L1_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_SM_SWERR_MACHINE_CHECK_ERROR,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_FECS*************** */
|
||||||
|
.num_errs = 7,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_WATCHDOG_TIMEOUT,
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_CRC_MISMATCH,
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_FAULT_DURING_CTXSW,
|
||||||
|
NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_INIT_ERROR,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_GPCCS*************** */
|
||||||
|
.num_errs = 3,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_CORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_MMU*************** */
|
||||||
|
.num_errs = 2,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_SA_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_FA_DATA_ECC_UNCORRECTED,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_GCC*************** */
|
||||||
|
.num_errs = 1,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_GCC_SWERR_L15_ECC_UNCORRECTED
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_PMU*************** */
|
||||||
|
.num_errs = 10,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGUARD_SERVICE_IGPU_PMU_SWERR_BAR0_ERROR_TIMEOUT,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_PGRAPH*************** */
|
||||||
|
.num_errs = 21,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_FE_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MEMFMT_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_PD_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SCC_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_DS_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SSYNC_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MME_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SKED_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MPC_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
|
||||||
|
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_LTC*************** */
|
||||||
|
.num_errs = 4,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_CORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_TSTG_ECC_UNCORRECTED,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_HUBMMU*************** */
|
||||||
|
.num_errs = 9,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_L2TLB_SA_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_TLB_SA_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PTE_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PDE0_DATA_ECC_UNCORRECTED,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_PRI*************** */
|
||||||
|
.num_errs = 2,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_PRI_SWERR_TIMEOUT_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_PRI_SWERR_ACCESS_VIOLATION,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* *************** SERVICE ID for IGPU_CE*************** */
|
||||||
|
.num_errs = 5,
|
||||||
|
.error_id_mappings = (u32 []) {
|
||||||
|
NVGUARD_SERVICE_IGPU_CE_SWERR_LAUNCH_ERROR,
|
||||||
|
NVGUARD_SERVICE_IGPU_CE_SWERR_METHOD_BUFFER_FAULT,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGPU_ERR_INVALID,
|
||||||
|
NVGUARD_SERVICE_IGPU_CE_SWERR_INVALID_CONFIG,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Translate an nvgpu (hw_unit_id, err_id) pair into an NVGUARD service id and
 * submit a service-status notification for it to the L1SS driver.
 *
 * @g:           GPU driver context, used for logging.
 * @hw_unit_id:  index into mappings[].
 * @err_id:      index into the selected unit's error_id_mappings[].
 * @is_critical: true reports NVGUARD_ERROR_DETECTED, false NVGUARD_NO_ERROR.
 *
 * Returns 0 on success, -EINVAL when either index is out of range,
 * -EOPNOTSUPP when the table entry is NVGPU_ERR_INVALID, or the non-zero
 * value returned by l1ss_submit_rq().
 */
static int nvgpu_l1ss_report_error_linux(struct gk20a *g, u32 hw_unit_id, u32 err_id,
		bool is_critical)
{
	int err = 0;
	u32 nv_service_id = 0;
	u8 err_status = 0;
	u64 timestamp = (u64)nvgpu_current_time_ns();
	nv_guard_request_t req;

	/*
	 * Bound hw_unit_id by the number of table entries. sizeof(mappings)
	 * alone is a byte count and would let indices past the end of the
	 * table through.
	 */
	if (hw_unit_id >= (sizeof(mappings) / sizeof(mappings[0]))) {
		nvgpu_err(g, "Error Id H/W index out of bounds\n");
		return -EINVAL;
	} else if (err_id >= mappings[hw_unit_id].num_errs) {
		nvgpu_err(g, "Error Id index out of bounds\n");
		return -EINVAL;
	}

	memset(&req, 0, sizeof(req));

	nv_service_id = mappings[hw_unit_id].error_id_mappings[err_id];

	if (nv_service_id == NVGPU_ERR_INVALID) {
		/* error id not supported */
		return -EOPNOTSUPP;
	}

	if (is_critical) {
		err_status = NVGUARD_ERROR_DETECTED;
	} else {
		err_status = NVGUARD_NO_ERROR;
	}

	req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
	req.srv_status.srv_id = (nv_guard_service_id_t)nv_service_id;
	req.srv_status.status = err_status;
	req.srv_status.timestamp = timestamp;

	/*
	 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
	 * subsequent calls
	 */
	err = l1ss_submit_rq(&req, true);
	if (err != 0) {
		nvgpu_err(g, "Error returned from L1SS submit %d", err);
	}

	return err;
}
|
||||||
|
|
||||||
|
/*
 * Stand-in reporter used while the L1SS service is not enabled.
 *
 * Nothing is submitted: a debug message is logged and -EOPNOTSUPP is
 * returned. hw_unit_id, err_id and is_critical are accepted only to mirror
 * the signature of the real reporter and are not used.
 */
static int nvgpu_l1ss_report_error_empty(struct gk20a *g,
		u32 hw_unit_id, u32 err_id, bool is_critical)
{
	const int ret = -EOPNOTSUPP;

	nvgpu_log(g, gpu_dbg_info, "ECC reporting is empty");

	return ret;
}
|
||||||
|
|
||||||
|
/*
 * Client callback registered with L1SS; tracks whether the service is ready.
 *
 * @param: L1SS_READY enables reporting, L1SS_NOT_READY disables it.
 * @data:  the struct gk20a pointer supplied at registration time.
 *
 * A reference on the GPU is held (nvgpu_get/nvgpu_put) for the duration of
 * the update, and service_enabled is only modified under the reporting lock.
 *
 * Returns 0 on success, -ENODEV when no usable gk20a is available, -EINVAL
 * for any other value of param.
 */
static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
{
	struct gk20a *g = (struct gk20a *)data;
	struct nvgpu_l1ss_ecc_reporting *reporting = NULL;
	int ret = 0;

	/* Ensure we have a valid gk20a struct before proceeding */
	if (g == NULL) {
		return -ENODEV;
	}
	if (nvgpu_get(g) == NULL) {
		return -ENODEV;
	}

	reporting = nvgpu_os_linux_from_gk20a(g)->l1ss_linux_ecc_reporting;

	nvgpu_spinlock_acquire(&reporting->lock);
	switch (param) {
	case L1SS_READY:
		if (!reporting->service_enabled) {
			reporting->service_enabled = true;
			nvgpu_log(g, gpu_dbg_info, "ECC reporting is enabled");
		}
		break;
	case L1SS_NOT_READY:
		if (reporting->service_enabled) {
			reporting->service_enabled = false;
			nvgpu_log(g, gpu_dbg_info, "ECC reporting is disabled");
		}
		break;
	default:
		ret = -EINVAL;
		break;
	}
	nvgpu_spinlock_release(&reporting->lock);

	nvgpu_put(g);

	return ret;
}
|
||||||
|
|
||||||
|
void nvgpu_l1ss_init_reporting(struct gk20a *g)
|
||||||
|
{
|
||||||
|
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
|
||||||
|
struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = NULL;
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
l->l1ss_linux_ecc_reporting = nvgpu_kzalloc(g, sizeof(*l->l1ss_linux_ecc_reporting));
|
||||||
|
if (l->l1ss_linux_ecc_reporting == NULL) {
|
||||||
|
nvgpu_err(g, "unable to allocate memory for l1ss safety services");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ecc_report_linux = l->l1ss_linux_ecc_reporting;
|
||||||
|
|
||||||
|
/* This will invoke the registration API */
|
||||||
|
nvgpu_spinlock_init(&ecc_report_linux->lock);
|
||||||
|
ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
|
||||||
|
ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
|
||||||
|
ecc_report_linux->priv.data = g;
|
||||||
|
|
||||||
|
nvgpu_log(g, gpu_dbg_info, "ECC reporting Init (L1SS)");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* err == 0 indicates service is available but not active yet.
|
||||||
|
* err == 1 indicates service is available and active
|
||||||
|
* error for other cases.
|
||||||
|
*/
|
||||||
|
err = l1ss_register_client(&ecc_report_linux->priv);
|
||||||
|
if (err == 0) {
|
||||||
|
nvgpu_spinlock_acquire(&ecc_report_linux->lock);
|
||||||
|
ecc_report_linux->service_enabled = false;
|
||||||
|
nvgpu_spinlock_release(&ecc_report_linux->lock);
|
||||||
|
nvgpu_log(g, gpu_dbg_info, "ECC reporting init success");
|
||||||
|
} else if (err == 1) {
|
||||||
|
nvgpu_spinlock_acquire(&ecc_report_linux->lock);
|
||||||
|
ecc_report_linux->service_enabled = true;
|
||||||
|
nvgpu_spinlock_release(&ecc_report_linux->lock);
|
||||||
|
nvgpu_log(g, gpu_dbg_info, "ECC reporting init started");
|
||||||
|
} else {
|
||||||
|
nvgpu_log(g, gpu_dbg_info, "ECC reporting init failure %d", err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void nvgpu_l1ss_deinit_reporting(struct gk20a *g)
|
||||||
|
{
|
||||||
|
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
|
||||||
|
struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting;
|
||||||
|
|
||||||
|
if (ecc_report_linux == NULL)
|
||||||
|
return;
|
||||||
|
|
||||||
|
nvgpu_spinlock_acquire(&ecc_report_linux->lock);
|
||||||
|
if (ecc_report_linux->service_enabled) {
|
||||||
|
ecc_report_linux->service_enabled = false;
|
||||||
|
}
|
||||||
|
nvgpu_spinlock_release(&ecc_report_linux->lock);
|
||||||
|
|
||||||
|
(void)l1ss_deregister_client(ecc_report_linux->priv.id);
|
||||||
|
nvgpu_log(g, gpu_dbg_info, "ECC reporting de-init success");
|
||||||
|
|
||||||
|
nvgpu_kfree(g, ecc_report_linux);
|
||||||
|
l->l1ss_linux_ecc_reporting = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Decode a packed nvgpu error id and report it to the L1SS service.
 *
 * Layout of err_id:
 *   - HW_unit_id (4-bits: bit-0 to 3),
 *   - Error_id (5-bits: bit-4 to 8),
 *   - Corrected/Uncorrected error (1-bit: bit-9),
 *   - Remaining 22-bits are unused.
 *
 * @g:      GPU driver context.
 * @err_id: packed error id as laid out above.
 *
 * Returns 0 when the report was submitted, -EOPNOTSUPP when reporting is not
 * set up, the service is not enabled or the error has no L1SS id, or another
 * negative errno value from the submit path.
 */
int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id)
{
	int ret = 0;
	bool service_enabled;

	u32 hw_unit = (err_id & HW_UNIT_ID_MASK);
	u32 error_id = ((err_id >> ERR_ID_FIELD_SHIFT) & ERR_ID_MASK);
	/* Unsigned constant keeps the shift and mask in unsigned arithmetic. */
	bool is_critical = ((err_id & (1U << CORRECTED_BIT_FIELD_SHIFT)) != 0U);

	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting;

	nvgpu_log(g, gpu_dbg_info, "hw_unit = %u, error_id = %u, is_critical = %d",
		hw_unit, error_id, is_critical);

	/*
	 * The reporting state is NULL when nvgpu_l1ss_init_reporting() could
	 * not allocate it, and again after nvgpu_l1ss_deinit_reporting().
	 * Treat that like a disabled service instead of dereferencing NULL.
	 */
	if (ecc_report_linux == NULL) {
		return nvgpu_l1ss_report_error_empty(g, hw_unit, error_id, is_critical);
	}

	/* Snapshot the flag under the lock; the submit runs outside it. */
	nvgpu_spinlock_acquire(&ecc_report_linux->lock);
	service_enabled = ecc_report_linux->service_enabled;
	nvgpu_spinlock_release(&ecc_report_linux->lock);

	if (service_enabled) {
		ret = nvgpu_l1ss_report_error_linux(g, hw_unit, error_id, is_critical);
	} else {
		ret = nvgpu_l1ss_report_error_empty(g, hw_unit, error_id, is_critical);
	}

	return ret;
}
|
||||||
@@ -70,6 +70,7 @@
|
|||||||
#include <nvgpu/cic_rm.h>
|
#include <nvgpu/cic_rm.h>
|
||||||
#include <nvgpu/fb.h>
|
#include <nvgpu/fb.h>
|
||||||
#include <nvgpu/nvs.h>
|
#include <nvgpu/nvs.h>
|
||||||
|
#include <nvgpu/l1ss_err_reporting.h>
|
||||||
|
|
||||||
#include "platform_gk20a.h"
|
#include "platform_gk20a.h"
|
||||||
#include "sysfs.h"
|
#include "sysfs.h"
|
||||||
@@ -1016,6 +1017,10 @@ void gk20a_remove_support(struct gk20a *g)
|
|||||||
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
|
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
|
||||||
struct sim_nvgpu_linux *sim_linux;
|
struct sim_nvgpu_linux *sim_linux;
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
nvgpu_l1ss_deinit_reporting(g);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if NVGPU_VPR_RESIZE_SUPPORTED
|
#if NVGPU_VPR_RESIZE_SUPPORTED
|
||||||
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_VPR)) {
|
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_VPR)) {
|
||||||
tegra_unregister_idle_unidle(gk20a_do_idle);
|
tegra_unregister_idle_unidle(gk20a_do_idle);
|
||||||
@@ -1865,6 +1870,10 @@ static int gk20a_probe(struct platform_device *dev)
|
|||||||
if (err)
|
if (err)
|
||||||
goto return_err;
|
goto return_err;
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
nvgpu_l1ss_init_reporting(gk20a);
|
||||||
|
#endif
|
||||||
|
|
||||||
nvgpu_mutex_init(&l->dmabuf_priv_list_lock);
|
nvgpu_mutex_init(&l->dmabuf_priv_list_lock);
|
||||||
nvgpu_init_list_node(&l->dmabuf_priv_list);
|
nvgpu_init_list_node(&l->dmabuf_priv_list);
|
||||||
|
|
||||||
|
|||||||
@@ -108,6 +108,10 @@ struct nvgpu_os_linux {
|
|||||||
|
|
||||||
struct nvgpu_os_linux_ops ops;
|
struct nvgpu_os_linux_ops ops;
|
||||||
|
|
||||||
|
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
|
||||||
|
struct nvgpu_l1ss_ecc_reporting *l1ss_linux_ecc_reporting;
|
||||||
|
#endif
|
||||||
|
|
||||||
struct notifier_block nvgpu_reboot_nb;
|
struct notifier_block nvgpu_reboot_nb;
|
||||||
|
|
||||||
#ifdef CONFIG_DEBUG_FS
|
#ifdef CONFIG_DEBUG_FS
|
||||||
|
|||||||
Reference in New Issue
Block a user