gpu: nvgpu: add error reporting support for L4T

Add error reporting support for T194's L1SS safety
services on Linux.

Used GA10B's LUT for GV11B. The error ids for T194 are
different compared to GA10B. This is handled by creating
a separate table mapping existing error ids to match GV11B.

Ids that are not used by GV11B are set to U32_MAX to tell
the driver not to send them to the L1SS driver.

Bug 200588528

Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Change-Id: I10a267942df77458c3deee0aad1179955490aa74
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2736772
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Debarshi Dutta
2022-06-27 16:39:08 +05:30
committed by mobile promotions
parent 28ddb0996f
commit e89553fe62
12 changed files with 561 additions and 3 deletions

View File

@@ -15,7 +15,8 @@ nvgpu:
sources: [ include/nvgpu/gk20a.h, sources: [ include/nvgpu/gk20a.h,
include/nvgpu/nvgpu_common.h, include/nvgpu/nvgpu_common.h,
include/nvgpu/cov_whitelist.h, include/nvgpu/cov_whitelist.h,
include/nvgpu/static_analysis.h ] include/nvgpu/static_analysis.h,
include/nvgpu/l1ss_err_reporting.h ]
bios: bios:
safe: yes safe: yes

View File

@@ -1027,7 +1027,9 @@ tpc:
cic: cic:
safe: yes safe: yes
owner: Tejal K owner: Tejal K
sources: [ hal/cic/mon/init_ga10b_fusa.c, sources: [ hal/cic/mon/init_gv11b_non_fusa.c,
hal/cic/mon/cic_gv11b.h,
hal/cic/mon/init_ga10b_fusa.c,
hal/cic/mon/lut_ga10b_fusa.c, hal/cic/mon/lut_ga10b_fusa.c,
hal/cic/mon/cic_ga10b.h ] hal/cic/mon/cic_ga10b.h ]

View File

@@ -241,7 +241,8 @@ vm:
os/linux/nvgpu_ivm.c ] os/linux/nvgpu_ivm.c ]
cic: cic:
sources: [ os/linux/cic/cic_report_err.c ] sources: [ os/linux/cic/cic_report_err.c,
os/linux/cic/l1ss_report_err.c ]
# Group all the Linux headers for now. # Group all the Linux headers for now.
headers: headers:

View File

@@ -61,6 +61,10 @@ ifeq ($(CONFIG_NVGPU_IVM_BUILD),y)
ccflags-y += -DCONFIG_NVGPU_IVM_BUILD ccflags-y += -DCONFIG_NVGPU_IVM_BUILD
endif endif
ifeq ($(CONFIG_TEGRA_L1SS_SUPPORT),y)
ccflags-y += -DCONFIG_TEGRA_L1SS_SUPPORT
endif
ccflags-y += -DCONFIG_NVGPU_DETERMINISTIC_CHANNELS ccflags-y += -DCONFIG_NVGPU_DETERMINISTIC_CHANNELS
ccflags-y += -DCONFIG_NVGPU_STATIC_POWERGATE ccflags-y += -DCONFIG_NVGPU_STATIC_POWERGATE
ccflags-y += -DCONFIG_NVGPU_ACR_LEGACY ccflags-y += -DCONFIG_NVGPU_ACR_LEGACY
@@ -606,6 +610,10 @@ nvgpu-$(CONFIG_NVGPU_SUPPORT_CDE) += \
os/linux/debug_cde.o os/linux/debug_cde.o
endif endif
# Objects built only when L1SS error reporting is enabled. The final entry
# deliberately has no trailing line continuation so that the following
# assignment can never be spliced into this one.
nvgpu-$(CONFIG_TEGRA_L1SS_SUPPORT) += \
	os/linux/cic/l1ss_report_err.o \
	hal/cic/mon/init_gv11b_non_fusa.o
nvgpu-y += \ nvgpu-y += \
common/mm/allocators/nvgpu_allocator.o \ common/mm/allocators/nvgpu_allocator.o \
common/mm/allocators/bitmap_allocator.o \ common/mm/allocators/bitmap_allocator.o \

View File

@@ -112,6 +112,11 @@ ifdef CONFIG_TEGRA_EPL
CONFIG_NVGPU_ENABLE_MISC_EC := y CONFIG_NVGPU_ENABLE_MISC_EC := y
endif endif
CONFIG_TEGRA_L1SS_SUPPORT := n
ifdef CONFIG_TEGRA_SAFETY
CONFIG_TEGRA_L1SS_SUPPORT := y
endif
CONFIG_NVGPU_NVMEM_FUSE := n CONFIG_NVGPU_NVMEM_FUSE := n
CONFIG_NVGPU_NVMAP_NEXT := y CONFIG_NVGPU_NVMAP_NEXT := y

View File

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/* Declarations for the GV11B CIC monitor (CIC-mon) HAL. */
#ifndef CIC_GV11B_H
#define CIC_GV11B_H
#include <nvgpu/nvgpu_err_info.h>
struct gk20a;
struct nvgpu_cic_mon;
/*
 * NOTE(review): no definition of gv11b_err_lut or size_of_gv11b_lut is
 * visible next to this header, and gv11b_cic_mon_init() installs the GA10B
 * table instead. Confirm these symbols are defined somewhere before any
 * code references them.
 */
extern struct nvgpu_err_hw_module gv11b_err_lut[];
extern u32 size_of_gv11b_lut;
/*
 * Populate cic_mon with the error look-up table used on GV11B.
 * Returns 0 on success or -EINVAL when cic_mon is NULL.
 */
int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon);
#endif /* CIC_GV11B_H */

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/log.h>
#include "common/cic/mon/cic_mon_priv.h"
#include "cic_ga10b.h"
#include "cic_gv11b.h"
/*
 * Set up the CIC monitor unit for GV11B.
 *
 * GV11B does not carry its own table here: the GA10B look-up table and its
 * size are installed into cic_mon.
 *
 * Returns 0 on success, -EINVAL when cic_mon is NULL.
 */
int gv11b_cic_mon_init(struct gk20a *g, struct nvgpu_cic_mon *cic_mon)
{
	int ret = -EINVAL;

	if (cic_mon != NULL) {
		cic_mon->err_lut = ga10b_err_lut;
		cic_mon->num_hw_modules = size_of_ga10b_lut;
		ret = 0;
	} else {
		nvgpu_err(g, "Invalid CIC reference pointer.");
	}

	return ret;
}

View File

@@ -49,6 +49,8 @@
#include <nvgpu/pmu/pmu_pg.h> #include <nvgpu/pmu/pmu_pg.h>
#endif #endif
#include <nvgpu/l1ss_err_reporting.h>
#include "hal/mm/mm_gp10b.h" #include "hal/mm/mm_gp10b.h"
#include "hal/mm/mm_gv11b.h" #include "hal/mm/mm_gv11b.h"
#include "hal/mm/cache/flush_gk20a.h" #include "hal/mm/cache/flush_gk20a.h"
@@ -199,6 +201,10 @@
#include "hal/tpc/tpc_gv11b.h" #include "hal/tpc/tpc_gv11b.h"
#endif #endif
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
#include "hal/cic/mon/cic_gv11b.h"
#endif
#include "hal_gv11b.h" #include "hal_gv11b.h"
#include "hal_gv11b_litter.h" #include "hal_gv11b_litter.h"
@@ -1488,6 +1494,13 @@ static const struct gops_grmgr gv11b_ops_grmgr = {
.init_gr_manager = nvgpu_init_gr_manager, .init_gr_manager = nvgpu_init_gr_manager,
}; };
/*
 * CIC monitor ops for GV11B, only built when L1SS support is configured:
 * init installs the error look-up table and report_err forwards packed
 * error ids to the L1SS reporting backend.
 */
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
static const struct gops_cic_mon gv11b_ops_cic_mon = {
.init = gv11b_cic_mon_init,
.report_err = nvgpu_l1ss_report_err
};
#endif
int gv11b_init_hal(struct gk20a *g) int gv11b_init_hal(struct gk20a *g)
{ {
struct gpu_ops *gops = &g->ops; struct gpu_ops *gops = &g->ops;
@@ -1587,6 +1600,9 @@ int gv11b_init_hal(struct gk20a *g)
gops->gpc_pg = gv11b_ops_gpc_pg; gops->gpc_pg = gv11b_ops_gpc_pg;
#endif #endif
gops->grmgr = gv11b_ops_grmgr; gops->grmgr = gv11b_ops_grmgr;
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
gops->cic_mon = gv11b_ops_cic_mon;
#endif
gops->chip_init_gpu_characteristics = gv11b_init_gpu_characteristics; gops->chip_init_gpu_characteristics = gv11b_init_gpu_characteristics;
gops->get_litter_value = gv11b_get_litter_value; gops->get_litter_value = gv11b_get_litter_value;
gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

View File

@@ -0,0 +1,38 @@
/*
*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/* Public interface for reporting GPU errors to the Tegra L1SS safety service. */
#ifndef NVGPU_L1SS_ECC_H
#define NVGPU_L1SS_ECC_H
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_l1ss_ecc_reporting;
/* The API is only declared when L1SS support is configured; callers must guard their use. */
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
/*
 * Report a packed error id to L1SS. Returns 0 on success, -EINVAL when the
 * decoded unit or error index is out of range, -EOPNOTSUPP when the service
 * is not enabled or the id has no L1SS mapping, otherwise the error from the
 * L1SS submit call.
 */
int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id);
/* Deregister from L1SS and free the reporting state; no-op if none exists. */
void nvgpu_l1ss_deinit_reporting(struct gk20a *g);
/* Allocate the reporting state and register with L1SS; failures are only logged. */
void nvgpu_l1ss_init_reporting(struct gk20a *g);
#endif
#endif /* NVGPU_L1SS_ECC_H */

View File

@@ -0,0 +1,399 @@
/*
* Copyright (c) 2022, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/tegra_l1ss_kernel_interface.h>
#include <linux/tegra_l1ss_ioctl.h>
#include <linux/tegra_nv_guard_service_id.h>
#include <linux/tegra_nv_guard_group_id.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/lock.h>
#include <nvgpu/timers.h>
#include <nvgpu/cic_mon.h>
#include <nvgpu/log.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/l1ss_err_reporting.h>
#include "os/linux/os_linux.h"
/* Sentinel stored in the mapping tables for nvgpu error ids that must not be sent to L1SS. */
#define NVGPU_ERR_INVALID U32_MAX
/* Per-GPU state for the L1SS error reporting client. */
struct nvgpu_l1ss_ecc_reporting {
/* NOTE(review): never assigned in this file; the GPU pointer is passed via priv.data instead. */
struct gk20a *g;
/* Client descriptor (id, callback, callback data) handed to l1ss_register_client(). */
client_param_t priv;
/* True while L1SS has signalled that it is ready to accept reports. */
bool service_enabled;
/* Protects service_enabled. */
struct nvgpu_spinlock lock;
};
/* Translation table for one hardware unit: nvgpu error index -> L1SS service id. */
struct nvgpu_l1ss_error_id_mappings {
/* Number of indices accepted for this unit; larger error indices are rejected. */
u32 num_errs;
/* Service ids indexed by nvgpu error index; NVGPU_ERR_INVALID marks unsupported ids. */
u32 *error_id_mappings;
};
/*
 * Per-hardware-unit translation tables. The outer index is the hardware unit
 * id and the inner index is the error index, both as decoded from the packed
 * error id by nvgpu_l1ss_report_err(). Entries set to NVGPU_ERR_INVALID are
 * not forwarded to L1SS (the lookup returns -EOPNOTSUPP for them).
 *
 * The order of units and of entries within a unit is significant: it must
 * match the indices produced by the error look-up table in use.
 *
 * NOTE(review): IGPU_HOST and IGPU_SM list one more initializer than their
 * num_errs; the trailing NVGPU_ERR_INVALID entry is unreachable through the
 * bounds check. Confirm this is intended.
 */
static struct nvgpu_l1ss_error_id_mappings mappings[] = {
{
/* *************** SERVICE ID for IGPU_HOST*************** */
.num_errs = 16,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_BIND_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_SCHED_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CHSW_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_MEMOP_TIMEOUT_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_LB_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_SQUASH_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_FECS_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBUS_TIMEOUT_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_TIMEOUT_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_EXTRA_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_GPFIFO_PB_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_METHOD_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_SIGNATURE_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PBDMA_HCE_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_CTXSW_TIMEOUT_ERROR,
NVGUARD_SERVICE_IGPU_HOST_SWERR_PFIFO_FB_FLUSH_TIMEOUT_ERROR,
NVGPU_ERR_INVALID,
},
},
{
/* *************** SERVICE ID for IGPU_SM*************** */
.num_errs = 11,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_CBU_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_LRF_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L1_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_SM_SWERR_MACHINE_CHECK_ERROR,
NVGPU_ERR_INVALID,
},
},
{
/* *************** SERVICE ID for IGPU_FECS*************** */
.num_errs = 7,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_WATCHDOG_TIMEOUT,
NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_CRC_MISMATCH,
NVGUARD_SERVICE_IGPU_FECS_SWERR_FAULT_DURING_CTXSW,
NVGUARD_SERVICE_IGPU_FECS_SWERR_CTXSW_INIT_ERROR,
},
},
{
/* *************** SERVICE ID for IGPU_GPCCS*************** */
.num_errs = 3,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_CORRECTED,
NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_GPCCS_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
},
},
{
/* *************** SERVICE ID for IGPU_MMU*************** */
.num_errs = 2,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_SA_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_MMU_SWERR_L1TLB_FA_DATA_ECC_UNCORRECTED,
},
},
{
/* *************** SERVICE ID for IGPU_GCC*************** */
.num_errs = 1,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_GCC_SWERR_L15_ECC_UNCORRECTED
},
},
{
/* *************** SERVICE ID for IGPU_PMU*************** */
.num_errs = 10,
.error_id_mappings = (u32 []) {
NVGPU_ERR_INVALID,
NVGPU_ERR_INVALID,
NVGPU_ERR_INVALID,
NVGPU_ERR_INVALID,
NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_UNCORRECTED,
NVGPU_ERR_INVALID,
NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_DMEM_ECC_UNCORRECTED,
NVGPU_ERR_INVALID,
NVGPU_ERR_INVALID,
NVGUARD_SERVICE_IGPU_PMU_SWERR_BAR0_ERROR_TIMEOUT,
},
},
{
/* *************** SERVICE ID for IGPU_PGRAPH*************** */
.num_errs = 21,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_FE_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MEMFMT_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_PD_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SCC_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_DS_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SSYNC_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MME_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_SKED_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_BE_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_MPC_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR,
NVGPU_ERR_INVALID,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_GPC_GFX_EXCEPTION,
NVGPU_ERR_INVALID,
},
},
{
/* *************** SERVICE ID for IGPU_LTC*************** */
.num_errs = 4,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_CORRECTED,
NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_DSTG_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_LTC_SWERR_CACHE_TSTG_ECC_UNCORRECTED,
NVGPU_ERR_INVALID,
},
},
{
/* *************** SERVICE ID for IGPU_HUBMMU*************** */
.num_errs = 9,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_L2TLB_SA_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_TLB_SA_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PTE_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PDE0_DATA_ECC_UNCORRECTED,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
NVGUARD_SERVICE_IGPU_HUBMMU_SWERR_PAGE_FAULT_ERROR,
},
},
{
/* *************** SERVICE ID for IGPU_PRI*************** */
.num_errs = 2,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_PRI_SWERR_TIMEOUT_ERROR,
NVGUARD_SERVICE_IGPU_PRI_SWERR_ACCESS_VIOLATION,
},
},
{
/* *************** SERVICE ID for IGPU_CE*************** */
.num_errs = 5,
.error_id_mappings = (u32 []) {
NVGUARD_SERVICE_IGPU_CE_SWERR_LAUNCH_ERROR,
NVGUARD_SERVICE_IGPU_CE_SWERR_METHOD_BUFFER_FAULT,
NVGPU_ERR_INVALID,
NVGPU_ERR_INVALID,
NVGUARD_SERVICE_IGPU_CE_SWERR_INVALID_CONFIG,
},
},
};
/*
 * Translate an nvgpu (hw_unit_id, err_id) pair into an NvGuard service id and
 * submit a service-status notification to the L1SS driver.
 *
 * @g:           GPU driver context, used for logging.
 * @hw_unit_id:  index into mappings[] selecting the hardware unit.
 * @err_id:      index into that unit's error_id_mappings[].
 * @is_critical: true reports NVGUARD_ERROR_DETECTED, false NVGUARD_NO_ERROR.
 *
 * Returns 0 on success, -EINVAL when either index is out of range,
 * -EOPNOTSUPP when the pair maps to NVGPU_ERR_INVALID, otherwise the error
 * returned by l1ss_submit_rq().
 */
static int nvgpu_l1ss_report_error_linux(struct gk20a *g, u32 hw_unit_id, u32 err_id,
		bool is_critical)
{
	/*
	 * Element count of mappings[]. sizeof(mappings) alone is a byte count
	 * and would let out-of-range unit ids index past the table.
	 */
	const u32 num_hw_units = (u32)(sizeof(mappings) / sizeof(mappings[0]));
	int err = 0;
	u32 nv_service_id = 0;
	u8 err_status = 0;
	u64 timestamp = (u64)nvgpu_current_time_ns();
	nv_guard_request_t req;

	if (hw_unit_id >= num_hw_units) {
		nvgpu_err(g, "Error Id H/W index out of bounds\n");
		return -EINVAL;
	} else if (err_id >= mappings[hw_unit_id].num_errs) {
		nvgpu_err(g, "Error Id index out of bounds\n");
		return -EINVAL;
	}

	memset(&req, 0, sizeof(req));

	nv_service_id = mappings[hw_unit_id].error_id_mappings[err_id];
	if (nv_service_id == NVGPU_ERR_INVALID) {
		/* error id not supported */
		return -EOPNOTSUPP;
	}

	if (is_critical)
		err_status = NVGUARD_ERROR_DETECTED;
	else
		err_status = NVGUARD_NO_ERROR;

	req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
	req.srv_status.srv_id = (nv_guard_service_id_t)nv_service_id;
	req.srv_status.status = err_status;
	req.srv_status.timestamp = timestamp;

	/*
	 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
	 * subsequent calls
	 */
	err = l1ss_submit_rq(&req, true);
	if (err != 0)
		nvgpu_err(g, "Error returned from L1SS submit %d", err);

	return err;
}
/*
 * Stand-in used while the L1SS service is not enabled: nothing is submitted,
 * the attempt is logged and -EOPNOTSUPP is returned.
 */
static int nvgpu_l1ss_report_error_empty(struct gk20a *g,
		u32 hw_unit_id, u32 err_id, bool is_critical)
{
	/* The error description is intentionally ignored. */
	(void)hw_unit_id;
	(void)err_id;
	(void)is_critical;

	nvgpu_log(g, gpu_dbg_info, "ECC reporting is empty");

	return -EOPNOTSUPP;
}
/*
 * Callback registered with L1SS; invoked with the service's readiness state.
 *
 * @param: L1SS_READY or L1SS_NOT_READY.
 * @data:  the struct gk20a pointer supplied at registration time.
 *
 * Updates service_enabled under the reporting lock. Returns 0 on success,
 * -ENODEV when no usable GPU reference can be taken, -EINVAL for any other
 * value of @param.
 */
static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
{
	struct gk20a *g = (struct gk20a *)data;
	struct nvgpu_l1ss_ecc_reporting *reporting = NULL;
	int ret = 0;

	/* Take a reference on the GPU for the duration of the callback. */
	if (g == NULL)
		return -ENODEV;
	if (nvgpu_get(g) == NULL)
		return -ENODEV;

	reporting = nvgpu_os_linux_from_gk20a(g)->l1ss_linux_ecc_reporting;

	nvgpu_spinlock_acquire(&reporting->lock);

	if ((param != L1SS_READY) && (param != L1SS_NOT_READY)) {
		ret = -EINVAL;
	} else if (param == L1SS_READY) {
		/* Only act, and log, on an actual state transition. */
		if (!reporting->service_enabled) {
			reporting->service_enabled = true;
			nvgpu_log(g, gpu_dbg_info, "ECC reporting is enabled");
		}
	} else {
		if (reporting->service_enabled) {
			reporting->service_enabled = false;
			nvgpu_log(g, gpu_dbg_info, "ECC reporting is disabled");
		}
	}

	nvgpu_spinlock_release(&reporting->lock);

	nvgpu_put(g);

	return ret;
}
void nvgpu_l1ss_init_reporting(struct gk20a *g)
{
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = NULL;
int err = 0;
l->l1ss_linux_ecc_reporting = nvgpu_kzalloc(g, sizeof(*l->l1ss_linux_ecc_reporting));
if (l->l1ss_linux_ecc_reporting == NULL) {
nvgpu_err(g, "unable to allocate memory for l1ss safety services");
return;
}
ecc_report_linux = l->l1ss_linux_ecc_reporting;
/* This will invoke the registration API */
nvgpu_spinlock_init(&ecc_report_linux->lock);
ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
ecc_report_linux->priv.data = g;
nvgpu_log(g, gpu_dbg_info, "ECC reporting Init (L1SS)");
/*
* err == 0 indicates service is available but not active yet.
* err == 1 indicates service is available and active
* error for other cases.
*/
err = l1ss_register_client(&ecc_report_linux->priv);
if (err == 0) {
nvgpu_spinlock_acquire(&ecc_report_linux->lock);
ecc_report_linux->service_enabled = false;
nvgpu_spinlock_release(&ecc_report_linux->lock);
nvgpu_log(g, gpu_dbg_info, "ECC reporting init success");
} else if (err == 1) {
nvgpu_spinlock_acquire(&ecc_report_linux->lock);
ecc_report_linux->service_enabled = true;
nvgpu_spinlock_release(&ecc_report_linux->lock);
nvgpu_log(g, gpu_dbg_info, "ECC reporting init started");
} else {
nvgpu_log(g, gpu_dbg_info, "ECC reporting init failure %d", err);
}
}
/*
 * Tear down L1SS reporting: mark the service disabled, deregister the client
 * and release the reporting state. Does nothing when no state was allocated.
 */
void nvgpu_l1ss_deinit_reporting(struct gk20a *g)
{
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_l1ss_ecc_reporting *reporting = l->l1ss_linux_ecc_reporting;

	if (reporting != NULL) {
		/* Stop new reports before the client goes away. */
		nvgpu_spinlock_acquire(&reporting->lock);
		reporting->service_enabled = false;
		nvgpu_spinlock_release(&reporting->lock);

		(void)l1ss_deregister_client(reporting->priv.id);

		nvgpu_log(g, gpu_dbg_info, "ECC reporting de-init success");

		nvgpu_kfree(g, reporting);
		l->l1ss_linux_ecc_reporting = NULL;
	}
}
/*
 * Report a packed nvgpu error id to the L1SS safety service.
 *
 * @g:      GPU driver context.
 * @err_id: packed error id, laid out as
 *          - HW_unit_id (4-bits: bit-0 to 3),
 *          - Error_id (5-bits: bit-4 to 8),
 *          - Corrected/Uncorrected error (1-bit: bit-9),
 *          - Remaining 22-bits are unused.
 *
 * Returns 0 on success, -EOPNOTSUPP when reporting is unavailable (state not
 * allocated, or service not enabled) or the id has no L1SS mapping, -EINVAL
 * when the decoded indices are out of range, otherwise the error from the
 * L1SS submit call.
 */
int nvgpu_l1ss_report_err(struct gk20a *g, u32 err_id)
{
	int ret = 0;
	bool service_enabled;
	u32 hw_unit = (err_id & HW_UNIT_ID_MASK);
	u32 error_id = ((err_id >> ERR_ID_FIELD_SHIFT) & ERR_ID_MASK);
	bool is_critical = ((err_id & (1U << CORRECTED_BIT_FIELD_SHIFT)) != 0U);
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_l1ss_ecc_reporting *ecc_report_linux = l->l1ss_linux_ecc_reporting;

	nvgpu_log(g, gpu_dbg_info, "hw_unit = %u, error_id = %u, is_critical = %d",
		hw_unit, error_id, is_critical);

	/*
	 * The reporting state is absent when allocation failed during init or
	 * after deinit; treat that like a disabled service instead of
	 * dereferencing a NULL pointer.
	 */
	if (ecc_report_linux == NULL) {
		return nvgpu_l1ss_report_error_empty(g, hw_unit, error_id, is_critical);
	}

	/* Snapshot the flag so the lock is not held across the submit call. */
	nvgpu_spinlock_acquire(&ecc_report_linux->lock);
	service_enabled = ecc_report_linux->service_enabled;
	nvgpu_spinlock_release(&ecc_report_linux->lock);

	if (service_enabled) {
		ret = nvgpu_l1ss_report_error_linux(g, hw_unit, error_id, is_critical);
	} else {
		/* Pass the decoded error index, matching the enabled path. */
		ret = nvgpu_l1ss_report_error_empty(g, hw_unit, error_id, is_critical);
	}

	return ret;
}

View File

@@ -70,6 +70,7 @@
#include <nvgpu/cic_rm.h> #include <nvgpu/cic_rm.h>
#include <nvgpu/fb.h> #include <nvgpu/fb.h>
#include <nvgpu/nvs.h> #include <nvgpu/nvs.h>
#include <nvgpu/l1ss_err_reporting.h>
#include "platform_gk20a.h" #include "platform_gk20a.h"
#include "sysfs.h" #include "sysfs.h"
@@ -1016,6 +1017,10 @@ void gk20a_remove_support(struct gk20a *g)
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
struct sim_nvgpu_linux *sim_linux; struct sim_nvgpu_linux *sim_linux;
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
nvgpu_l1ss_deinit_reporting(g);
#endif
#if NVGPU_VPR_RESIZE_SUPPORTED #if NVGPU_VPR_RESIZE_SUPPORTED
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_VPR)) { if (nvgpu_is_enabled(g, NVGPU_SUPPORT_VPR)) {
tegra_unregister_idle_unidle(gk20a_do_idle); tegra_unregister_idle_unidle(gk20a_do_idle);
@@ -1865,6 +1870,10 @@ static int gk20a_probe(struct platform_device *dev)
if (err) if (err)
goto return_err; goto return_err;
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
nvgpu_l1ss_init_reporting(gk20a);
#endif
nvgpu_mutex_init(&l->dmabuf_priv_list_lock); nvgpu_mutex_init(&l->dmabuf_priv_list_lock);
nvgpu_init_list_node(&l->dmabuf_priv_list); nvgpu_init_list_node(&l->dmabuf_priv_list);

View File

@@ -108,6 +108,10 @@ struct nvgpu_os_linux {
struct nvgpu_os_linux_ops ops; struct nvgpu_os_linux_ops ops;
#ifdef CONFIG_TEGRA_L1SS_SUPPORT
struct nvgpu_l1ss_ecc_reporting *l1ss_linux_ecc_reporting;
#endif
struct notifier_block nvgpu_reboot_nb; struct notifier_block nvgpu_reboot_nb;
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS