Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git (synced 2025-12-22 17:36:20 +03:00)
gpu: nvgpu: merge error reporting apis
In DRIVE 6.0, NvGPU is allowed to report only 32-bit metadata to Safety_Services, so there is no need for distinct APIs to report errors from units such as GR, MM, and FIFO to the SDL unit. All of these error reporting APIs are replaced with a single API. To meet this objective, this patch makes the following changes:

- Replaces nvgpu_report_*_err with nvgpu_report_err_to_sdl.
- Removes the reporting of error messages.
- Replaces nvgpu_log() with nvgpu_err() for error reporting.
- Removes error reporting to Safety_Services from nvgpu_report_*_err. However, the nvgpu_report_*_err APIs and their related files are not removed; during the creation of nvgpu-mon, they will be moved under nvgpu-rm in debug builds.

Note:
- A follow-up patch will fix the error IDs.
- As discussed in https://nvbugs/3491596 (comment #12), the high-level expectation is to report only errors.

JIRA NVGPU-7450

Change-Id: I428f2a9043086462754ac36a15edf6094985316f
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2662590
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
committed by mobile promotions
parent 2a98d20263
commit 7dc013d242
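In short, each call site changes from a per-unit reporting call to the single 32-bit-ID API plus a local log. Condensed from the gp10b CE hunks below:

	/* Before: a per-unit API carried unit-specific metadata to Safety_Services. */
	nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id,
			GPU_CE_LAUNCH_ERROR, ce_intr);

	/* After: one API, one 32-bit error ID; the descriptive message is now
	 * logged locally with nvgpu_err() instead of travelling with the report. */
	nvgpu_report_err_to_sdl(g, GPU_CE_LAUNCH_ERROR);
	nvgpu_err(g, "ce launch error interrupt");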
@@ -1135,6 +1135,7 @@ cic:
common/cic/mon/mon_pri.c,
common/cic/mon/mon_pmu.c,
common/cic/mon/mon_mmu.c,
common/cic/mon/mon_report_err.c,
common/cic/mon/cic_mon_priv.h,
include/nvgpu/gops/cic_mon.h,
include/nvgpu/cic_mon.h,

@@ -237,7 +237,7 @@ vm:
os/linux/nvgpu_ivm.c ]

cic:
sources: [ os/linux/cic/cic_stub.c ]
sources: [ os/linux/cic/cic_report_err.c ]

# Group all the Linux headers for now.
headers:
@@ -342,6 +342,7 @@ nvgpu-y += \
common/cic/mon/mon_pri.o \
common/cic/mon/mon_pmu.o \
common/cic/mon/mon_mmu.o \
common/cic/mon/mon_report_err.o \
common/cic/rm/rm_init.o \
common/cic/rm/rm_intr.o \
hal/bus/bus_gk20a.o \

@@ -472,7 +473,7 @@ nvgpu-y += \
os/linux/dt.o \
os/linux/ecc_sysfs.o \
os/linux/bsearch.o \
os/linux/cic/cic_stub.o \
os/linux/cic/cic_report_err.o \
os/linux/dmabuf_priv.o \
os/linux/power_ops.o
@@ -168,6 +168,7 @@ srcs += common/device.c \
common/cic/mon/mon_pri.c \
common/cic/mon/mon_pmu.c \
common/cic/mon/mon_mmu.c \
common/cic/mon/mon_report_err.c \
common/cic/rm/rm_init.c \
common/cic/rm/rm_intr.c \
hal/init/hal_gv11b.c \
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -69,15 +69,6 @@ void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.ce_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report CE error: "
"inst=%u err_id=%u intr_info=%u",
inst, err_id, intr_info);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -72,15 +72,6 @@ void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.ctxsw_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report CTXSW error: "
"err_id=%u, mailbox_val=%u",
err_id, err_info->mailbox_value);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -60,15 +60,6 @@ void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.ecc_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report ECC error: hw_unit=%u, inst=%u, "
"err_id=%u, err_addr=%llu, err_count=%llu",
hw_unit, inst, err_id, err_addr, err_count);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -99,26 +99,6 @@ void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
nvpgu_report_fill_err_info(hw_unit, &err_pkt, err_info);
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(sizeof(err_pkt.err_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
if (hw_unit == NVGPU_ERR_MODULE_SM) {
nvgpu_err(g, "Failed to report SM exception"
"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
err_pkt.err_info.sm_info.gpc,
err_pkt.err_info.sm_info.tpc,
err_pkt.err_info.sm_info.sm,
err_pkt.err_info.sm_info.warp_esr_status);
}
if (hw_unit == NVGPU_ERR_MODULE_PGRAPH) {
nvgpu_err(g, "Failed to report PGRAPH"
"exception: inst=%u, err_id=%u, "
"status=%u", inst, err_id,
err_pkt.err_info.gr_info.status);
}
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -69,15 +69,6 @@ void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.host_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report HOST error: "
"inst=%u, err_id=%u, intr_info=%u",
inst, err_id, intr_info);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -105,15 +105,6 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.mmu_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report MMU fault: hw_unit=%u, "
"err_id=%u, sub_err_type=%u, status=%u",
hw_unit, err_id, sub_err_type, status);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -65,15 +65,6 @@ void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.pmu_err_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report PMU error: "
"err_id=%u, sub_err_type=%u, status=%u",
err_id, sub_err_type, status);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -69,15 +69,6 @@ void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.pri_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report PRI error: "
"inst=%u, err_id=%u, err_code=%u",
inst, err_id, err_code);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
42	drivers/gpu/nvgpu/common/cic/mon/mon_report_err.c	Normal file
@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic_mon.h>

#include "cic_mon_priv.h"

void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id)
{
if (g->ops.cic_mon.report_err == NULL) {
return;
}

if (g->ops.cic_mon.report_err(g, err_id) != 0) {
nvgpu_err(g, "Failed to report an error: err_id=%x",
err_id);
nvgpu_sw_quiesce(g);
}
}
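With the new nvgpu_report_err_to_sdl() above, a handler only passes the error ID and logs details itself; on a reporting failure the API logs and quiesces the GPU internally. A minimal sketch of a call site (the interrupt-status check and mask here are hypothetical; the error ID and pattern are taken from the hunks that follow):

	/* Hypothetical handler fragment; example_error_pending_mask is illustrative. */
	if ((intr_status & example_error_pending_mask) != 0U) {
		nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR);
		nvgpu_err(g, "fifo sched error: 0x%08x", intr_status);
	}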
@@ -817,7 +817,7 @@ static int gr_init_ctxsw_falcon_support(struct gk20a *g, struct nvgpu_gr *gr)
err = nvgpu_gr_falcon_init_ctxsw(g, gr->falcon);
if (err != 0) {
gr_intr_report_ctxsw_error(g, GPU_FECS_CTXSW_INIT_ERROR, 0, 0);
nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_INIT_ERROR);
return err;
}
@@ -42,21 +42,6 @@
#include "gr_intr_priv.h"

void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value)
{
struct ctxsw_err_info err_info;
err_info.curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
err_info.ctxsw_status0 = g->ops.gr.falcon.read_fecs_ctxsw_status0(g);
err_info.ctxsw_status1 = g->ops.gr.falcon.read_fecs_ctxsw_status1(g);
err_info.mailbox_value = mailbox_value;
err_info.chid = chid;
nvgpu_report_ctxsw_err(g, NVGPU_ERR_MODULE_FECS,
err_type, (void *)&err_info);
}

static int gr_intr_handle_pending_tpc_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct nvgpu_channel *fault_ch,
u32 *hww_global_esr)

@@ -201,41 +186,6 @@ static void gr_intr_handle_class_error(struct gk20a *g,
NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
}

static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
struct gr_sm_mcerr_info err_info;
struct nvgpu_channel *ch;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx, inst = 0;
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
if (curr_ctx == 0U) {
return;
}
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;
if (ch != NULL) {
nvgpu_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.hww_warp_esr_pc = hww_warp_esr_pc;
err_info.hww_warp_esr_status = hww_warp_esr_status;
err_info.gpc = gpc;
err_info.tpc = tpc;
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
GPU_SM_MACHINE_CHECK_ERROR, &info, 0U);
}

/* Used by sw interrupt thread to translate current ctx to chid.
 * Also used by regops to translate current ctx to chid and tsgid.
 * For performance, we don't want to go through 128 channels every time.
@@ -318,35 +268,6 @@ unlock:
return ret_ch;
}

void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status, u32 sub_err_type)
{
struct nvgpu_channel *ch = NULL;
struct gr_exception_info err_info;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx;
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
if (curr_ctx != 0U) {
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
}
chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;
if (ch != NULL) {
nvgpu_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.status = status;
info.exception_info = &err_info;
nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
inst, err_type, &info, sub_err_type);
}

void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data, u32 error_notifier)
{

@@ -372,22 +293,6 @@ static bool is_global_esr_error(u32 global_esr, u32 global_mask)
return ((global_esr & ~global_mask) != 0U) ? true: false;
}

static void gr_intr_report_warp_error(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 global_esr, u32 warp_esr,
u32 global_mask, u32 offset)
{
u64 hww_warp_esr_pc = 0;
if (is_global_esr_error(global_esr, global_mask)) {
if (g->ops.gr.intr.get_sm_hww_warp_esr_pc != NULL) {
hww_warp_esr_pc = g->ops.gr.intr.get_sm_hww_warp_esr_pc(g,
offset);
}
gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr,
hww_warp_esr_pc);
}
}

#ifdef CONFIG_NVGPU_DEBUGGER
static int gr_intr_sm_exception_warp_sync(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
@@ -454,8 +359,11 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
/*
 * Check and report any fatal warp errors.
 */
gr_intr_report_warp_error(g, gpc, tpc, sm, global_esr, warp_esr,
global_mask, offset);
if (is_global_esr_error(global_esr, global_mask)) {
nvgpu_report_err_to_sdl(g, GPU_SM_MACHINE_CHECK_ERROR);
nvgpu_err(g, "sm machine check err. gpc_id(%d), tpc_id(%d), "
"offset(%d)", gpc, tpc, offset);
}
(void)nvgpu_pg_elpg_protected_call(g,
nvgpu_safe_cast_u32_to_s32(

@@ -570,9 +478,7 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch,
&& (mailbox_value ==
g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val())) {
gr_intr_report_ctxsw_error(g,
GPU_FECS_CTXSW_CRC_MISMATCH,
chid, mailbox_value);
nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_CRC_MISMATCH);
nvgpu_err(g, "ctxsw intr0 set by ucode, "
"ctxsw checksum mismatch");
ret = -1;

@@ -582,9 +488,7 @@
 * recovery is initiated and error is reported to
 * 3LSS.
 */
gr_intr_report_ctxsw_error(g,
GPU_FECS_FAULT_DURING_CTXSW,
chid, mailbox_value);
nvgpu_report_err_to_sdl(g, GPU_FECS_FAULT_DURING_CTXSW);
nvgpu_err(g,
"ctxsw intr0 set by ucode, error_code: 0x%08x",
mailbox_value);

@@ -593,17 +497,13 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch,
}

if (fecs_host_intr->fault_during_ctxsw_active) {
gr_intr_report_ctxsw_error(g,
GPU_FECS_FAULT_DURING_CTXSW,
chid, 0);
nvgpu_report_err_to_sdl(g, GPU_FECS_FAULT_DURING_CTXSW);
nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
ret = -1;
}

if (fecs_host_intr->watchdog_active) {
gr_intr_report_ctxsw_error(g,
GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
chid, 0);
nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT);
/* currently, recovery is not initiated */
nvgpu_err(g, "fecs watchdog triggered for channel %u, "
"cannot ctxsw anymore !!", chid);
@@ -861,7 +761,7 @@ static u32 gr_intr_handle_exception_interrupts(struct gk20a *g,
}

static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
u32 gr_intr, u32 *clear_intr,
u32 *clear_intr,
struct nvgpu_gr_intr_info *intr_info,
struct nvgpu_gr_isr_data *isr_data)
{

@@ -870,9 +770,7 @@
if (intr_info->illegal_notify != 0U) {
nvgpu_err(g, "illegal notify pending");
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_NOTIFY);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
nvgpu_gr_intr_set_error_notifier(g, isr_data,
NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
do_reset = 1U;

@@ -881,9 +779,7 @@
if (intr_info->illegal_method != 0U) {
if (gr_intr_handle_illegal_method(g, isr_data) != 0) {
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_METHOD);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
do_reset = 1U;
}

@@ -891,9 +787,7 @@
}

if (intr_info->illegal_class != 0U) {
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_CLASS);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
isr_data->class_num, isr_data->offset);

@@ -906,7 +800,7 @@
}

static u32 gr_intr_handle_error_interrupts(struct gk20a *g,
u32 gr_intr, u32 *clear_intr,
u32 *clear_intr,
struct nvgpu_gr_intr_info *intr_info,
struct nvgpu_gr_isr_data *isr_data)
{

@@ -923,9 +817,7 @@
}

if (intr_info->class_error != 0U) {
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_CLASS_ERROR);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
gr_intr_handle_class_error(g, isr_data);
do_reset = 1U;
*clear_intr &= ~intr_info->class_error;

@@ -1073,10 +965,10 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
gr_intr_handle_pending_interrupts(g, &clear_intr,
&intr_info, &isr_data);

need_reset |= gr_intr_handle_illegal_interrupts(g, gr_intr,
need_reset |= gr_intr_handle_illegal_interrupts(g,
&clear_intr, &intr_info, &isr_data);

need_reset |= gr_intr_handle_error_interrupts(g, gr_intr,
need_reset |= gr_intr_handle_error_interrupts(g,
&clear_intr, &intr_info, &isr_data);

need_reset |= gr_intr_handle_exception_interrupts(g, &clear_intr,
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -34,8 +34,9 @@
void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
u32 error_type)
{
nvgpu_report_pmu_err(g, NVGPU_ERR_MODULE_PMU,
GPU_PMU_BAR0_ERROR_TIMEOUT, error_type, bar0_status);
nvgpu_report_err_to_sdl(g, GPU_PMU_BAR0_ERROR_TIMEOUT);
nvgpu_err(g, "Falcon mem scrubbing timeout. status(0x%x), "
"error_type(0x%x)", bar0_status, error_type);
}

/* PMU engine reset functions */
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -156,6 +156,6 @@ void ga10b_bus_isr(struct gk20a *g)
bus_intr_0 & ~bus_intr_0_handled);
}

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, err_type, bus_intr_0);
nvgpu_report_err_to_sdl(g, err_type);
nvgpu_writel(g, bus_intr_0_r(), bus_intr_0);
}
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -89,7 +89,6 @@ void gk20a_bus_isr(struct gk20a *g)
 */
err_type = GPU_HOST_PBUS_TIMEOUT_ERROR;
}
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, err_type, val);
nvgpu_report_err_to_sdl(g, err_type);
nvgpu_writel(g, bus_intr_0_r(), val);
}
@@ -43,15 +43,13 @@ void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
/* clear blocking interrupts: they exibit broken behavior */
if ((ce_intr & ce_intr_status_blockpipe_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id,
GPU_CE_BLOCK_PIPE, ce_intr);
nvgpu_report_err_to_sdl(g, GPU_CE_BLOCK_PIPE);
nvgpu_err(g, "ce blocking pipe interrupt");
clear_intr |= ce_intr_status_blockpipe_pending_f();
}

if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id,
GPU_CE_LAUNCH_ERROR, ce_intr);
nvgpu_report_err_to_sdl(g, GPU_CE_LAUNCH_ERROR);
nvgpu_err(g, "ce launch error interrupt");
clear_intr |= ce_intr_status_launcherr_pending_f();
}
@@ -1,7 +1,7 @@
/*
 * Volta GPU series Copy Engine.
 *
 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -61,8 +61,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
 * reset to get back to a working state.
 */
if ((ce_intr & ce_intr_status_invalid_config_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id,
GPU_CE_INVALID_CONFIG, ce_intr);
nvgpu_report_err_to_sdl(g, GPU_CE_INVALID_CONFIG);
nvgpu_err(g, "ce: inst %d: invalid config", inst_id);
clear_intr |= ce_intr_status_invalid_config_reset_f();
}

@@ -74,8 +73,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
 * reset before operations can start again, if not the entire GPU.
 */
if ((ce_intr & ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id,
GPU_CE_METHOD_BUFFER_FAULT, ce_intr);
nvgpu_report_err_to_sdl(g, GPU_CE_METHOD_BUFFER_FAULT);
nvgpu_err(g, "ce: inst %d: mthd buffer fault", inst_id);
clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f();
}
@@ -511,11 +511,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
if ((niso_intr &
fb_niso_intr_mmu_other_fault_notify_m()) != 0U) {
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR,
NULL,
fault_status,
GPU_HUBMMU_OTHER_FAULT_NOTIFY);
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
"sub-err: OTHER_FAULT_NOTIFY. "
"fault_status(0x%x)", fault_status);
gv11b_fb_handle_dropped_mmu_fault(g, fault_status);

@@ -540,11 +539,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
if ((niso_intr &
fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) {
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR,
NULL,
fault_status,
GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW);
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
"sub-err: NONREPLAYABLE_FAULT_OVERFLOW. "
"fault_status(0x%x)", fault_status);
gv11b_fb_handle_nonreplay_fault_overflow(g,
fault_status);

@@ -565,11 +563,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
if ((niso_intr &
fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) {
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR,
NULL,
fault_status,
GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW);
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
"sub-err: REPLAYABLE_FAULT_OVERFLOW. "
"fault_status(0x%x)", fault_status);
gv11b_fb_handle_replay_fault_overflow(g,
fault_status);
@@ -1,7 +1,7 @@
/*
 * GV11B ECC INTR
 *
 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -49,11 +49,9 @@ static void gv11b_fb_intr_handle_ecc_l2tlb_errs(struct gk20a *g,
BUG();
}
if ((ecc_status & uncorrected_error_mask) != 0U) {
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc sa data error. "
"ecc_addr(0x%x)", ecc_addr);
}
}

@@ -137,11 +135,9 @@ static void gv11b_fb_intr_handle_ecc_hubtlb_errs(struct gk20a *g,
}
if ((ecc_status &
fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) {
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc sa data error. "
"ecc_addr(0x%x)", ecc_addr);
}
}

@@ -228,15 +224,14 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m())
!= 0U) {
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pte data error");
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc pte data error. "
"ecc_addr(0x%x)", ecc_addr);
}
if ((ecc_status &
fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error");
nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error"
"ecc_addr(0x%x)", ecc_addr);
/* This error is not expected to occur in gv11b and hence,
 * this scenario is considered as a fatal error.
 */

@@ -246,11 +241,9 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m())
!= 0U) {
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pde0 data error");
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc pde0 data error. "
"ecc_addr(0x%x)", ecc_addr);
}
}
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -272,8 +272,7 @@ void ga10b_fifo_ctxsw_timeout_isr(struct gk20a *g,
continue;
}

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR, tsgid);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR);

#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms);
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -215,9 +215,7 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g)
continue;
}

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR,
tsgid);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR);

#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
recover = g->ops.tsg.check_ctxsw_timeout(tsg,
@@ -294,8 +294,7 @@ static void ga10b_fifo_handle_bad_tsg(struct gk20a *g,
nvgpu_err(g, "runlist bad tsg error code not supported");
}

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, GPU_HOST_PFIFO_SCHED_ERROR, bad_tsg_code);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR);

/* id is unknown, preempt all runlists and do recovery */
/* TBD: nvgpu_rc_sched_error_bad_tsg(g); */
@@ -142,8 +142,7 @@ static u32 gk20a_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr)
}

if ((fifo_intr & fifo_intr_0_fb_flush_timeout_pending_f()) != 0U) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR, 0);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR);
nvgpu_err(g, "fifo fb flush timeout error");
handled |= fifo_intr_0_fb_flush_timeout_pending_f();
}
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),

@@ -70,8 +70,7 @@ void gk20a_fifo_intr_handle_chsw_error(struct gk20a *g)
u32 intr;

intr = nvgpu_readl(g, fifo_intr_chsw_error_r());
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, GPU_HOST_PFIFO_CHSW_ERROR, intr);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CHSW_ERROR);
nvgpu_err(g, "chsw: %08x", intr);
g->ops.gr.falcon.dump_stats(g);
nvgpu_writel(g, fifo_intr_chsw_error_r(), intr);
@@ -132,8 +132,7 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g)
nvgpu_err(g, "fifo sched error code not supported");
}

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
0, GPU_HOST_PFIFO_SCHED_ERROR, sched_error);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR);

if (sched_error == SCHED_ERROR_CODE_BAD_TSG) {
/* id is unknown, preempt all runlists and do recovery */

@@ -151,8 +150,7 @@ static u32 gv11b_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr)
if ((fifo_intr & fifo_intr_0_bind_error_pending_f()) != 0U) {
u32 bind_error = nvgpu_readl(g, fifo_intr_bind_error_r());
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0,
GPU_HOST_PFIFO_BIND_ERROR, bind_error);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_BIND_ERROR);
nvgpu_err(g, "fifo bind error: 0x%08x", bind_error);
handled |= fifo_intr_0_bind_error_pending_f();
}

@@ -163,17 +161,13 @@ static u32 gv11b_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr)
}

if ((fifo_intr & fifo_intr_0_memop_timeout_pending_f()) != 0U) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0,
GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR, 0);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR);
nvgpu_err(g, "fifo memop timeout error");
handled |= fifo_intr_0_memop_timeout_pending_f();
}

if ((fifo_intr & fifo_intr_0_lb_error_pending_f()) != 0U) {
u32 lb_error = nvgpu_readl(g, fifo_intr_lb_error_r());

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0,
GPU_HOST_PFIFO_LB_ERROR, lb_error);
nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_LB_ERROR);
nvgpu_err(g, "fifo lb error");
handled |= fifo_intr_0_lb_error_pending_f();
}
@@ -326,8 +326,9 @@ static void report_pbdma_error(struct gk20a *g, u32 pbdma_id,
err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR;
}
if (err_type != GPU_HOST_INVALID_ERROR) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
pbdma_id, err_type, pbdma_intr_0);
nvgpu_err(g, "pbdma_intr_0(%d)= 0x%08x ",
pbdma_id, pbdma_intr_0);
nvgpu_report_err_to_sdl(g, err_type);
}
return;
}

@@ -536,8 +537,7 @@ bool ga10b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
recover = true;

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, pbdma_id,
GPU_HOST_PBDMA_HCE_ERROR, pbdma_intr_1);
nvgpu_report_err_to_sdl(g, GPU_HOST_PBDMA_HCE_ERROR);

if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",
@@ -87,8 +87,8 @@ static void report_pbdma_error(struct gk20a *g, u32 pbdma_id,
err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR;
}
if (err_type != GPU_HOST_INVALID_ERROR) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST,
pbdma_id, err_type, pbdma_intr_0);
nvgpu_log_info(g, "pbdma id:%u", pbdma_id);
nvgpu_report_err_to_sdl(g, err_type);
}
return;
}

@@ -190,8 +190,7 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
recover = true;

nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, pbdma_id,
GPU_HOST_PBDMA_HCE_ERROR, pbdma_intr_1);
nvgpu_report_err_to_sdl(g, GPU_HOST_PBDMA_HCE_ERROR);

if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",
@@ -195,8 +195,7 @@ static u32 ga10b_gr_intr_check_gr_mme_fe1_exception(struct gk20a *g,
info_mthd = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd_r());
info_mthd2 = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd2_r());

nvgpu_gr_intr_report_exception(g, 0, GPU_PGRAPH_MME_FE1_EXCEPTION,
mme_fe1_hww_esr, 0U);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MME_FE1_EXCEPTION);
nvgpu_err(g, "mme_fe1 exception: esr 0x%08x, info 0x%08x,"
"info_mthd 0x%08x, info_mthd2 0x%08x",
mme_fe1_hww_esr, info, info_mthd, info_mthd2);

@@ -366,31 +365,29 @@ void ga10b_gr_intr_enable_exceptions(struct gk20a *g,
}

static void ga10b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
u32 ecc_status, u32 gpc, u32 correct_err, u32 uncorrect_err)
u32 ecc_status, u32 gpc)
{
(void)correct_err;

if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
nvgpu_err(g, "corrected ecc sa data error. "
"gpc_id(%d)", gpc);
}
if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
0U, uncorrect_err);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc sa data error"
"gpc_id(%d)", gpc);
}
if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
nvgpu_err(g, "corrected ecc fa data error"
"gpc_id(%d)", gpc);
}
if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
0U, uncorrect_err);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc fa data error"
"gpc_id(%d)", gpc);
}
}

@@ -467,9 +464,7 @@ void ga10b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
nvgpu_log(g, gpu_dbg_intr,
"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);

ga10b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc,
(u32)*corrected_err,
(u32)*uncorrected_err);
ga10b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc);

nvgpu_log(g, gpu_dbg_intr,
"ecc error address: 0x%x", ecc_addr);

@@ -747,15 +742,13 @@ static void ga10b_gr_intr_report_tpc_sm_rams_ecc_err(struct gk20a *g,
for (i = 0U; i < ecc_status->err_count; i++) {
if (ecc_status->err_id[i] == GPU_SM_RAMS_ECC_CORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << SHIFT_8_BITS) | tpc,
GPU_SM_L1_TAG_ECC_CORRECTED, 0,
g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter);
nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_CORRECTED);
nvgpu_err(g, "sm_l1_tag_ecc_corrected. "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
} else {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
(gpc << SHIFT_8_BITS) | tpc,
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc][tpc].counter);
nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_UNCORRECTED);
nvgpu_err(g, "sm_l1_tag_ecc_uncorrected. "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
}
}
}
@@ -106,9 +106,7 @@ u32 gm20b_gr_intr_check_gr_ssync_exception(struct gk20a *g, u32 exception)
g->ops.gr.intr.handle_ssync_hww(g, &ssync_esr);
reset_gpc = 1U;
}
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_SSYNC_EXCEPTION,
ssync_esr, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SSYNC_EXCEPTION);
}
return reset_gpc;
}

@@ -119,9 +117,7 @@ u32 gm20b_gr_intr_check_gr_mme_exception(struct gk20a *g, u32 exception)
u32 mme = nvgpu_readl(g, gr_mme_hww_esr_r());
u32 info = nvgpu_readl(g, gr_mme_hww_esr_info_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_MME_EXCEPTION,
mme, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MME_EXCEPTION);
nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
mme, info);
#ifdef CONFIG_NVGPU_DGPU

@@ -141,9 +137,7 @@ u32 gm20b_gr_intr_check_gr_sked_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_sked_m()) != 0U) {
u32 sked = nvgpu_readl(g, gr_sked_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_SKED_EXCEPTION,
sked, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SKED_EXCEPTION);
nvgpu_err(g, "sked exception: esr 0x%08x", sked);
nvgpu_writel(g, gr_sked_hww_esr_r(),
gr_sked_hww_esr_reset_active_f());

@@ -158,10 +152,8 @@ static u32 gr_gm20b_intr_check_gr_be_crop_exception(struct gk20a *g,
if ((exception & gr_pri_be0_becs_be_exception_crop_m()) != 0U) {
u32 crop = nvgpu_readl(g, gr_crop_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_BE_EXCEPTION,
crop, GPU_PGRAPH_BE_EXCEPTION_CROP);
nvgpu_err(g, "crop exception: esr 0x%08x", crop);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_BE_EXCEPTION);
nvgpu_err(g, "BE exception: crop exception: esr 0x%08x", crop);
nvgpu_writel(g, gr_crop_hww_esr_r(),
gr_crop_hww_esr_reset_active_f());
return 1U;

@@ -175,10 +167,8 @@ static u32 gr_gm20b_intr_check_gr_be_zrop_exception(struct gk20a *g,
if ((exception & gr_pri_be0_becs_be_exception_zrop_m()) != 0U) {
u32 zrop = nvgpu_readl(g, gr_zrop_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_BE_EXCEPTION,
zrop, GPU_PGRAPH_BE_EXCEPTION_ZROP);
nvgpu_err(g, "zrop exception: esr 0x%08x", zrop);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_BE_EXCEPTION);
nvgpu_err(g, "BE exception: zrop exception: esr 0x%08x", zrop);
nvgpu_writel(g, gr_zrop_hww_esr_r(),
gr_zrop_hww_esr_reset_active_f());
return 1U;

@@ -192,9 +182,7 @@ u32 gm20b_gr_intr_check_gr_fe_exception(struct gk20a *g, u32 exception)
u32 fe = nvgpu_readl(g, gr_fe_hww_esr_r());
u32 info = nvgpu_readl(g, gr_fe_hww_esr_info_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_FE_EXCEPTION,
fe, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_FE_EXCEPTION);
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
fe, info);
nvgpu_writel(g, gr_fe_hww_esr_r(),

@@ -209,9 +197,7 @@ u32 gm20b_gr_intr_check_gr_memfmt_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_memfmt_m()) != 0U) {
u32 memfmt = nvgpu_readl(g, gr_memfmt_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_MEMFMT_EXCEPTION,
memfmt, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MEMFMT_EXCEPTION);
nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
nvgpu_writel(g, gr_memfmt_hww_esr_r(),
gr_memfmt_hww_esr_reset_active_f());

@@ -225,9 +211,7 @@ u32 gm20b_gr_intr_check_gr_pd_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_pd_m()) != 0U) {
u32 pd = nvgpu_readl(g, gr_pd_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_PD_EXCEPTION,
pd, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_PD_EXCEPTION);
nvgpu_err(g, "pd exception: esr 0x%08x", pd);
nvgpu_writel(g, gr_pd_hww_esr_r(),
gr_pd_hww_esr_reset_active_f());

@@ -241,9 +225,7 @@ u32 gm20b_gr_intr_check_gr_scc_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_scc_m()) != 0U) {
u32 scc = nvgpu_readl(g, gr_scc_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_SCC_EXCEPTION,
scc, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SCC_EXCEPTION);
nvgpu_err(g, "scc exception: esr 0x%08x", scc);
nvgpu_writel(g, gr_scc_hww_esr_r(),
gr_scc_hww_esr_reset_active_f());

@@ -257,9 +239,7 @@ u32 gm20b_gr_intr_check_gr_ds_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_ds_m()) != 0U) {
u32 ds = nvgpu_readl(g, gr_ds_hww_esr_r());

nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_DS_EXCEPTION,
ds, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_DS_EXCEPTION);
nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
nvgpu_writel(g, gr_ds_hww_esr_r(),
gr_ds_hww_esr_reset_task_f());
@@ -88,18 +88,12 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g)
fecs_ecc_status.uncorrected_delta);

if (fecs_ecc_status.imem_corrected_err) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_IMEM_ECC_CORRECTED);
nvgpu_err(g, "imem ecc error corrected - error count:%d",
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
}
if (fecs_ecc_status.imem_uncorrected_err) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED);
nvgpu_err(g, "imem ecc error uncorrected - error count:%d",
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
}

@@ -112,10 +106,7 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g)
BUG();
}
if (fecs_ecc_status.dmem_uncorrected_err) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED);
nvgpu_err(g, "dmem ecc error uncorrected - error count %d",
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
}

@@ -350,9 +341,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
}
*uncorrected_err = nvgpu_safe_add_u32(*uncorrected_err,
gcc_l15_uncorrected_err_count_delta);
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc,
GPU_GCC_L15_ECC_UNCORRECTED,
0, *uncorrected_err);
nvgpu_report_err_to_sdl(g, GPU_GCC_L15_ECC_UNCORRECTED);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(), offset),
0);
@@ -364,11 +353,8 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
}

static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
u32 ecc_status, u32 gpc,
u32 correct_err, u32 uncorrect_err)
u32 ecc_status, u32 gpc)
{
(void)correct_err;

if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) !=
0U) {

@@ -381,10 +367,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
0, uncorrect_err);
nvgpu_err(g, "uncorrected ecc sa data error");
nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc);
}
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) !=

@@ -398,10 +382,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
0, uncorrect_err);
nvgpu_err(g, "uncorrected ecc fa data error");
nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc);
}
}
@@ -482,8 +464,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
nvgpu_err(g, "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x",
gpc, hww_esr);

gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc,
(u32)*corrected_err, (u32)*uncorrected_err);
gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc);

nvgpu_err(g, "ecc error address: 0x%x", ecc_addr);
nvgpu_err(g, "ecc error count corrected: %d, uncorrected %d",

@@ -491,22 +472,19 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
}

static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g,
u32 ecc_status, u32 ecc_addr, u32 gpc,
u32 correct_err, u32 uncorrect_err)
u32 ecc_status, u32 ecc_addr, u32 gpc)
{
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
ecc_addr, correct_err);
nvgpu_err(g, "imem ecc error corrected");
nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED);
nvgpu_err(g, "imem ecc error corrected"
"ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc);
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr, uncorrect_err);
nvgpu_err(g, "imem ecc error uncorrected");
nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED);
nvgpu_err(g, "imem ecc error uncorrected"
"ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc);
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {

@@ -518,10 +496,9 @@ static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g,
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr, uncorrect_err);
nvgpu_err(g, "dmem ecc error uncorrected");
nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED);
nvgpu_err(g, "dmem ecc error uncorrected"
"ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc);
}
}
@@ -538,9 +515,7 @@ void gv11b_gr_intr_handle_gpc_prop_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_prop_hww_esr_r(), offset));

nvgpu_gr_intr_report_exception(g, (gpc << 8U),
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_PROP);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);

/*
 * print additional diagnostic information.

@@ -584,9 +559,7 @@ void gv11b_gr_intr_handle_gpc_zcull_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_zcull_hww_esr_r(), offset));

nvgpu_gr_intr_report_exception(g, (gpc << 8U),
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_ZCULL);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);

/* clear the interrupt */
nvgpu_writel(g, nvgpu_safe_add_u32(

@@ -610,9 +583,7 @@ void gv11b_gr_intr_handle_gpc_setup_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_setup_hww_esr_r(), offset));

nvgpu_gr_intr_report_exception(g, (gpc << 8U),
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_SETUP);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);

/* clear the interrupt */
nvgpu_writel(g, nvgpu_safe_add_u32(

@@ -627,7 +598,7 @@ void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception)
{
u32 offset = nvgpu_gr_gpc_offset(g, gpc);
u32 hww_esr, sub_err_type;
u32 hww_esr;

if (((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) == 0U) &&
((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m())

@@ -638,17 +609,7 @@ void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_ppc0_pes_hww_esr_r(), offset));

if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) != 0U) {
sub_err_type = GPU_PGRAPH_GPC_GFX_EXCEPTION_PES0;
}

if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m()) != 0U) {
sub_err_type = GPU_PGRAPH_GPC_GFX_EXCEPTION_PES1;
}

nvgpu_gr_intr_report_exception(g, (gpc << 8U),
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, sub_err_type);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);

/* clear the interrupt */
nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -725,8 +686,7 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
nvgpu_err(g, "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);

gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc,
(u32)*corrected_err, (u32)*uncorrected_err);
gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc);

if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
nvgpu_err(g, "gpccs ecc counter overflow!");

@@ -753,9 +713,7 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "mpc hww esr 0x%08x", esr);

nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
GPU_PGRAPH_MPC_EXCEPTION,
esr, 0);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MPC_EXCEPTION);

esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),

@@ -781,9 +739,7 @@ void gv11b_gr_intr_handle_tpc_pe_exception(struct gk20a *g, u32 gpc, u32 tpc)
esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_pe_hww_esr_r(),
offset));
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
GPU_PGRAPH_GPC_GFX_EXCEPTION,
esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_TPC_PE);
nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);

nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "pe hww esr 0x%08x", esr);
@@ -938,24 +894,21 @@ static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g,
|
||||
|
||||
for (i = 0U; i < ecc_status->err_count; i++) {
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
|
||||
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_l1_tag_ecc_uncorrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0,
|
||||
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0,
|
||||
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -973,10 +926,9 @@ static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g,
|
||||
|
||||
for (i = 0U; i < ecc_status->err_count; i++) {
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED, 0,
|
||||
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_l1_tag_ecc_corrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_CORRECTED);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1296,10 +1248,7 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
|
||||
nvgpu_safe_add_u32(
|
||||
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
|
||||
lrf_uncorrected_err_count_delta);
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_LRF_ECC_UNCORRECTED, 0,
|
||||
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_LRF_ECC_UNCORRECTED);
|
||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||
gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset),
|
||||
0U);
|
||||
@@ -1431,10 +1380,7 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
|
||||
nvgpu_safe_add_u32(
|
||||
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter,
|
||||
cbu_uncorrected_err_count_delta);
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_CBU_ECC_UNCORRECTED,
|
||||
0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_CBU_ECC_UNCORRECTED);
|
||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||
gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset),
|
||||
0U);
|
||||
@@ -1562,10 +1508,7 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
|
||||
nvgpu_safe_add_u32(
|
||||
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter,
|
||||
l1_data_uncorrected_err_count_delta);
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_L1_DATA_ECC_UNCORRECTED,
|
||||
0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_L1_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||
gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset),
|
||||
0U);
|
||||
@@ -1588,31 +1531,27 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g,
|
||||
|
||||
for (i = 0U; i < ecc_status->err_count; i++) {
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
|
||||
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
|
||||
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
|
||||
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED) {
|
||||
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
|
||||
(gpc << SHIFT_8_BITS) | tpc,
|
||||
GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,
|
||||
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
|
||||
nvgpu_err(g, "sm_icache_l1_predecode_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
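Every hunk in this file applies the same mechanical transformation, so it is worth seeing once in reduced form. A hedged before/after sketch of the pattern (the nvgpu_err() line in the "after" half is illustrative for symmetry; the LRF hunk above does not itself add a print):

/* Before: per-unit API carrying module, instance, address and counter
 * metadata that Safety_Services can no longer accept in DRIVE 6.0. */
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
		(gpc << SHIFT_8_BITS) | tpc,
		GPU_SM_LRF_ECC_UNCORRECTED, 0,
		g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);

/* After: the diagnostic detail moves to the kernel log and only the
 * 32-bit error ID is handed to the SDL unit. */
nvgpu_err(g, "sm_lrf_ecc_uncorrected gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, GPU_SM_LRF_ECC_UNCORRECTED);
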
@@ -285,6 +285,9 @@
#include <nvgpu/grmgr.h>
#endif

#include "hal/cic/mon/cic_gv11b.h"
#include <nvgpu/cic_mon.h>

static int ga10b_init_gpu_characteristics(struct gk20a *g)
{
int err;

@@ -1713,6 +1716,11 @@ static const struct gops_mssnvlink ga10b_ops_mssnvlink = {
};
#endif

static const struct gops_cic_mon ga10b_ops_cic_mon = {
.init = gv11b_cic_mon_init,
.report_err = nvgpu_cic_mon_report_err_safety_services
};

int ga10b_init_hal(struct gk20a *g)
{
struct gpu_ops *gops = &g->ops;

@@ -1812,6 +1820,7 @@ int ga10b_init_hal(struct gk20a *g)
gops->tpc_pg = ga10b_ops_tpc_pg;
#endif
gops->grmgr = ga10b_ops_grmgr;
gops->cic_mon = ga10b_ops_cic_mon;
gops->chip_init_gpu_characteristics = ga10b_init_gpu_characteristics;
gops->get_litter_value = ga10b_get_litter_value;
gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

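With the ops table above installed by ga10b_init_hal(), every report funnels through one chip-selected hook. A hedged sketch of how the common wrapper plausibly dispatches through it; the NULL guard and the log text are assumptions, not code from this change:

void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id)
{
	/* Forward the 32-bit error ID through the CIC monitor hook
	 * chosen by the chip HAL, if one was installed. */
	if (g->ops.cic_mon.report_err != NULL) {
		if (g->ops.cic_mon.report_err(g, err_id) != 0) {
			nvgpu_err(g, "failed to report err_id 0x%08x to SDL",
				err_id);
		}
	}
}
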
@@ -411,11 +411,7 @@ static void ga10b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
nvgpu_wrapping_add_u32(
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED);
}

if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {

@@ -446,11 +442,7 @@ static void ga10b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
nvgpu_wrapping_add_u32(
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
}

if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) {

@@ -516,11 +508,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
corrected_delta);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_CORRECTED);

/*
* Using a SEC code will allow correction of an SBE (Single Bit

@@ -551,11 +539,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
uncorrected_delta);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
} else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) {
nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");

@@ -564,11 +548,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED);
} else {
nvgpu_err(g, "unsupported uncorrected dstg ecc error");
BUG();

@@ -126,12 +126,9 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
nvgpu_err(g, "tstg ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr);
}
}

@@ -148,12 +145,9 @@ void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED);
nvgpu_err(g, "dstg be ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr);
}
}

@@ -287,11 +281,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, dstg_ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
nvgpu_err(g, "dstg ecc error corrected. "
"ecc_addr(0x%x)", dstg_ecc_addr);

/*
* Using a SEC code will allow correction of an SBE (Single Bit

@@ -335,11 +327,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val);

nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, dstg_ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
nvgpu_err(g, "dstg ecc error uncorrected. "
"ecc_addr(0x%x)", dstg_ecc_addr);
}

nvgpu_writel(g, nvgpu_safe_add_u32(ltc_ltc0_lts0_intr_r(), offset),

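The gv11b LTC hunks also show the second goal of this change: debug-only nvgpu_log() prints on error paths are promoted to unconditional nvgpu_err() prints. Reduced to its essence, with identifiers taken from the hunks above:

/* was: visible only when the gpu_dbg_intr log mask is enabled */
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");

/* now: always logged, plus the bare 32-bit ID for SDL */
nvgpu_err(g, "tstg ecc error uncorrected. ecc_addr(0x%x)", ecc_addr);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
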
@@ -521,11 +521,10 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g,
}
#endif

nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR,
mmufault,
fault_status,
sub_err_type);
nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
nvgpu_err(g, "sub_err_type = 0x%x, "
"fault_status = 0x%x",
sub_err_type, fault_status);

nvgpu_assert(get_indx < U32_MAX);
nvgpu_assert(entries != 0U);

@@ -141,24 +141,20 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)

if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_IMEM_ECC_CORRECTED);
nvgpu_err(g, "falcon imem ecc error corrected. "
"ecc_addr(0x%x)", ecc_addr);
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED);
nvgpu_err(g, "falcon imem ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr);
ret = -EFAULT;
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
nvgpu_err(g, "falcon dmem ecc error corrected");
/* This error is not expected to occur in gv11b and hence,
* this scenario is considered as a fatal error.
*/

@@ -167,11 +163,9 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED);
nvgpu_err(g, "falcon dmem ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr);
ret = -EFAULT;
}

@@ -1,7 +1,7 @@
/*
* GA10B priv ring
*
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -378,8 +378,7 @@ void ga10b_priv_ring_decode_error_code(struct gk20a *g, u32 error_code)
size_t lookup_table_size = 1;
size_t index = 0;

nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, 0,
GPU_PRI_ACCESS_VIOLATION, 0, error_code);
nvgpu_report_err_to_sdl(g, GPU_PRI_ACCESS_VIOLATION);

err_code = pri_sys_pri_error_code_v(error_code);
error_extra = pri_sys_pri_error_extra_v(error_code);

@@ -1,7 +1,7 @@
/*
* GP10B priv ring
*
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -71,8 +71,7 @@ void gp10b_priv_ring_decode_error_code(struct gk20a *g,
{
u32 error_type_index;

nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, 0,
GPU_PRI_ACCESS_VIOLATION, 0, error_code);
nvgpu_report_err_to_sdl(g, GPU_PRI_ACCESS_VIOLATION);

error_type_index = (error_code & 0x00000f00U) >> 8U;
error_code = error_code & 0xBADFf000U;

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -32,7 +32,6 @@
void ga10b_ptimer_isr(struct gk20a *g)
{
u32 save0, save1, fecs_errcode = 0;
u32 inst = 0U;
u32 error_addr;

save0 = nvgpu_readl(g, timer_pri_timeout_save_0_r());

@@ -62,13 +61,7 @@ void ga10b_ptimer_isr(struct gk20a *g)
g->ops.priv_ring.decode_error_code(g,
fecs_errcode);
}
/* FECS was the target of PRI access */
inst = 1U;
/* SAVE_0_ADDR cannot be used in this case */
error_addr = 0U;
}

nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI,
inst, GPU_PRI_TIMEOUT_ERROR,
error_addr, fecs_errcode);
nvgpu_report_err_to_sdl(g, GPU_PRI_TIMEOUT_ERROR);
}

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -32,7 +32,6 @@
void gk20a_ptimer_isr(struct gk20a *g)
{
u32 save0, save1, fecs_errcode = 0;
u32 inst = 0U;
u32 error_addr;

save0 = gk20a_readl(g, timer_pri_timeout_save_0_r());

@@ -55,10 +54,6 @@ void gk20a_ptimer_isr(struct gk20a *g)
g->ops.priv_ring.decode_error_code(g,
fecs_errcode);
}
/* FECS was the target of PRI access */
inst = 1U;
/* SAVE_0_ADDR cannot be used in this case */
error_addr = 0U;
}

nvgpu_err(g, "PRI timeout: ADR 0x%08x "

@@ -70,9 +65,7 @@ void gk20a_ptimer_isr(struct gk20a *g)
gk20a_writel(g, timer_pri_timeout_save_0_r(), 0);
gk20a_writel(g, timer_pri_timeout_save_1_r(), 0);

nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI,
inst, GPU_PRI_TIMEOUT_ERROR,
error_addr, fecs_errcode);
nvgpu_report_err_to_sdl(g, GPU_PRI_TIMEOUT_ERROR);
}

#ifdef CONFIG_NVGPU_IOCTL_NON_FUSA

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -375,25 +375,14 @@ int nvgpu_cic_mon_get_err_desc(struct gk20a *g, u32 hw_unit_id,
* used by sub-units in nvgpu-rm and SDL unit.
*
* @param g [in] - The GPU driver struct.
* @param err_info [in] - Error message.
* @param err_size [in] - Size of the error message.
* @param is_critical [in] - Criticality of the error being reported.
* @param err_id [in] - Error ID.
*
* On QNX:
* - Checks whether SDL is initialized.
* - Enqueues \a err_info into error message queue.
* - Signals the workqueue condition variable.
* - If the reported error is critical, invokes #nvgpu_sw_quiesce() api.
*
* On Linux:
* - NOP currently as safety services are absent in Linux
* - Reports the errors to Safety_Services.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EAGAIN if SDL not initialized.
* @retval -ENOMEM if sufficient memory is not available.
*/
int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
void *err_info, size_t err_size, bool is_critical);
u32 err_id);

/**
* @brief Get the number of HW modules supported by CIC.

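A minimal caller sketch for the API above; the -EAGAIN handling follows the @retval contract documented here, and the surrounding function is hypothetical:

static void report_example(struct gk20a *g)
{
	int err = nvgpu_cic_mon_report_err_safety_services(g,
			GPU_PRI_ACCESS_VIOLATION);

	if (err == -EAGAIN) {
		/* SDL is not initialized yet; the report is dropped. */
		nvgpu_err(g, "SDL not ready, err_id 0x%x not reported",
				GPU_PRI_ACCESS_VIOLATION);
	}
}
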
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -53,15 +53,11 @@ struct gops_cic_mon {
* @brief Report error to safety services.
*
* @param g [in] Pointer to GPU driver struct.
* @param err_pkt [in] Pointer to struct holding err details.
* @param err_size [in] Size of err_pkt.
* @param is_critical [in] Flag indicating criticality of error.
* @param err_id [in] Error ID.
*
* @return 0 in case of success, < 0 in case of failure.
*/
int (*report_err)(struct gk20a *g,
void *err_pkt, size_t err_size,
bool is_critical);
int (*report_err)(struct gk20a *g, u32 err_id);
};

#endif/*NVGPU_GOPS_CIC_MON_H*/

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -125,15 +125,11 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter
* with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
* "nvgpu_report_ecc_err" with following parameters:
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED"
* -# ecc address read above
* -# g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter
* -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is
* set in ecc status, then it is considered as fatal error as it is not
* expected and call \ref BUG "BUG()".

@@ -143,15 +139,11 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter
* with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
* "nvgpu_report_ecc_err" with following parameters:
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED"
* -# ecc address read above
* -# g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter
* -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is
* set in ecc status, then it is considered as fatal error as it is not
* expected and call \ref BUG "BUG()".

@@ -162,15 +154,11 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter
* with corrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
* "nvgpu_report_ecc_err" with following parameters:
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED
* "GPU_LTC_CACHE_DSTG_ECC_CORRECTED"
* -# ecc address read above.
* -# g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter
* -# Flush the L2 cache by calling
* \ref gops_mm_cache.l2_flush "gops_mm_cache.l2_flush".
* -# If it fails then call \ref BUG "BUG()".

@@ -182,28 +170,20 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter
* with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
* "nvgpu_report_ecc_err" with following parameters:
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED"
* -# ecc address read above.
* -# g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter
* -# Else if the ECC address corresponds to DSTG BE RAM:
* -# Increment g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter
* with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
* "nvgpu_report_ecc_err" with following parameters:
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
* "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED"
* -# ecc address read above
* -# g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter
* -# Else call \ref BUG "BUG()" as this type of ECC error is not supported.
* -# Clear the register ltc_ltc0_lts0_intr3_r() by writing the read value.
* - return 0

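Condensed into code, the documented RSTG uncorrected-error sequence now reads roughly as below. This is a sketch assembled from the ga10b handler earlier in this change, not a verbatim excerpt; local variable names are illustrative:

/* ecc_status, ecc_addr and uncorrected_delta were read earlier */
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter =
	nvgpu_wrapping_add_u32(
		g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter,
		uncorrected_delta);
nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED);

if ((ecc_status &
		ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {
	/* corrected RSTG errors are not expected here: treat as fatal */
	BUG();
}
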
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -284,8 +284,8 @@ struct gops_priv_ring {
* "pri route error"
* };
* \endcode
* - Invoke \ref #nvgpu_report_pri_err "nvgpu_report_pri_err" with parameters \a g,
* #NVGPU_ERR_MODULE_PRI, #GPU_PRI_ACCESS_VIOLATION, 0, error_code respectively.
* - Invoke \ref #nvgpu_report_err_to_sdl "nvgpu_report_err_to_sdl" with parameters \a g,
* #GPU_PRI_ACCESS_VIOLATION, respectively.
* - Declare a variable error_type_index and store the bits [8-12] as below.
* error_type_index will be used as an index to the above error tables.
* error_code is also updated.

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -91,14 +91,10 @@ struct gops_ptimer {
* - Clear timer_pri_timeout_save_0_r() and timer_pri_timeout_save_1_r()
* registers so that the next pri access error can be recorded. Write
* 0 to these two registers to clear the previous error information.
* - Report the PRI_TIMEOUT_ERROR to SDL unit using \ref nvgpu_report_pri_err()
* API. The inputs to \ref nvgpu_report_pri_err() are -
* - Report the PRI_TIMEOUT_ERROR to SDL unit using \ref nvgpu_report_err_to_sdl()
* API. The inputs to \ref nvgpu_report_err_to_sdl() are -
* - g,
* - NVGPU_ERR_MODULE_PRI,
* - inst,
* - GPU_PRI_TIMEOUT_ERROR,
* - error_addr,
* - fecs_errcode
* - GPU_PRI_TIMEOUT_ERROR.
*/
void (*isr)(struct gk20a *g);

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -112,23 +112,6 @@ void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g,
void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data);

/**
* @brief Report GR exceptions to qnx.sdl unit.
*
* @param g [in] Pointer to GPU driver struct.
* @param inst [in] Unit instance ID.
* @param err_type [in] Error type.
* @param status [in] Exception status value.
* @param sub_err_type [in] Sub error type.
*
* This function reports all GR exceptions to qnx.sdl unit.
*
* Other interrupt handling functions like #nvgpu_gr_intr_handle_fecs_error()
* call this function to report exceptions to qnx.sdl.
*/
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status, u32 sub_err_type);

/**
* @brief Translate context to channel ID.
*

@@ -223,7 +206,6 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
* @see nvgpu_gr_intr_handle_notify_pending
* @see nvgpu_gr_intr_handle_semaphore_pending
* @see nvgpu_gr_intr_handle_sm_exception
* @see nvgpu_gr_intr_report_exception
* @see nvgpu_gr_intr_set_error_notifier
*/
int nvgpu_gr_intr_stall_isr(struct gk20a *g);

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),

@@ -1195,4 +1195,16 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value);

/**
* @brief This is a wrapper function to report errors from NvGPU units to SDL.
*
* @param g [in] - The GPU driver struct.
* @param err_id [in] - Error ID.
*
* Calls nvgpu_report_err_to_ss to report errors to Safety_Services.
*
* @return None
*/
void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id);

#endif /* NVGPU_NVGPU_ERR_H */

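A typical call site for the wrapper declared above, mirroring the HUBMMU hunk earlier in this change: keep the human-readable detail in the kernel log, and give SDL only the 32-bit ID it can accept. The enclosing function is hypothetical:

static void hubmmu_fault_report_sketch(struct gk20a *g, u32 sub_err_type,
		u32 fault_status)
{
	nvgpu_err(g, "sub_err_type = 0x%x, fault_status = 0x%x",
			sub_err_type, fault_status);
	nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
}
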
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
* Copyright (c) 2021-2022, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,

@@ -20,7 +20,11 @@
struct gk20a;

int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
void *err_info, size_t err_size, bool is_critical)
u32 metadata)
{
/**
* ToDo: Add MISC_EC API to report error.
* Decide on triggering SW quiesce for UE.
*/
return 0;
}

@@ -50,11 +50,9 @@ void nvgpu_ecc_sysfs_remove(struct gk20a *g)
#endif

int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
void *err_info, size_t err_size, bool is_critical)
u32 err_id)
{
(void)g;
(void)err_info;
(void)err_size;
(void)is_critical;
(void)err_id;
return 0;
}