gpu: nvgpu: merge error reporting apis

In DRIVE 6.0, NvGPU is allowed to report only 32-bit metadata to
Safety_Services. So, there is no need to have distinct APIs for
reporting errors from units like GR, MM, FIFO to SDL unit. All
these error reporting APIs will be replaced with a single API. To
meet this objective, this patch does the following changes:
- Replaces nvgpu_report_*_err with nvgpu_report_err_to_sdl.
- Removes the reporting of error messages.
- Replaces nvgpu_log() with nvgpu_err(), for error reporting.
- Removes error reporting to Safety_Services from nvgpu_report_*_err.

However, nvgpu_report_*_err APIs and their related files are not
removed. During the creation of nvgpu-mon, they will be moved under
nvgpu-rm, in debug builds.

Note:
- There will be a follow-up patch to fix error IDs.
- As discussed in https://nvbugs/3491596 (comment #12), the high
level expectation is to report only errors.

JIRA NVGPU-7450

Change-Id: I428f2a9043086462754ac36a15edf6094985316f
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2662590
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Rajesh Devaraj
2022-02-01 16:04:37 +00:00
committed by mobile promotions
parent 2a98d20263
commit 7dc013d242
51 changed files with 310 additions and 658 deletions

View File

@@ -1135,6 +1135,7 @@ cic:
common/cic/mon/mon_pri.c, common/cic/mon/mon_pri.c,
common/cic/mon/mon_pmu.c, common/cic/mon/mon_pmu.c,
common/cic/mon/mon_mmu.c, common/cic/mon/mon_mmu.c,
common/cic/mon/mon_report_err.c,
common/cic/mon/cic_mon_priv.h, common/cic/mon/cic_mon_priv.h,
include/nvgpu/gops/cic_mon.h, include/nvgpu/gops/cic_mon.h,
include/nvgpu/cic_mon.h, include/nvgpu/cic_mon.h,

View File

@@ -237,7 +237,7 @@ vm:
os/linux/nvgpu_ivm.c ] os/linux/nvgpu_ivm.c ]
cic: cic:
sources: [ os/linux/cic/cic_stub.c ] sources: [ os/linux/cic/cic_report_err.c ]
# Group all the Linux headers for now. # Group all the Linux headers for now.
headers: headers:

View File

@@ -342,6 +342,7 @@ nvgpu-y += \
common/cic/mon/mon_pri.o \ common/cic/mon/mon_pri.o \
common/cic/mon/mon_pmu.o \ common/cic/mon/mon_pmu.o \
common/cic/mon/mon_mmu.o \ common/cic/mon/mon_mmu.o \
common/cic/mon/mon_report_err.o \
common/cic/rm/rm_init.o \ common/cic/rm/rm_init.o \
common/cic/rm/rm_intr.o \ common/cic/rm/rm_intr.o \
hal/bus/bus_gk20a.o \ hal/bus/bus_gk20a.o \
@@ -472,7 +473,7 @@ nvgpu-y += \
os/linux/dt.o \ os/linux/dt.o \
os/linux/ecc_sysfs.o \ os/linux/ecc_sysfs.o \
os/linux/bsearch.o \ os/linux/bsearch.o \
os/linux/cic/cic_stub.o \ os/linux/cic/cic_report_err.o \
os/linux/dmabuf_priv.o \ os/linux/dmabuf_priv.o \
os/linux/power_ops.o os/linux/power_ops.o

View File

@@ -168,6 +168,7 @@ srcs += common/device.c \
common/cic/mon/mon_pri.c \ common/cic/mon/mon_pri.c \
common/cic/mon/mon_pmu.c \ common/cic/mon/mon_pmu.c \
common/cic/mon/mon_mmu.c \ common/cic/mon/mon_mmu.c \
common/cic/mon/mon_report_err.c \
common/cic/rm/rm_init.c \ common/cic/rm/rm_init.c \
common/cic/rm/rm_intr.c \ common/cic/rm/rm_intr.c \
hal/init/hal_gv11b.c \ hal/init/hal_gv11b.c \

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -69,15 +69,6 @@ void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.ce_info)); sizeof(err_pkt.err_info.ce_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report CE error: "
"inst=%u err_id=%u intr_info=%u",
inst, err_id, intr_info);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -72,15 +72,6 @@ void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.ctxsw_info)); sizeof(err_pkt.err_info.ctxsw_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report CTXSW error: "
"err_id=%u, mailbox_val=%u",
err_id, err_info->mailbox_value);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -60,15 +60,6 @@ void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.ecc_info)); sizeof(err_pkt.err_info.ecc_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report ECC error: hw_unit=%u, inst=%u, "
"err_id=%u, err_addr=%llu, err_count=%llu",
hw_unit, inst, err_id, err_addr, err_count);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -99,26 +99,6 @@ void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
nvpgu_report_fill_err_info(hw_unit, &err_pkt, err_info); nvpgu_report_fill_err_info(hw_unit, &err_pkt, err_info);
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(sizeof(err_pkt.err_info)); err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(sizeof(err_pkt.err_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
if (hw_unit == NVGPU_ERR_MODULE_SM) {
nvgpu_err(g, "Failed to report SM exception"
"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
err_pkt.err_info.sm_info.gpc,
err_pkt.err_info.sm_info.tpc,
err_pkt.err_info.sm_info.sm,
err_pkt.err_info.sm_info.warp_esr_status);
}
if (hw_unit == NVGPU_ERR_MODULE_PGRAPH) {
nvgpu_err(g, "Failed to report PGRAPH"
"exception: inst=%u, err_id=%u, "
"status=%u", inst, err_id,
err_pkt.err_info.gr_info.status);
}
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -69,15 +69,6 @@ void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.host_info)); sizeof(err_pkt.err_info.host_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report HOST error: "
"inst=%u, err_id=%u, intr_info=%u",
inst, err_id, intr_info);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -105,15 +105,6 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.mmu_info)); sizeof(err_pkt.err_info.mmu_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report MMU fault: hw_unit=%u, "
"err_id=%u, sub_err_type=%u, status=%u",
hw_unit, err_id, sub_err_type, status);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -65,15 +65,6 @@ void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.pmu_err_info)); sizeof(err_pkt.err_info.pmu_err_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report PMU error: "
"err_id=%u, sub_err_type=%u, status=%u",
err_id, sub_err_type, status);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -69,15 +69,6 @@ void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.pri_info)); sizeof(err_pkt.err_info.pri_info));
if (g->ops.cic_mon.report_err != NULL) {
err = g->ops.cic_mon.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report PRI error: "
"inst=%u, err_id=%u, err_code=%u",
inst, err_id, err_code);
}
}
handle_report_failure: handle_report_failure:
if (err != 0) { if (err != 0) {
nvgpu_sw_quiesce(g); nvgpu_sw_quiesce(g);

View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic_mon.h>
#include "cic_mon_priv.h"
void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id)
{
if (g->ops.cic_mon.report_err == NULL) {
return;
}
if (g->ops.cic_mon.report_err(g, err_id) != 0) {
nvgpu_err(g, "Failed to report an error: err_id=%x",
err_id);
nvgpu_sw_quiesce(g);
}
}

View File

@@ -817,7 +817,7 @@ static int gr_init_ctxsw_falcon_support(struct gk20a *g, struct nvgpu_gr *gr)
err = nvgpu_gr_falcon_init_ctxsw(g, gr->falcon); err = nvgpu_gr_falcon_init_ctxsw(g, gr->falcon);
if (err != 0) { if (err != 0) {
gr_intr_report_ctxsw_error(g, GPU_FECS_CTXSW_INIT_ERROR, 0, 0); nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_INIT_ERROR);
return err; return err;
} }

View File

@@ -42,21 +42,6 @@
#include "gr_intr_priv.h" #include "gr_intr_priv.h"
void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value)
{
struct ctxsw_err_info err_info;
err_info.curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
err_info.ctxsw_status0 = g->ops.gr.falcon.read_fecs_ctxsw_status0(g);
err_info.ctxsw_status1 = g->ops.gr.falcon.read_fecs_ctxsw_status1(g);
err_info.mailbox_value = mailbox_value;
err_info.chid = chid;
nvgpu_report_ctxsw_err(g, NVGPU_ERR_MODULE_FECS,
err_type, (void *)&err_info);
}
static int gr_intr_handle_pending_tpc_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, static int gr_intr_handle_pending_tpc_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct nvgpu_channel *fault_ch, bool *post_event, struct nvgpu_channel *fault_ch,
u32 *hww_global_esr) u32 *hww_global_esr)
@@ -201,41 +186,6 @@ static void gr_intr_handle_class_error(struct gk20a *g,
NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY); NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
} }
static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
struct gr_sm_mcerr_info err_info;
struct nvgpu_channel *ch;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx, inst = 0;
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
if (curr_ctx == 0U) {
return;
}
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;
if (ch != NULL) {
nvgpu_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.hww_warp_esr_pc = hww_warp_esr_pc;
err_info.hww_warp_esr_status = hww_warp_esr_status;
err_info.gpc = gpc;
err_info.tpc = tpc;
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
GPU_SM_MACHINE_CHECK_ERROR, &info, 0U);
}
/* Used by sw interrupt thread to translate current ctx to chid. /* Used by sw interrupt thread to translate current ctx to chid.
* Also used by regops to translate current ctx to chid and tsgid. * Also used by regops to translate current ctx to chid and tsgid.
* For performance, we don't want to go through 128 channels every time. * For performance, we don't want to go through 128 channels every time.
@@ -318,35 +268,6 @@ unlock:
return ret_ch; return ret_ch;
} }
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status, u32 sub_err_type)
{
struct nvgpu_channel *ch = NULL;
struct gr_exception_info err_info;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx;
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
if (curr_ctx != 0U) {
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
}
chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;
if (ch != NULL) {
nvgpu_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.status = status;
info.exception_info = &err_info;
nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
inst, err_type, &info, sub_err_type);
}
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g, void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data, u32 error_notifier) struct nvgpu_gr_isr_data *isr_data, u32 error_notifier)
{ {
@@ -372,22 +293,6 @@ static bool is_global_esr_error(u32 global_esr, u32 global_mask)
return ((global_esr & ~global_mask) != 0U) ? true: false; return ((global_esr & ~global_mask) != 0U) ? true: false;
} }
static void gr_intr_report_warp_error(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 global_esr, u32 warp_esr,
u32 global_mask, u32 offset)
{
u64 hww_warp_esr_pc = 0;
if (is_global_esr_error(global_esr, global_mask)) {
if (g->ops.gr.intr.get_sm_hww_warp_esr_pc != NULL) {
hww_warp_esr_pc = g->ops.gr.intr.get_sm_hww_warp_esr_pc(g,
offset);
}
gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr,
hww_warp_esr_pc);
}
}
#ifdef CONFIG_NVGPU_DEBUGGER #ifdef CONFIG_NVGPU_DEBUGGER
static int gr_intr_sm_exception_warp_sync(struct gk20a *g, static int gr_intr_sm_exception_warp_sync(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm, u32 gpc, u32 tpc, u32 sm,
@@ -454,8 +359,11 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
/* /*
* Check and report any fatal warp errors. * Check and report any fatal warp errors.
*/ */
gr_intr_report_warp_error(g, gpc, tpc, sm, global_esr, warp_esr, if (is_global_esr_error(global_esr, global_mask)) {
global_mask, offset); nvgpu_report_err_to_sdl(g, GPU_SM_MACHINE_CHECK_ERROR);
nvgpu_err(g, "sm machine check err. gpc_id(%d), tpc_id(%d), "
"offset(%d)", gpc, tpc, offset);
}
(void)nvgpu_pg_elpg_protected_call(g, (void)nvgpu_pg_elpg_protected_call(g,
nvgpu_safe_cast_u32_to_s32( nvgpu_safe_cast_u32_to_s32(
@@ -570,9 +478,7 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch,
&& (mailbox_value == && (mailbox_value ==
g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val())) { g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val())) {
gr_intr_report_ctxsw_error(g, nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_CRC_MISMATCH);
GPU_FECS_CTXSW_CRC_MISMATCH,
chid, mailbox_value);
nvgpu_err(g, "ctxsw intr0 set by ucode, " nvgpu_err(g, "ctxsw intr0 set by ucode, "
"ctxsw checksum mismatch"); "ctxsw checksum mismatch");
ret = -1; ret = -1;
@@ -582,9 +488,7 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch,
* recovery is initiated and error is reported to * recovery is initiated and error is reported to
* 3LSS. * 3LSS.
*/ */
gr_intr_report_ctxsw_error(g, nvgpu_report_err_to_sdl(g, GPU_FECS_FAULT_DURING_CTXSW);
GPU_FECS_FAULT_DURING_CTXSW,
chid, mailbox_value);
nvgpu_err(g, nvgpu_err(g,
"ctxsw intr0 set by ucode, error_code: 0x%08x", "ctxsw intr0 set by ucode, error_code: 0x%08x",
mailbox_value); mailbox_value);
@@ -593,17 +497,13 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch,
} }
if (fecs_host_intr->fault_during_ctxsw_active) { if (fecs_host_intr->fault_during_ctxsw_active) {
gr_intr_report_ctxsw_error(g, nvgpu_report_err_to_sdl(g, GPU_FECS_FAULT_DURING_CTXSW);
GPU_FECS_FAULT_DURING_CTXSW,
chid, 0);
nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid); nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
ret = -1; ret = -1;
} }
if (fecs_host_intr->watchdog_active) { if (fecs_host_intr->watchdog_active) {
gr_intr_report_ctxsw_error(g, nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT);
GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
chid, 0);
/* currently, recovery is not initiated */ /* currently, recovery is not initiated */
nvgpu_err(g, "fecs watchdog triggered for channel %u, " nvgpu_err(g, "fecs watchdog triggered for channel %u, "
"cannot ctxsw anymore !!", chid); "cannot ctxsw anymore !!", chid);
@@ -861,7 +761,7 @@ static u32 gr_intr_handle_exception_interrupts(struct gk20a *g,
} }
static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g, static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
u32 gr_intr, u32 *clear_intr, u32 *clear_intr,
struct nvgpu_gr_intr_info *intr_info, struct nvgpu_gr_intr_info *intr_info,
struct nvgpu_gr_isr_data *isr_data) struct nvgpu_gr_isr_data *isr_data)
{ {
@@ -870,9 +770,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
if (intr_info->illegal_notify != 0U) { if (intr_info->illegal_notify != 0U) {
nvgpu_err(g, "illegal notify pending"); nvgpu_err(g, "illegal notify pending");
nvgpu_gr_intr_report_exception(g, 0U, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_NOTIFY);
nvgpu_gr_intr_set_error_notifier(g, isr_data, nvgpu_gr_intr_set_error_notifier(g, isr_data,
NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY); NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
do_reset = 1U; do_reset = 1U;
@@ -881,9 +779,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
if (intr_info->illegal_method != 0U) { if (intr_info->illegal_method != 0U) {
if (gr_intr_handle_illegal_method(g, isr_data) != 0) { if (gr_intr_handle_illegal_method(g, isr_data) != 0) {
nvgpu_gr_intr_report_exception(g, 0U, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_METHOD);
do_reset = 1U; do_reset = 1U;
} }
@@ -891,9 +787,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
} }
if (intr_info->illegal_class != 0U) { if (intr_info->illegal_class != 0U) {
nvgpu_gr_intr_report_exception(g, 0U, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_CLASS);
nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x", nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
isr_data->class_num, isr_data->offset); isr_data->class_num, isr_data->offset);
@@ -906,7 +800,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
} }
static u32 gr_intr_handle_error_interrupts(struct gk20a *g, static u32 gr_intr_handle_error_interrupts(struct gk20a *g,
u32 gr_intr, u32 *clear_intr, u32 *clear_intr,
struct nvgpu_gr_intr_info *intr_info, struct nvgpu_gr_intr_info *intr_info,
struct nvgpu_gr_isr_data *isr_data) struct nvgpu_gr_isr_data *isr_data)
{ {
@@ -923,9 +817,7 @@ static u32 gr_intr_handle_error_interrupts(struct gk20a *g,
} }
if (intr_info->class_error != 0U) { if (intr_info->class_error != 0U) {
nvgpu_gr_intr_report_exception(g, 0U, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR);
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_CLASS_ERROR);
gr_intr_handle_class_error(g, isr_data); gr_intr_handle_class_error(g, isr_data);
do_reset = 1U; do_reset = 1U;
*clear_intr &= ~intr_info->class_error; *clear_intr &= ~intr_info->class_error;
@@ -1073,10 +965,10 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
gr_intr_handle_pending_interrupts(g, &clear_intr, gr_intr_handle_pending_interrupts(g, &clear_intr,
&intr_info, &isr_data); &intr_info, &isr_data);
need_reset |= gr_intr_handle_illegal_interrupts(g, gr_intr, need_reset |= gr_intr_handle_illegal_interrupts(g,
&clear_intr, &intr_info, &isr_data); &clear_intr, &intr_info, &isr_data);
need_reset |= gr_intr_handle_error_interrupts(g, gr_intr, need_reset |= gr_intr_handle_error_interrupts(g,
&clear_intr, &intr_info, &isr_data); &clear_intr, &intr_info, &isr_data);
need_reset |= gr_intr_handle_exception_interrupts(g, &clear_intr, need_reset |= gr_intr_handle_exception_interrupts(g, &clear_intr,

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -34,8 +34,9 @@
void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status, void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
u32 error_type) u32 error_type)
{ {
nvgpu_report_pmu_err(g, NVGPU_ERR_MODULE_PMU, nvgpu_report_err_to_sdl(g, GPU_PMU_BAR0_ERROR_TIMEOUT);
GPU_PMU_BAR0_ERROR_TIMEOUT, error_type, bar0_status); nvgpu_err(g, "Falcon mem scrubbing timeout. status(0x%x), "
"error_type(0x%x)", bar0_status, error_type);
} }
/* PMU engine reset functions */ /* PMU engine reset functions */

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -156,6 +156,6 @@ void ga10b_bus_isr(struct gk20a *g)
bus_intr_0 & ~bus_intr_0_handled); bus_intr_0 & ~bus_intr_0_handled);
} }
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, err_type, bus_intr_0); nvgpu_report_err_to_sdl(g, err_type);
nvgpu_writel(g, bus_intr_0_r(), bus_intr_0); nvgpu_writel(g, bus_intr_0_r(), bus_intr_0);
} }

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -89,7 +89,6 @@ void gk20a_bus_isr(struct gk20a *g)
*/ */
err_type = GPU_HOST_PBUS_TIMEOUT_ERROR; err_type = GPU_HOST_PBUS_TIMEOUT_ERROR;
} }
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, err_type);
0, err_type, val);
nvgpu_writel(g, bus_intr_0_r(), val); nvgpu_writel(g, bus_intr_0_r(), val);
} }

View File

@@ -43,15 +43,13 @@ void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
/* clear blocking interrupts: they exibit broken behavior */ /* clear blocking interrupts: they exibit broken behavior */
if ((ce_intr & ce_intr_status_blockpipe_pending_f()) != 0U) { if ((ce_intr & ce_intr_status_blockpipe_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, nvgpu_report_err_to_sdl(g, GPU_CE_BLOCK_PIPE);
GPU_CE_BLOCK_PIPE, ce_intr);
nvgpu_err(g, "ce blocking pipe interrupt"); nvgpu_err(g, "ce blocking pipe interrupt");
clear_intr |= ce_intr_status_blockpipe_pending_f(); clear_intr |= ce_intr_status_blockpipe_pending_f();
} }
if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) { if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, nvgpu_report_err_to_sdl(g, GPU_CE_LAUNCH_ERROR);
GPU_CE_LAUNCH_ERROR, ce_intr);
nvgpu_err(g, "ce launch error interrupt"); nvgpu_err(g, "ce launch error interrupt");
clear_intr |= ce_intr_status_launcherr_pending_f(); clear_intr |= ce_intr_status_launcherr_pending_f();
} }

View File

@@ -1,7 +1,7 @@
/* /*
* Volta GPU series Copy Engine. * Volta GPU series Copy Engine.
* *
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -61,8 +61,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
* reset to get back to a working state. * reset to get back to a working state.
*/ */
if ((ce_intr & ce_intr_status_invalid_config_pending_f()) != 0U) { if ((ce_intr & ce_intr_status_invalid_config_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, nvgpu_report_err_to_sdl(g, GPU_CE_INVALID_CONFIG);
GPU_CE_INVALID_CONFIG, ce_intr);
nvgpu_err(g, "ce: inst %d: invalid config", inst_id); nvgpu_err(g, "ce: inst %d: invalid config", inst_id);
clear_intr |= ce_intr_status_invalid_config_reset_f(); clear_intr |= ce_intr_status_invalid_config_reset_f();
} }
@@ -74,8 +73,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
* reset before operations can start again, if not the entire GPU. * reset before operations can start again, if not the entire GPU.
*/ */
if ((ce_intr & ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) { if ((ce_intr & ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) {
nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, nvgpu_report_err_to_sdl(g, GPU_CE_METHOD_BUFFER_FAULT);
GPU_CE_METHOD_BUFFER_FAULT, ce_intr);
nvgpu_err(g, "ce: inst %d: mthd buffer fault", inst_id); nvgpu_err(g, "ce: inst %d: mthd buffer fault", inst_id);
clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f(); clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f();
} }

View File

@@ -511,11 +511,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
if ((niso_intr & if ((niso_intr &
fb_niso_intr_mmu_other_fault_notify_m()) != 0U) { fb_niso_intr_mmu_other_fault_notify_m()) != 0U) {
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_ERROR, nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
NULL, "sub-err: OTHER_FAULT_NOTIFY. "
fault_status, "fault_status(0x%x)", fault_status);
GPU_HUBMMU_OTHER_FAULT_NOTIFY);
gv11b_fb_handle_dropped_mmu_fault(g, fault_status); gv11b_fb_handle_dropped_mmu_fault(g, fault_status);
@@ -540,11 +539,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
if ((niso_intr & if ((niso_intr &
fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) { fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) {
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_ERROR, nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
NULL, "sub-err: NONREPLAYABLE_FAULT_OVERFLOW. "
fault_status, "fault_status(0x%x)", fault_status);
GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW);
gv11b_fb_handle_nonreplay_fault_overflow(g, gv11b_fb_handle_nonreplay_fault_overflow(g,
fault_status); fault_status);
@@ -565,11 +563,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
if ((niso_intr & if ((niso_intr &
fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) { fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) {
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_ERROR, nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
NULL, "sub-err: REPLAYABLE_FAULT_OVERFLOW. "
fault_status, "fault_status(0x%x)", fault_status);
GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW);
gv11b_fb_handle_replay_fault_overflow(g, gv11b_fb_handle_replay_fault_overflow(g,
fault_status); fault_status);

View File

@@ -1,7 +1,7 @@
/* /*
* GV11B ECC INTR * GV11B ECC INTR
* *
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -49,11 +49,9 @@ static void gv11b_fb_intr_handle_ecc_l2tlb_errs(struct gk20a *g,
BUG(); BUG();
} }
if ((ecc_status & uncorrected_error_mask) != 0U) { if ((ecc_status & uncorrected_error_mask) != 0U) {
nvgpu_report_fb_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED);
GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc sa data error. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
} }
} }
@@ -137,11 +135,9 @@ static void gv11b_fb_intr_handle_ecc_hubtlb_errs(struct gk20a *g,
} }
if ((ecc_status & if ((ecc_status &
fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) { fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) {
nvgpu_report_fb_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED);
GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc sa data error. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
} }
} }
@@ -228,15 +224,14 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
if ((ecc_status & if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m()) fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m())
!= 0U) { != 0U) {
nvgpu_report_fb_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED);
GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc pte data error. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pte data error");
} }
if ((ecc_status & if ((ecc_status &
fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) { fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error"); nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error"
"ecc_addr(0x%x)", ecc_addr);
/* This error is not expected to occur in gv11b and hence, /* This error is not expected to occur in gv11b and hence,
* this scenario is considered as a fatal error. * this scenario is considered as a fatal error.
*/ */
@@ -246,11 +241,9 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
if ((ecc_status & if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m()) fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m())
!= 0U) { != 0U) {
nvgpu_report_fb_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED);
GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc pde0 data error. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pde0 data error");
} }
} }

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -272,8 +272,7 @@ void ga10b_fifo_ctxsw_timeout_isr(struct gk20a *g,
continue; continue;
} }
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR);
0, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR, tsgid);
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms); recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -215,9 +215,7 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g)
continue; continue;
} }
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR);
0, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR,
tsgid);
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
recover = g->ops.tsg.check_ctxsw_timeout(tsg, recover = g->ops.tsg.check_ctxsw_timeout(tsg,

View File

@@ -294,8 +294,7 @@ static void ga10b_fifo_handle_bad_tsg(struct gk20a *g,
nvgpu_err(g, "runlist bad tsg error code not supported"); nvgpu_err(g, "runlist bad tsg error code not supported");
} }
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR);
0, GPU_HOST_PFIFO_SCHED_ERROR, bad_tsg_code);
/* id is unknown, preempt all runlists and do recovery */ /* id is unknown, preempt all runlists and do recovery */
/* TBD: nvgpu_rc_sched_error_bad_tsg(g); */ /* TBD: nvgpu_rc_sched_error_bad_tsg(g); */

View File

@@ -142,8 +142,7 @@ static u32 gk20a_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr)
} }
if ((fifo_intr & fifo_intr_0_fb_flush_timeout_pending_f()) != 0U) { if ((fifo_intr & fifo_intr_0_fb_flush_timeout_pending_f()) != 0U) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR);
0, GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR, 0);
nvgpu_err(g, "fifo fb flush timeout error"); nvgpu_err(g, "fifo fb flush timeout error");
handled |= fifo_intr_0_fb_flush_timeout_pending_f(); handled |= fifo_intr_0_fb_flush_timeout_pending_f();
} }

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -70,8 +70,7 @@ void gk20a_fifo_intr_handle_chsw_error(struct gk20a *g)
u32 intr; u32 intr;
intr = nvgpu_readl(g, fifo_intr_chsw_error_r()); intr = nvgpu_readl(g, fifo_intr_chsw_error_r());
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CHSW_ERROR);
0, GPU_HOST_PFIFO_CHSW_ERROR, intr);
nvgpu_err(g, "chsw: %08x", intr); nvgpu_err(g, "chsw: %08x", intr);
g->ops.gr.falcon.dump_stats(g); g->ops.gr.falcon.dump_stats(g);
nvgpu_writel(g, fifo_intr_chsw_error_r(), intr); nvgpu_writel(g, fifo_intr_chsw_error_r(), intr);

View File

@@ -132,8 +132,7 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g)
nvgpu_err(g, "fifo sched error code not supported"); nvgpu_err(g, "fifo sched error code not supported");
} }
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR);
0, GPU_HOST_PFIFO_SCHED_ERROR, sched_error);
if (sched_error == SCHED_ERROR_CODE_BAD_TSG) { if (sched_error == SCHED_ERROR_CODE_BAD_TSG) {
/* id is unknown, preempt all runlists and do recovery */ /* id is unknown, preempt all runlists and do recovery */
@@ -151,8 +150,7 @@ static u32 gv11b_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr)
if ((fifo_intr & fifo_intr_0_bind_error_pending_f()) != 0U) { if ((fifo_intr & fifo_intr_0_bind_error_pending_f()) != 0U) {
u32 bind_error = nvgpu_readl(g, fifo_intr_bind_error_r()); u32 bind_error = nvgpu_readl(g, fifo_intr_bind_error_r());
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_BIND_ERROR);
GPU_HOST_PFIFO_BIND_ERROR, bind_error);
nvgpu_err(g, "fifo bind error: 0x%08x", bind_error); nvgpu_err(g, "fifo bind error: 0x%08x", bind_error);
handled |= fifo_intr_0_bind_error_pending_f(); handled |= fifo_intr_0_bind_error_pending_f();
} }
@@ -163,17 +161,13 @@ static u32 gv11b_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr)
} }
if ((fifo_intr & fifo_intr_0_memop_timeout_pending_f()) != 0U) { if ((fifo_intr & fifo_intr_0_memop_timeout_pending_f()) != 0U) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR);
GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR, 0);
nvgpu_err(g, "fifo memop timeout error"); nvgpu_err(g, "fifo memop timeout error");
handled |= fifo_intr_0_memop_timeout_pending_f(); handled |= fifo_intr_0_memop_timeout_pending_f();
} }
if ((fifo_intr & fifo_intr_0_lb_error_pending_f()) != 0U) { if ((fifo_intr & fifo_intr_0_lb_error_pending_f()) != 0U) {
u32 lb_error = nvgpu_readl(g, fifo_intr_lb_error_r()); nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_LB_ERROR);
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0,
GPU_HOST_PFIFO_LB_ERROR, lb_error);
nvgpu_err(g, "fifo lb error"); nvgpu_err(g, "fifo lb error");
handled |= fifo_intr_0_lb_error_pending_f(); handled |= fifo_intr_0_lb_error_pending_f();
} }

View File

@@ -326,8 +326,9 @@ static void report_pbdma_error(struct gk20a *g, u32 pbdma_id,
err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR; err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR;
} }
if (err_type != GPU_HOST_INVALID_ERROR) { if (err_type != GPU_HOST_INVALID_ERROR) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_err(g, "pbdma_intr_0(%d)= 0x%08x ",
pbdma_id, err_type, pbdma_intr_0); pbdma_id, pbdma_intr_0);
nvgpu_report_err_to_sdl(g, err_type);
} }
return; return;
} }
@@ -536,8 +537,7 @@ bool ga10b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
recover = true; recover = true;
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, pbdma_id, nvgpu_report_err_to_sdl(g, GPU_HOST_PBDMA_HCE_ERROR);
GPU_HOST_PBDMA_HCE_ERROR, pbdma_intr_1);
if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",

View File

@@ -87,8 +87,8 @@ static void report_pbdma_error(struct gk20a *g, u32 pbdma_id,
err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR; err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR;
} }
if (err_type != GPU_HOST_INVALID_ERROR) { if (err_type != GPU_HOST_INVALID_ERROR) {
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, nvgpu_log_info(g, "pbdma id:%u", pbdma_id);
pbdma_id, err_type, pbdma_intr_0); nvgpu_report_err_to_sdl(g, err_type);
} }
return; return;
} }
@@ -190,8 +190,7 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
recover = true; recover = true;
nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, pbdma_id, nvgpu_report_err_to_sdl(g, GPU_HOST_PBDMA_HCE_ERROR);
GPU_HOST_PBDMA_HCE_ERROR, pbdma_intr_1);
if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",

View File

@@ -195,8 +195,7 @@ static u32 ga10b_gr_intr_check_gr_mme_fe1_exception(struct gk20a *g,
info_mthd = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd_r()); info_mthd = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd_r());
info_mthd2 = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd2_r()); info_mthd2 = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd2_r());
nvgpu_gr_intr_report_exception(g, 0, GPU_PGRAPH_MME_FE1_EXCEPTION, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MME_FE1_EXCEPTION);
mme_fe1_hww_esr, 0U);
nvgpu_err(g, "mme_fe1 exception: esr 0x%08x, info 0x%08x," nvgpu_err(g, "mme_fe1 exception: esr 0x%08x, info 0x%08x,"
"info_mthd 0x%08x, info_mthd2 0x%08x", "info_mthd 0x%08x, info_mthd2 0x%08x",
mme_fe1_hww_esr, info, info_mthd, info_mthd2); mme_fe1_hww_esr, info, info_mthd, info_mthd2);
@@ -366,31 +365,29 @@ void ga10b_gr_intr_enable_exceptions(struct gk20a *g,
} }
static void ga10b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, static void ga10b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
u32 ecc_status, u32 gpc, u32 correct_err, u32 uncorrect_err) u32 ecc_status, u32 gpc)
{ {
(void)correct_err;
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); nvgpu_err(g, "corrected ecc sa data error. "
"gpc_id(%d)", gpc);
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc sa data error"
0U, uncorrect_err); "gpc_id(%d)", gpc);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); nvgpu_err(g, "corrected ecc fa data error"
"gpc_id(%d)", gpc);
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc fa data error"
0U, uncorrect_err); "gpc_id(%d)", gpc);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
} }
} }
@@ -467,9 +464,7 @@ void ga10b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
ga10b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc, ga10b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc);
(u32)*corrected_err,
(u32)*uncorrected_err);
nvgpu_log(g, gpu_dbg_intr, nvgpu_log(g, gpu_dbg_intr,
"ecc error address: 0x%x", ecc_addr); "ecc error address: 0x%x", ecc_addr);
@@ -747,15 +742,13 @@ static void ga10b_gr_intr_report_tpc_sm_rams_ecc_err(struct gk20a *g,
for (i = 0U; i < ecc_status->err_count; i++) { for (i = 0U; i < ecc_status->err_count; i++) {
if (ecc_status->err_id[i] == GPU_SM_RAMS_ECC_CORRECTED) { if (ecc_status->err_id[i] == GPU_SM_RAMS_ECC_CORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_CORRECTED);
(gpc << SHIFT_8_BITS) | tpc, nvgpu_err(g, "sm_l1_tag_ecc_corrected. "
GPU_SM_L1_TAG_ECC_CORRECTED, 0, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter);
} else { } else {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_UNCORRECTED);
(gpc << SHIFT_8_BITS) | tpc, nvgpu_err(g, "sm_l1_tag_ecc_uncorrected. "
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
} }
} }

View File

@@ -106,9 +106,7 @@ u32 gm20b_gr_intr_check_gr_ssync_exception(struct gk20a *g, u32 exception)
g->ops.gr.intr.handle_ssync_hww(g, &ssync_esr); g->ops.gr.intr.handle_ssync_hww(g, &ssync_esr);
reset_gpc = 1U; reset_gpc = 1U;
} }
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SSYNC_EXCEPTION);
GPU_PGRAPH_SSYNC_EXCEPTION,
ssync_esr, 0);
} }
return reset_gpc; return reset_gpc;
} }
@@ -119,9 +117,7 @@ u32 gm20b_gr_intr_check_gr_mme_exception(struct gk20a *g, u32 exception)
u32 mme = nvgpu_readl(g, gr_mme_hww_esr_r()); u32 mme = nvgpu_readl(g, gr_mme_hww_esr_r());
u32 info = nvgpu_readl(g, gr_mme_hww_esr_info_r()); u32 info = nvgpu_readl(g, gr_mme_hww_esr_info_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MME_EXCEPTION);
GPU_PGRAPH_MME_EXCEPTION,
mme, 0);
nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x", nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
mme, info); mme, info);
#ifdef CONFIG_NVGPU_DGPU #ifdef CONFIG_NVGPU_DGPU
@@ -141,9 +137,7 @@ u32 gm20b_gr_intr_check_gr_sked_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_sked_m()) != 0U) { if ((exception & gr_exception_sked_m()) != 0U) {
u32 sked = nvgpu_readl(g, gr_sked_hww_esr_r()); u32 sked = nvgpu_readl(g, gr_sked_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SKED_EXCEPTION);
GPU_PGRAPH_SKED_EXCEPTION,
sked, 0);
nvgpu_err(g, "sked exception: esr 0x%08x", sked); nvgpu_err(g, "sked exception: esr 0x%08x", sked);
nvgpu_writel(g, gr_sked_hww_esr_r(), nvgpu_writel(g, gr_sked_hww_esr_r(),
gr_sked_hww_esr_reset_active_f()); gr_sked_hww_esr_reset_active_f());
@@ -158,10 +152,8 @@ static u32 gr_gm20b_intr_check_gr_be_crop_exception(struct gk20a *g,
if ((exception & gr_pri_be0_becs_be_exception_crop_m()) != 0U) { if ((exception & gr_pri_be0_becs_be_exception_crop_m()) != 0U) {
u32 crop = nvgpu_readl(g, gr_crop_hww_esr_r()); u32 crop = nvgpu_readl(g, gr_crop_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_BE_EXCEPTION);
GPU_PGRAPH_BE_EXCEPTION, nvgpu_err(g, "BE exception: crop exception: esr 0x%08x", crop);
crop, GPU_PGRAPH_BE_EXCEPTION_CROP);
nvgpu_err(g, "crop exception: esr 0x%08x", crop);
nvgpu_writel(g, gr_crop_hww_esr_r(), nvgpu_writel(g, gr_crop_hww_esr_r(),
gr_crop_hww_esr_reset_active_f()); gr_crop_hww_esr_reset_active_f());
return 1U; return 1U;
@@ -175,10 +167,8 @@ static u32 gr_gm20b_intr_check_gr_be_zrop_exception(struct gk20a *g,
if ((exception & gr_pri_be0_becs_be_exception_zrop_m()) != 0U) { if ((exception & gr_pri_be0_becs_be_exception_zrop_m()) != 0U) {
u32 zrop = nvgpu_readl(g, gr_zrop_hww_esr_r()); u32 zrop = nvgpu_readl(g, gr_zrop_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_BE_EXCEPTION);
GPU_PGRAPH_BE_EXCEPTION, nvgpu_err(g, "BE exception: zrop exception: esr 0x%08x", zrop);
zrop, GPU_PGRAPH_BE_EXCEPTION_ZROP);
nvgpu_err(g, "zrop exception: esr 0x%08x", zrop);
nvgpu_writel(g, gr_zrop_hww_esr_r(), nvgpu_writel(g, gr_zrop_hww_esr_r(),
gr_zrop_hww_esr_reset_active_f()); gr_zrop_hww_esr_reset_active_f());
return 1U; return 1U;
@@ -192,9 +182,7 @@ u32 gm20b_gr_intr_check_gr_fe_exception(struct gk20a *g, u32 exception)
u32 fe = nvgpu_readl(g, gr_fe_hww_esr_r()); u32 fe = nvgpu_readl(g, gr_fe_hww_esr_r());
u32 info = nvgpu_readl(g, gr_fe_hww_esr_info_r()); u32 info = nvgpu_readl(g, gr_fe_hww_esr_info_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_FE_EXCEPTION);
GPU_PGRAPH_FE_EXCEPTION,
fe, 0);
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x", nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
fe, info); fe, info);
nvgpu_writel(g, gr_fe_hww_esr_r(), nvgpu_writel(g, gr_fe_hww_esr_r(),
@@ -209,9 +197,7 @@ u32 gm20b_gr_intr_check_gr_memfmt_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_memfmt_m()) != 0U) { if ((exception & gr_exception_memfmt_m()) != 0U) {
u32 memfmt = nvgpu_readl(g, gr_memfmt_hww_esr_r()); u32 memfmt = nvgpu_readl(g, gr_memfmt_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MEMFMT_EXCEPTION);
GPU_PGRAPH_MEMFMT_EXCEPTION,
memfmt, 0);
nvgpu_err(g, "memfmt exception: esr %08x", memfmt); nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
nvgpu_writel(g, gr_memfmt_hww_esr_r(), nvgpu_writel(g, gr_memfmt_hww_esr_r(),
gr_memfmt_hww_esr_reset_active_f()); gr_memfmt_hww_esr_reset_active_f());
@@ -225,9 +211,7 @@ u32 gm20b_gr_intr_check_gr_pd_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_pd_m()) != 0U) { if ((exception & gr_exception_pd_m()) != 0U) {
u32 pd = nvgpu_readl(g, gr_pd_hww_esr_r()); u32 pd = nvgpu_readl(g, gr_pd_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_PD_EXCEPTION);
GPU_PGRAPH_PD_EXCEPTION,
pd, 0);
nvgpu_err(g, "pd exception: esr 0x%08x", pd); nvgpu_err(g, "pd exception: esr 0x%08x", pd);
nvgpu_writel(g, gr_pd_hww_esr_r(), nvgpu_writel(g, gr_pd_hww_esr_r(),
gr_pd_hww_esr_reset_active_f()); gr_pd_hww_esr_reset_active_f());
@@ -241,9 +225,7 @@ u32 gm20b_gr_intr_check_gr_scc_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_scc_m()) != 0U) { if ((exception & gr_exception_scc_m()) != 0U) {
u32 scc = nvgpu_readl(g, gr_scc_hww_esr_r()); u32 scc = nvgpu_readl(g, gr_scc_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SCC_EXCEPTION);
GPU_PGRAPH_SCC_EXCEPTION,
scc, 0);
nvgpu_err(g, "scc exception: esr 0x%08x", scc); nvgpu_err(g, "scc exception: esr 0x%08x", scc);
nvgpu_writel(g, gr_scc_hww_esr_r(), nvgpu_writel(g, gr_scc_hww_esr_r(),
gr_scc_hww_esr_reset_active_f()); gr_scc_hww_esr_reset_active_f());
@@ -257,9 +239,7 @@ u32 gm20b_gr_intr_check_gr_ds_exception(struct gk20a *g, u32 exception)
if ((exception & gr_exception_ds_m()) != 0U) { if ((exception & gr_exception_ds_m()) != 0U) {
u32 ds = nvgpu_readl(g, gr_ds_hww_esr_r()); u32 ds = nvgpu_readl(g, gr_ds_hww_esr_r());
nvgpu_gr_intr_report_exception(g, 0, nvgpu_report_err_to_sdl(g, GPU_PGRAPH_DS_EXCEPTION);
GPU_PGRAPH_DS_EXCEPTION,
ds, 0);
nvgpu_err(g, "ds exception: esr: 0x%08x", ds); nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
nvgpu_writel(g, gr_ds_hww_esr_r(), nvgpu_writel(g, gr_ds_hww_esr_r(),
gr_ds_hww_esr_reset_task_f()); gr_ds_hww_esr_reset_task_f());

View File

@@ -88,18 +88,12 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g)
fecs_ecc_status.uncorrected_delta); fecs_ecc_status.uncorrected_delta);
if (fecs_ecc_status.imem_corrected_err) { if (fecs_ecc_status.imem_corrected_err) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_IMEM_ECC_CORRECTED);
GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
nvgpu_err(g, "imem ecc error corrected - error count:%d", nvgpu_err(g, "imem ecc error corrected - error count:%d",
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
} }
if (fecs_ecc_status.imem_uncorrected_err) { if (fecs_ecc_status.imem_uncorrected_err) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED);
GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_err(g, "imem ecc error uncorrected - error count:%d", nvgpu_err(g, "imem ecc error uncorrected - error count:%d",
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
} }
@@ -112,10 +106,7 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g)
BUG(); BUG();
} }
if (fecs_ecc_status.dmem_uncorrected_err) { if (fecs_ecc_status.dmem_uncorrected_err) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED);
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_err(g, "dmem ecc error uncorrected - error count %d", nvgpu_err(g, "dmem ecc error uncorrected - error count %d",
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
} }
@@ -350,9 +341,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
} }
*uncorrected_err = nvgpu_safe_add_u32(*uncorrected_err, *uncorrected_err = nvgpu_safe_add_u32(*uncorrected_err,
gcc_l15_uncorrected_err_count_delta); gcc_l15_uncorrected_err_count_delta);
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc, nvgpu_report_err_to_sdl(g, GPU_GCC_L15_ECC_UNCORRECTED);
GPU_GCC_L15_ECC_UNCORRECTED,
0, *uncorrected_err);
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(), offset), gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(), offset),
0); 0);
@@ -364,11 +353,8 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
} }
static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
u32 ecc_status, u32 gpc, u32 ecc_status, u32 gpc)
u32 correct_err, u32 uncorrect_err)
{ {
(void)correct_err;
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) !=
0U) { 0U) {
@@ -381,10 +367,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
0U) { 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc);
0, uncorrect_err);
nvgpu_err(g, "uncorrected ecc sa data error");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) !=
@@ -398,10 +382,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status & if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
0U) { 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc);
0, uncorrect_err);
nvgpu_err(g, "uncorrected ecc fa data error");
} }
} }
@@ -482,8 +464,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
nvgpu_err(g, "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", nvgpu_err(g, "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x",
gpc, hww_esr); gpc, hww_esr);
gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc, gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc);
(u32)*corrected_err, (u32)*uncorrected_err);
nvgpu_err(g, "ecc error address: 0x%x", ecc_addr); nvgpu_err(g, "ecc error address: 0x%x", ecc_addr);
nvgpu_err(g, "ecc error count corrected: %d, uncorrected %d", nvgpu_err(g, "ecc error count corrected: %d, uncorrected %d",
@@ -491,22 +472,19 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
} }
static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g, static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g,
u32 ecc_status, u32 ecc_addr, u32 gpc, u32 ecc_status, u32 ecc_addr, u32 gpc)
u32 correct_err, u32 uncorrect_err)
{ {
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED);
gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, nvgpu_err(g, "imem ecc error corrected. "
ecc_addr, correct_err); "ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc);
nvgpu_err(g, "imem ecc error corrected");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED);
gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, nvgpu_err(g, "imem ecc error uncorrected. "
ecc_addr, uncorrect_err); "ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc);
nvgpu_err(g, "imem ecc error uncorrected");
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
@@ -518,10 +496,9 @@ static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g,
} }
if ((ecc_status & if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED);
gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, nvgpu_err(g, "dmem ecc error uncorrected. "
ecc_addr, uncorrect_err); "ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc);
nvgpu_err(g, "dmem ecc error uncorrected");
} }
} }
@@ -538,9 +515,7 @@ void gv11b_gr_intr_handle_gpc_prop_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g, hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_prop_hww_esr_r(), offset)); nvgpu_safe_add_u32(gr_gpc0_prop_hww_esr_r(), offset));
nvgpu_gr_intr_report_exception(g, (gpc << 8U), nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_PROP);
/* /*
* print additional diagnostic information. * print additional diagnostic information.
@@ -584,9 +559,7 @@ void gv11b_gr_intr_handle_gpc_zcull_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g, hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_zcull_hww_esr_r(), offset)); nvgpu_safe_add_u32(gr_gpc0_zcull_hww_esr_r(), offset));
nvgpu_gr_intr_report_exception(g, (gpc << 8U), nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_ZCULL);
/* clear the interrupt */ /* clear the interrupt */
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -610,9 +583,7 @@ void gv11b_gr_intr_handle_gpc_setup_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g, hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_setup_hww_esr_r(), offset)); nvgpu_safe_add_u32(gr_gpc0_setup_hww_esr_r(), offset));
nvgpu_gr_intr_report_exception(g, (gpc << 8U), nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_SETUP);
/* clear the interrupt */ /* clear the interrupt */
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -627,7 +598,7 @@ void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception) u32 gpc_exception)
{ {
u32 offset = nvgpu_gr_gpc_offset(g, gpc); u32 offset = nvgpu_gr_gpc_offset(g, gpc);
u32 hww_esr, sub_err_type; u32 hww_esr;
if (((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) == 0U) && if (((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) == 0U) &&
((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m()) ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m())
@@ -638,17 +609,7 @@ void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc,
hww_esr = nvgpu_readl(g, hww_esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_ppc0_pes_hww_esr_r(), offset)); nvgpu_safe_add_u32(gr_gpc0_ppc0_pes_hww_esr_r(), offset));
if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) != 0U) { nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);
sub_err_type = GPU_PGRAPH_GPC_GFX_EXCEPTION_PES0;
}
if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m()) != 0U) {
sub_err_type = GPU_PGRAPH_GPC_GFX_EXCEPTION_PES1;
}
nvgpu_gr_intr_report_exception(g, (gpc << 8U),
GPU_PGRAPH_GPC_GFX_EXCEPTION,
hww_esr, sub_err_type);
/* clear the interrupt */ /* clear the interrupt */
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -725,8 +686,7 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
nvgpu_err(g, "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); nvgpu_err(g, "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc, gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc);
(u32)*corrected_err, (u32)*uncorrected_err);
if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
nvgpu_err(g, "gpccs ecc counter overflow!"); nvgpu_err(g, "gpccs ecc counter overflow!");
@@ -753,9 +713,7 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
offset)); offset));
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "mpc hww esr 0x%08x", esr); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "mpc hww esr 0x%08x", esr);
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc), nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MPC_EXCEPTION);
GPU_PGRAPH_MPC_EXCEPTION,
esr, 0);
esr = nvgpu_readl(g, esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(), nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),
@@ -781,9 +739,7 @@ void gv11b_gr_intr_handle_tpc_pe_exception(struct gk20a *g, u32 gpc, u32 tpc)
esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_pe_hww_esr_r(), esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_pe_hww_esr_r(),
offset)); offset));
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc), nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION);
GPU_PGRAPH_GPC_GFX_EXCEPTION,
esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_TPC_PE);
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "pe hww esr 0x%08x", esr); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "pe hww esr 0x%08x", esr);
@@ -938,24 +894,21 @@ static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g,
for (i = 0U; i < ecc_status->err_count; i++) { for (i = 0U; i < ecc_status->err_count; i++) {
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_l1_tag_ecc_uncorrected "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_UNCORRECTED);
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED);
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED);
g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
} }
} }
@@ -973,10 +926,9 @@ static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g,
for (i = 0U; i < ecc_status->err_count; i++) { for (i = 0U; i < ecc_status->err_count; i++) {
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) { if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_l1_tag_ecc_corrected "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_L1_TAG_ECC_CORRECTED, 0, nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_CORRECTED);
g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
} }
} }
} }
@@ -1296,10 +1248,7 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
nvgpu_safe_add_u32( nvgpu_safe_add_u32(
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter, g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
lrf_uncorrected_err_count_delta); lrf_uncorrected_err_count_delta);
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_report_err_to_sdl(g, GPU_SM_LRF_ECC_UNCORRECTED);
(gpc << SHIFT_8_BITS) | tpc,
GPU_SM_LRF_ECC_UNCORRECTED, 0,
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset), gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset),
0U); 0U);
@@ -1431,10 +1380,7 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
nvgpu_safe_add_u32( nvgpu_safe_add_u32(
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter,
cbu_uncorrected_err_count_delta); cbu_uncorrected_err_count_delta);
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_report_err_to_sdl(g, GPU_SM_CBU_ECC_UNCORRECTED);
(gpc << SHIFT_8_BITS) | tpc,
GPU_SM_CBU_ECC_UNCORRECTED,
0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset), gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset),
0U); 0U);
@@ -1562,10 +1508,7 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
nvgpu_safe_add_u32( nvgpu_safe_add_u32(
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter,
l1_data_uncorrected_err_count_delta); l1_data_uncorrected_err_count_delta);
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_report_err_to_sdl(g, GPU_SM_L1_DATA_ECC_UNCORRECTED);
(gpc << SHIFT_8_BITS) | tpc,
GPU_SM_L1_DATA_ECC_UNCORRECTED,
0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
nvgpu_writel(g, nvgpu_safe_add_u32( nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset), gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset),
0U); 0U);
@@ -1588,31 +1531,27 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g,
for (i = 0U; i < ecc_status->err_count; i++) { for (i = 0U; i < ecc_status->err_count; i++) {
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected. "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED) { if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, nvgpu_err(g, "sm_icache_l1_predecode_ecc_uncorrected. "
(gpc << SHIFT_8_BITS) | tpc, "gpc_id(%d), tpc_id(%d)", gpc, tpc);
GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED);
0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter);
} }
} }
} }

View File

@@ -285,6 +285,9 @@
#include <nvgpu/grmgr.h> #include <nvgpu/grmgr.h>
#endif #endif
#include "hal/cic/mon/cic_gv11b.h"
#include <nvgpu/cic_mon.h>
static int ga10b_init_gpu_characteristics(struct gk20a *g) static int ga10b_init_gpu_characteristics(struct gk20a *g)
{ {
int err; int err;
@@ -1713,6 +1716,11 @@ static const struct gops_mssnvlink ga10b_ops_mssnvlink = {
}; };
#endif #endif
static const struct gops_cic_mon ga10b_ops_cic_mon = {
.init = gv11b_cic_mon_init,
.report_err = nvgpu_cic_mon_report_err_safety_services
};
int ga10b_init_hal(struct gk20a *g) int ga10b_init_hal(struct gk20a *g)
{ {
struct gpu_ops *gops = &g->ops; struct gpu_ops *gops = &g->ops;
@@ -1812,6 +1820,7 @@ int ga10b_init_hal(struct gk20a *g)
gops->tpc_pg = ga10b_ops_tpc_pg; gops->tpc_pg = ga10b_ops_tpc_pg;
#endif #endif
gops->grmgr = ga10b_ops_grmgr; gops->grmgr = ga10b_ops_grmgr;
gops->cic_mon = ga10b_ops_cic_mon;
gops->chip_init_gpu_characteristics = ga10b_init_gpu_characteristics; gops->chip_init_gpu_characteristics = ga10b_init_gpu_characteristics;
gops->get_litter_value = ga10b_get_litter_value; gops->get_litter_value = ga10b_get_litter_value;
gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

View File

@@ -411,11 +411,7 @@ static void ga10b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g,
nvgpu_wrapping_add_u32( nvgpu_wrapping_add_u32(
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter, g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter);
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {
@@ -446,11 +442,7 @@ static void ga10b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
nvgpu_wrapping_add_u32( nvgpu_wrapping_add_u32(
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
} }
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) {
@@ -516,11 +508,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter, g->ecc.ltc.ecc_sec_count[ltc][slice].counter,
corrected_delta); corrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
/* /*
* Using a SEC code will allow correction of an SBE (Single Bit * Using a SEC code will allow correction of an SBE (Single Bit
@@ -551,11 +539,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter, g->ecc.ltc.ecc_ded_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
} else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) { } else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) {
nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected"); nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");
@@ -564,11 +548,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter, g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
} else { } else {
nvgpu_err(g, "unsupported uncorrected dstg ecc error"); nvgpu_err(g, "unsupported uncorrected dstg ecc error");
BUG(); BUG();

View File

@@ -126,12 +126,9 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC, nvgpu_err(g, "tstg ecc error uncorrected. "
(ltc << 8U) | slice, "ecc_addr(0x%x)", ecc_addr);
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected");
} }
} }
@@ -148,12 +145,9 @@ void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter, g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter,
uncorrected_delta); uncorrected_delta);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC, nvgpu_err(g, "dstg be ecc error uncorrected. "
(ltc << 8U) | slice, "ecc_addr(0x%x)", ecc_addr);
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter);
nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected");
} }
} }
@@ -287,11 +281,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val); ecc_stats_reg_val);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
NVGPU_ERR_MODULE_LTC, nvgpu_err(g, "dstg ecc error corrected. "
(ltc << 8U) | slice, "ecc_addr(0x%x)", dstg_ecc_addr);
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, dstg_ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
/* /*
* Using a SEC code will allow correction of an SBE (Single Bit * Using a SEC code will allow correction of an SBE (Single Bit
@@ -335,11 +327,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val); ecc_stats_reg_val);
nvgpu_report_ecc_err(g, nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
NVGPU_ERR_MODULE_LTC, nvgpu_err(g, "dstg ecc error uncorrected. "
(ltc << 8U) | slice, "ecc_addr(0x%x)", dstg_ecc_addr);
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, dstg_ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
} }
nvgpu_writel(g, nvgpu_safe_add_u32(ltc_ltc0_lts0_intr_r(), offset), nvgpu_writel(g, nvgpu_safe_add_u32(ltc_ltc0_lts0_intr_r(), offset),

View File

@@ -521,11 +521,10 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g,
} }
#endif #endif
nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_ERROR, nvgpu_err(g, "sub_err_type = 0x%x, "
mmufault, "fault_status = 0x%x",
fault_status, sub_err_type, fault_status);
sub_err_type);
nvgpu_assert(get_indx < U32_MAX); nvgpu_assert(get_indx < U32_MAX);
nvgpu_assert(entries != 0U); nvgpu_assert(entries != 0U);

View File

@@ -141,24 +141,20 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_IMEM_ECC_CORRECTED);
GPU_PMU_FALCON_IMEM_ECC_CORRECTED, nvgpu_err(g, "falcon imem ecc error corrected. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
} }
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED);
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, nvgpu_err(g, "falcon imem ecc error uncorrected. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
ret = -EFAULT; ret = -EFAULT;
} }
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); nvgpu_err(g, "falcon dmem ecc error corrected");
/* This error is not expected to occur in gv11b and hence, /* This error is not expected to occur in gv11b and hence,
* this scenario is considered as a fatal error. * this scenario is considered as a fatal error.
*/ */
@@ -167,11 +163,9 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
} }
if ((ecc_status & if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED);
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, nvgpu_err(g, "falcon dmem ecc error uncorrected. "
ecc_addr, "ecc_addr(0x%x)", ecc_addr);
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
ret = -EFAULT; ret = -EFAULT;
} }

View File

@@ -1,7 +1,7 @@
/* /*
* GA10B priv ring * GA10B priv ring
* *
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -378,8 +378,7 @@ void ga10b_priv_ring_decode_error_code(struct gk20a *g, u32 error_code)
size_t lookup_table_size = 1; size_t lookup_table_size = 1;
size_t index = 0; size_t index = 0;
nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, 0, nvgpu_report_err_to_sdl(g, GPU_PRI_ACCESS_VIOLATION);
GPU_PRI_ACCESS_VIOLATION, 0, error_code);
err_code = pri_sys_pri_error_code_v(error_code); err_code = pri_sys_pri_error_code_v(error_code);
error_extra = pri_sys_pri_error_extra_v(error_code); error_extra = pri_sys_pri_error_extra_v(error_code);

View File

@@ -1,7 +1,7 @@
/* /*
* GP10B priv ring * GP10B priv ring
* *
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -71,8 +71,7 @@ void gp10b_priv_ring_decode_error_code(struct gk20a *g,
{ {
u32 error_type_index; u32 error_type_index;
nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, 0, nvgpu_report_err_to_sdl(g, GPU_PRI_ACCESS_VIOLATION);
GPU_PRI_ACCESS_VIOLATION, 0, error_code);
error_type_index = (error_code & 0x00000f00U) >> 8U; error_type_index = (error_code & 0x00000f00U) >> 8U;
error_code = error_code & 0xBADFf000U; error_code = error_code & 0xBADFf000U;

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -32,7 +32,6 @@
void ga10b_ptimer_isr(struct gk20a *g) void ga10b_ptimer_isr(struct gk20a *g)
{ {
u32 save0, save1, fecs_errcode = 0; u32 save0, save1, fecs_errcode = 0;
u32 inst = 0U;
u32 error_addr; u32 error_addr;
save0 = nvgpu_readl(g, timer_pri_timeout_save_0_r()); save0 = nvgpu_readl(g, timer_pri_timeout_save_0_r());
@@ -62,13 +61,7 @@ void ga10b_ptimer_isr(struct gk20a *g)
g->ops.priv_ring.decode_error_code(g, g->ops.priv_ring.decode_error_code(g,
fecs_errcode); fecs_errcode);
} }
/* FECS was the target of PRI access */
inst = 1U;
/* SAVE_0_ADDR cannot be used in this case */
error_addr = 0U;
} }
nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, nvgpu_report_err_to_sdl(g, GPU_PRI_TIMEOUT_ERROR);
inst, GPU_PRI_TIMEOUT_ERROR,
error_addr, fecs_errcode);
} }

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -32,7 +32,6 @@
void gk20a_ptimer_isr(struct gk20a *g) void gk20a_ptimer_isr(struct gk20a *g)
{ {
u32 save0, save1, fecs_errcode = 0; u32 save0, save1, fecs_errcode = 0;
u32 inst = 0U;
u32 error_addr; u32 error_addr;
save0 = gk20a_readl(g, timer_pri_timeout_save_0_r()); save0 = gk20a_readl(g, timer_pri_timeout_save_0_r());
@@ -55,10 +54,6 @@ void gk20a_ptimer_isr(struct gk20a *g)
g->ops.priv_ring.decode_error_code(g, g->ops.priv_ring.decode_error_code(g,
fecs_errcode); fecs_errcode);
} }
/* FECS was the target of PRI access */
inst = 1U;
/* SAVE_0_ADDR cannot be used in this case */
error_addr = 0U;
} }
nvgpu_err(g, "PRI timeout: ADR 0x%08x " nvgpu_err(g, "PRI timeout: ADR 0x%08x "
@@ -70,9 +65,7 @@ void gk20a_ptimer_isr(struct gk20a *g)
gk20a_writel(g, timer_pri_timeout_save_0_r(), 0); gk20a_writel(g, timer_pri_timeout_save_0_r(), 0);
gk20a_writel(g, timer_pri_timeout_save_1_r(), 0); gk20a_writel(g, timer_pri_timeout_save_1_r(), 0);
nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, nvgpu_report_err_to_sdl(g, GPU_PRI_TIMEOUT_ERROR);
inst, GPU_PRI_TIMEOUT_ERROR,
error_addr, fecs_errcode);
} }
#ifdef CONFIG_NVGPU_IOCTL_NON_FUSA #ifdef CONFIG_NVGPU_IOCTL_NON_FUSA

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -375,25 +375,14 @@ int nvgpu_cic_mon_get_err_desc(struct gk20a *g, u32 hw_unit_id,
* used by sub-units in nvgpu-rm and SDL unit. * used by sub-units in nvgpu-rm and SDL unit.
* *
* @param g [in] - The GPU driver struct. * @param g [in] - The GPU driver struct.
* @param err_info [in] - Error message. * @param err_id [in] - Error ID.
* @param err_size [in] - Size of the error message.
* @param is_critical [in] - Criticality of the error being reported.
* *
* On QNX: * - Reports the errors to Safety_Services.
* - Checks whether SDL is initialized.
* - Enqueues \a err_info into error message queue.
* - Signals the workqueue condition variable.
* - If the reported error is critical, invokes #nvgpu_sw_quiesce() api.
*
* on Linux:
* - NOP currently as safety services are absent in Linux
* *
* @return 0 in case of success, <0 in case of failure. * @return 0 in case of success, <0 in case of failure.
* @retval -EAGAIN if SDL not initialized.
* @retval -ENOMEM if sufficient memory is not available.
*/ */
int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
void *err_info, size_t err_size, bool is_critical); u32 err_id);
/** /**
* @brief Get the number of HW modules supported by CIC. * @brief Get the number of HW modules supported by CIC.

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -53,15 +53,11 @@ struct gops_cic_mon {
* @brief Report error to safety services. * @brief Report error to safety services.
* *
* @param g [in] Pointer to GPU driver struct. * @param g [in] Pointer to GPU driver struct.
* @param err_pkt [in] Pointer to struct holding err details. * @param err_id [in] Error ID.
* @param err_size [in] Size of err_pkt.
* @param is_critical [in] Flag indicating criticality of error.
* *
* @return 0 in case of success, < 0 in case of failure. * @return 0 in case of success, < 0 in case of failure.
*/ */
int (*report_err)(struct gk20a *g, int (*report_err)(struct gk20a *g, u32 err_id);
void *err_pkt, size_t err_size,
bool is_critical);
}; };
#endif/*NVGPU_GOPS_CIC_MON_H*/ #endif/*NVGPU_GOPS_CIC_MON_H*/
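
To make the narrowed contract concrete, a minimal sketch of how common code could reach Safety_Services through this op follows. It assumes only what the header above shows (a report_err hook taking the GPU struct and a 32-bit error ID) plus the ga10b HAL wiring earlier in this change; the forwarding helper's name and its NULL-check policy are illustrative and are not code from this commit, which may instead route through an intermediate helper such as the nvgpu_report_err_to_ss mentioned later in nvgpu_err.h.

#include <nvgpu/gk20a.h>
#include <nvgpu/gops/cic_mon.h>

/*
 * Illustrative helper (not from this commit): forwards a 32-bit error ID
 * to Safety_Services through the per-chip gops_cic_mon.report_err hook,
 * e.g. the one wired to nvgpu_cic_mon_report_err_safety_services for ga10b.
 */
static int example_report_err_via_cic_mon(struct gk20a *g, u32 err_id)
{
	if (g->ops.cic_mon.report_err == NULL) {
		/* Chip/OS did not provide a reporter; treat as a no-op. */
		return 0;
	}

	return g->ops.cic_mon.report_err(g, err_id);
}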

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -125,15 +125,11 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter * -# Increment g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter
* with uncorrected counter delta with * with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_ecc_err" with following parameters: * "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g * -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED * -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED" * "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED"
* -# ecc address read above
* -# g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter
* -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is * -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is
* set in ecc status, then it is considered as fatal error as it is not * set in ecc status, then it is considered as fatal error as it is not
* expected and call \ref BUG "BUG()". * expected and call \ref BUG "BUG()".
@@ -143,15 +139,11 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter * -# Increment g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter
* with uncorrected counter delta with * with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_ecc_err" with following parameters: * "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g * -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED * -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED" * "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED"
* -# ecc address read above
* -# g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter
* -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is * -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is
* set in ecc status, then it is considered as fatal error as it is not * set in ecc status, then it is considered as fatal error as it is not
* expected and call \ref BUG "BUG()". * expected and call \ref BUG "BUG()".
@@ -162,15 +154,11 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter * -# Increment g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter
* with corrected counter delta with * with corrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_ecc_err" with following parameters: * "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g * -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED * -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED
* "GPU_LTC_CACHE_DSTG_ECC_CORRECTED" * "GPU_LTC_CACHE_DSTG_ECC_CORRECTED"
* -# ecc address read above.
* -# g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter
* -# Flush the L2 cache by calling * -# Flush the L2 cache by calling
* \ref gops_mm_cache.l2_flush "gops_mm_cache.l2_flush". * \ref gops_mm_cache.l2_flush "gops_mm_cache.l2_flush".
* -# If it fails then call \ref BUG "BUG()". * -# If it fails then call \ref BUG "BUG()".
@@ -182,28 +170,20 @@ struct gops_ltc_intr {
* -# Increment g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter * -# Increment g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter
* with uncorrected counter delta with * with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_ecc_err" with following parameters: * "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g * -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED * -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED
* "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED" * "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED"
* -# ecc address read above.
* -# g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter
* -# Else if the ECC address corresponds to DSTG BE RAM: * -# Else if the ECC address corresponds to DSTG BE RAM:
* -# Increment g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter * -# Increment g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter
* with uncorrected counter delta with * with uncorrected counter delta with
* \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
* -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl
* "nvgpu_report_ecc_err" with following parameters: * "nvgpu_report_err_to_sdl" with following parameters:
* -# \a g * -# \a g
* -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
* -# (\a ltc << 8U) | \a slice
* -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED * -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
* "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED" * "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED"
* -# ecc address read above
* -# g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter
* -# Else call \ref BUG "BUG()" as this type of ECC error is not supported. * -# Else call \ref BUG "BUG()" as this type of ECC error is not supported.
* -# Clear the register ltc_ltc0_lts0_intr3_r() by writing the read value. * -# Clear the register ltc_ltc0_lts0_intr3_r() by writing the read value.
* - return 0 * - return 0

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -284,8 +284,8 @@ struct gops_priv_ring {
* "pri route error" * "pri route error"
* }; * };
* \endcode * \endcode
* - Invoke \ref #nvgpu_report_pri_err "nvgpu_report_pri_err" with parameters \a g, * - Invoke \ref #nvgpu_report_err_to_sdl "nvgpu_report_err_to_sdl" with parameters \a g,
* #NVGPU_ERR_MODULE_PRI, #GPU_PRI_ACCESS_VIOLATION, 0, error_code respectively. * #GPU_PRI_ACCESS_VIOLATION.
* - Declare a variable error_type_index and store the bits [8-12] as below. * - Declare a variable error_type_index and store the bits [8-12] as below.
* error_type_index will be used as an index to the above error tables. * error_type_index will be used as an index to the above error tables.
* error_code is also updated. * error_code is also updated.

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -91,14 +91,10 @@ struct gops_ptimer {
* - Clear timer_pri_timeout_save_0_r() and timer_pri_timeout_save_1_r() * - Clear timer_pri_timeout_save_0_r() and timer_pri_timeout_save_1_r()
* registers so that the next pri access error can be recorded. Write * registers so that the next pri access error can be recorded. Write
* 0 to these two registers to clear the previous error information. * 0 to these two registers to clear the previous error information.
* - Report the PRI_TIMEOUT_ERROR to SDL unit using \ref nvgpu_report_pri_err() * - Report the PRI_TIMEOUT_ERROR to SDL unit using \ref nvgpu_report_err_to_sdl()
* API. The inputs to \ref nvgpu_report_pri_err() are - * API. The inputs to \ref nvgpu_report_err_to_sdl() are -
* - g, * - g,
* - NVGPU_ERR_MODULE_PRI, * - GPU_PRI_TIMEOUT_ERROR.
* - inst,
* - GPU_PRI_TIMEOUT_ERROR,
* - error_addr,
* - fecs_errcode
*/ */
void (*isr)(struct gk20a *g); void (*isr)(struct gk20a *g);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -112,23 +112,6 @@ void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g,
void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g, void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data); struct nvgpu_gr_isr_data *isr_data);
/**
* @brief Report GR exceptions to qnx.sdl unit.
*
* @param g [in] Pointer to GPU driver struct.
* @param inst [in] Unit instance ID.
* @param err_type [in] Error type.
* @param status [in] Exception status value.
* @param sub_err_type [in] Sub error type.
*
* This function reports all GR exceptions to qnx.sdl unit.
*
* Other interrupt handling functions like #nvgpu_gr_intr_handle_fecs_error()
* call this function to report exceptions to qnx.sdl.
*/
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status, u32 sub_err_type);
/** /**
* @brief Translate context to channel ID. * @brief Translate context to channel ID.
* *
@@ -223,7 +206,6 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
* @see nvgpu_gr_intr_handle_notify_pending * @see nvgpu_gr_intr_handle_notify_pending
* @see nvgpu_gr_intr_handle_semaphore_pending * @see nvgpu_gr_intr_handle_semaphore_pending
* @see nvgpu_gr_intr_handle_sm_exception * @see nvgpu_gr_intr_handle_sm_exception
* @see nvgpu_gr_intr_report_exception
* @see nvgpu_gr_intr_set_error_notifier * @see nvgpu_gr_intr_set_error_notifier
*/ */
int nvgpu_gr_intr_stall_isr(struct gk20a *g); int nvgpu_gr_intr_stall_isr(struct gk20a *g);

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -1195,4 +1195,16 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid, void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value); u32 mailbox_value);
/**
* @brief This is a wrapper function to report errors from nvgpu units to SDL.
*
* @param g [in] - The GPU driver struct.
* @param err_id [in] - Error ID.
*
* Calls nvgpu_report_err_to_ss to report errors to Safety_Services.
*
* @return None
*/
void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id);
#endif /* NVGPU_NVGPU_ERR_H */ #endif /* NVGPU_NVGPU_ERR_H */
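
For reference, the call-site pattern this change converges on is shown in the condensed sketch below. The handler name and its parameters are placeholders; the counter, the error ID, nvgpu_err(), nvgpu_wrapping_add_u32() and nvgpu_report_err_to_sdl() are taken from the LTC hunks earlier in this change, and the includes for the counter helpers are omitted.

#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/log.h>

/*
 * Condensed call-site sketch (handler name is a placeholder borrowed from
 * the gv11b LTC tstg hunk): keep the local ECC bookkeeping and the detailed
 * nvgpu_err() print in nvgpu, and send only the 32-bit error ID to SDL.
 */
static void example_handle_tstg_uncorrected(struct gk20a *g, u32 ltc,
		u32 slice, u32 uncorrected_delta, u32 ecc_addr)
{
	/* Local statistics stay in nvgpu. */
	g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter =
		nvgpu_wrapping_add_u32(
			g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
			uncorrected_delta);

	/* Details are logged locally; they are no longer sent to SDL. */
	nvgpu_err(g, "tstg ecc error uncorrected. ecc_addr(0x%x)", ecc_addr);

	/* Only the 32-bit error ID goes to Safety_Services. */
	nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
}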

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA Corporation. All rights reserved. * Copyright (c) 2021-2022, NVIDIA Corporation. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -20,7 +20,11 @@
struct gk20a; struct gk20a;
int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
void *err_info, size_t err_size, bool is_critical) u32 metadata)
{ {
/**
* ToDo: Add MISC_EC API to report error.
* Decide on triggering SW quiesce for UE.
*/
return 0; return 0;
} }

View File

@@ -50,11 +50,9 @@ void nvgpu_ecc_sysfs_remove(struct gk20a *g)
#endif #endif
int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
void *err_info, size_t err_size, bool is_critical) u32 err_id)
{ {
(void)g; (void)g;
(void)err_info; (void)err_id;
(void)err_size;
(void)is_critical;
return 0; return 0;
} }