From 7dc013d2426b6280b7ee85bbcbcb5de2cf1714fa Mon Sep 17 00:00:00 2001 From: Rajesh Devaraj Date: Tue, 1 Feb 2022 16:04:37 +0000 Subject: [PATCH] gpu: nvgpu: merge error reporting apis In DRIVE 6.0, NvGPU is allowed to report only 32-bit metadata to Safety_Services. So, there is no need to have distinct APIs for reporting errors from units like GR, MM, FIFO to SDL unit. All these error reporting APIs will be replaced with a single API. To meet this objective, this patch does the following changes: - Replaces nvgpu_report_*_err with nvgpu_report_err_to_sdl. - Removes the reporting of error messages. - Replaces nvgpu_log() with nvgpu_err(), for error reporting. - Removes error reporting to Safety_Services from nvgpu_report_*_err. However, nvgpu_report_*_err APIs and their related files are not removed. During the creation of nvgpu-mon, they will be moved under nvgpu-rm, in debug builds. Note: - There will be a follow-up patch to fix error IDs. - As discussed in https://nvbugs/3491596 (comment #12), the high level expectation is to report only errors. 
JIRA NVGPU-7450 Change-Id: I428f2a9043086462754ac36a15edf6094985316f Signed-off-by: Rajesh Devaraj Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2662590 Tested-by: mobile promotions Reviewed-by: mobile promotions --- arch/nvgpu-common.yaml | 1 + arch/nvgpu-linux.yaml | 2 +- drivers/gpu/nvgpu/Makefile | 3 +- drivers/gpu/nvgpu/Makefile.sources | 1 + drivers/gpu/nvgpu/common/cic/mon/mon_ce.c | 11 +- drivers/gpu/nvgpu/common/cic/mon/mon_ctxsw.c | 9 - drivers/gpu/nvgpu/common/cic/mon/mon_ecc.c | 11 +- drivers/gpu/nvgpu/common/cic/mon/mon_gr.c | 20 -- drivers/gpu/nvgpu/common/cic/mon/mon_host.c | 11 +- drivers/gpu/nvgpu/common/cic/mon/mon_mmu.c | 9 - drivers/gpu/nvgpu/common/cic/mon/mon_pmu.c | 11 +- drivers/gpu/nvgpu/common/cic/mon/mon_pri.c | 11 +- .../gpu/nvgpu/common/cic/mon/mon_report_err.c | 42 +++++ drivers/gpu/nvgpu/common/gr/gr.c | 2 +- drivers/gpu/nvgpu/common/gr/gr_intr.c | 142 ++------------- drivers/gpu/nvgpu/common/pmu/pmu.c | 7 +- drivers/gpu/nvgpu/hal/bus/bus_ga10b.c | 4 +- drivers/gpu/nvgpu/hal/bus/bus_gk20a_fusa.c | 5 +- drivers/gpu/nvgpu/hal/ce/ce_gp10b_fusa.c | 6 +- drivers/gpu/nvgpu/hal/ce/ce_gv11b_fusa.c | 8 +- .../nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c | 27 ++- .../hal/fb/intr/fb_intr_ecc_gv11b_fusa.c | 37 ++-- .../nvgpu/hal/fifo/ctxsw_timeout_ga10b_fusa.c | 5 +- .../nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c | 6 +- .../gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c | 3 +- drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c | 3 +- .../gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c | 5 +- .../gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c | 14 +- drivers/gpu/nvgpu/hal/fifo/pbdma_ga10b_fusa.c | 8 +- drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c | 7 +- .../nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c | 45 ++--- .../nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c | 44 ++--- .../nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c | 171 ++++++------------ drivers/gpu/nvgpu/hal/init/hal_ga10b.c | 9 + .../nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c | 30 +-- 
.../nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c | 34 ++-- .../hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c | 9 +- drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c | 26 +-- .../hal/priv_ring/priv_ring_ga10b_fusa.c | 5 +- .../hal/priv_ring/priv_ring_gp10b_fusa.c | 5 +- .../gpu/nvgpu/hal/ptimer/ptimer_ga10b_fusa.c | 11 +- .../gpu/nvgpu/hal/ptimer/ptimer_gk20a_fusa.c | 11 +- drivers/gpu/nvgpu/include/nvgpu/cic_mon.h | 19 +- .../gpu/nvgpu/include/nvgpu/gops/cic_mon.h | 10 +- drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h | 42 ++--- .../gpu/nvgpu/include/nvgpu/gops/priv_ring.h | 6 +- drivers/gpu/nvgpu/include/nvgpu/gops/ptimer.h | 12 +- drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h | 20 +- drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 14 +- .../cic/{cic_stub.c => cic_report_err.c} | 8 +- drivers/gpu/nvgpu/os/posix/stubs.c | 6 +- 51 files changed, 310 insertions(+), 658 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/cic/mon/mon_report_err.c rename drivers/gpu/nvgpu/os/linux/cic/{cic_stub.c => cic_report_err.c} (80%) diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml index 66d20fa15..07fd88557 100644 --- a/arch/nvgpu-common.yaml +++ b/arch/nvgpu-common.yaml @@ -1135,6 +1135,7 @@ cic: common/cic/mon/mon_pri.c, common/cic/mon/mon_pmu.c, common/cic/mon/mon_mmu.c, + common/cic/mon/mon_report_err.c, common/cic/mon/cic_mon_priv.h, include/nvgpu/gops/cic_mon.h, include/nvgpu/cic_mon.h, diff --git a/arch/nvgpu-linux.yaml b/arch/nvgpu-linux.yaml index 072a69d1b..594feb17d 100644 --- a/arch/nvgpu-linux.yaml +++ b/arch/nvgpu-linux.yaml @@ -237,7 +237,7 @@ vm: os/linux/nvgpu_ivm.c ] cic: - sources: [ os/linux/cic/cic_stub.c ] + sources: [ os/linux/cic/cic_report_err.c ] # Group all the Linux headers for now. 
headers: diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 8dc0cadca..dfa830f05 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -342,6 +342,7 @@ nvgpu-y += \ common/cic/mon/mon_pri.o \ common/cic/mon/mon_pmu.o \ common/cic/mon/mon_mmu.o \ + common/cic/mon/mon_report_err.o \ common/cic/rm/rm_init.o \ common/cic/rm/rm_intr.o \ hal/bus/bus_gk20a.o \ @@ -472,7 +473,7 @@ nvgpu-y += \ os/linux/dt.o \ os/linux/ecc_sysfs.o \ os/linux/bsearch.o \ - os/linux/cic/cic_stub.o \ + os/linux/cic/cic_report_err.o \ os/linux/dmabuf_priv.o \ os/linux/power_ops.o diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index 4418a52eb..4701c615c 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -168,6 +168,7 @@ srcs += common/device.c \ common/cic/mon/mon_pri.c \ common/cic/mon/mon_pmu.c \ common/cic/mon/mon_mmu.c \ + common/cic/mon/mon_report_err.c \ common/cic/rm/rm_init.c \ common/cic/rm/rm_intr.c \ hal/init/hal_gv11b.c \ diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_ce.c b/drivers/gpu/nvgpu/common/cic/mon/mon_ce.c index c4f129856..cc172e935 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_ce.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_ce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -69,15 +69,6 @@ void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.ce_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report CE error: " - "inst=%u err_id=%u intr_info=%u", - inst, err_id, intr_info); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_ctxsw.c b/drivers/gpu/nvgpu/common/cic/mon/mon_ctxsw.c index 6895fa7cf..4c7fb2a42 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_ctxsw.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_ctxsw.c @@ -72,15 +72,6 @@ void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.ctxsw_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report CTXSW error: " - "err_id=%u, mailbox_val=%u", - err_id, err_info->mailbox_value); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_ecc.c b/drivers/gpu/nvgpu/common/cic/mon/mon_ecc.c index 4f42776d2..f7b0c2ace 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_ecc.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_ecc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -60,15 +60,6 @@ void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.ecc_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report ECC error: hw_unit=%u, inst=%u, " - "err_id=%u, err_addr=%llu, err_count=%llu", - hw_unit, inst, err_id, err_addr, err_count); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_gr.c b/drivers/gpu/nvgpu/common/cic/mon/mon_gr.c index 79b309258..61b05f0de 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_gr.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_gr.c @@ -99,26 +99,6 @@ void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, nvpgu_report_fill_err_info(hw_unit, &err_pkt, err_info); err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(sizeof(err_pkt.err_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - if (hw_unit == NVGPU_ERR_MODULE_SM) { - nvgpu_err(g, "Failed to report SM exception" - "gpc=%u, tpc=%u, sm=%u, esr_status=%x", - err_pkt.err_info.sm_info.gpc, - err_pkt.err_info.sm_info.tpc, - err_pkt.err_info.sm_info.sm, - err_pkt.err_info.sm_info.warp_esr_status); - } - if (hw_unit == NVGPU_ERR_MODULE_PGRAPH) { - nvgpu_err(g, "Failed to report PGRAPH" - "exception: inst=%u, err_id=%u, " - "status=%u", inst, err_id, - err_pkt.err_info.gr_info.status); - } - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_host.c b/drivers/gpu/nvgpu/common/cic/mon/mon_host.c index 234ea44f6..5f8d70eda 100644 --- 
a/drivers/gpu/nvgpu/common/cic/mon/mon_host.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_host.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -69,15 +69,6 @@ void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.host_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report HOST error: " - "inst=%u, err_id=%u, intr_info=%u", - inst, err_id, intr_info); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_mmu.c b/drivers/gpu/nvgpu/common/cic/mon/mon_mmu.c index b4b523689..7fe4de93e 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_mmu.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_mmu.c @@ -105,15 +105,6 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, u32 err_id, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.mmu_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report MMU fault: hw_unit=%u, " - "err_id=%u, sub_err_type=%u, status=%u", - hw_unit, err_id, sub_err_type, status); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_pmu.c b/drivers/gpu/nvgpu/common/cic/mon/mon_pmu.c index 4ffcda91e..aa66f4704 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_pmu.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_pmu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -65,15 +65,6 @@ void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.pmu_err_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report PMU error: " - "err_id=%u, sub_err_type=%u, status=%u", - err_id, sub_err_type, status); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_pri.c b/drivers/gpu/nvgpu/common/cic/mon/mon_pri.c index 60dbc6867..3508cc606 100644 --- a/drivers/gpu/nvgpu/common/cic/mon/mon_pri.c +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_pri.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -69,15 +69,6 @@ void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst, err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( sizeof(err_pkt.err_info.pri_info)); - if (g->ops.cic_mon.report_err != NULL) { - err = g->ops.cic_mon.report_err(g, (void *)&err_pkt, - sizeof(err_pkt), err_desc->is_critical); - if (err != 0) { - nvgpu_err(g, "Failed to report PRI error: " - "inst=%u, err_id=%u, err_code=%u", - inst, err_id, err_code); - } - } handle_report_failure: if (err != 0) { nvgpu_sw_quiesce(g); diff --git a/drivers/gpu/nvgpu/common/cic/mon/mon_report_err.c b/drivers/gpu/nvgpu/common/cic/mon/mon_report_err.c new file mode 100644 index 000000000..67475309c --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/mon/mon_report_err.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "cic_mon_priv.h" + +void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id) +{ + if (g->ops.cic_mon.report_err == NULL) { + return; + } + + if (g->ops.cic_mon.report_err(g, err_id) != 0) { + nvgpu_err(g, "Failed to report an error: err_id=%x", + err_id); + nvgpu_sw_quiesce(g); + } +} diff --git a/drivers/gpu/nvgpu/common/gr/gr.c b/drivers/gpu/nvgpu/common/gr/gr.c index c33f4a376..ce8b08a15 100644 --- a/drivers/gpu/nvgpu/common/gr/gr.c +++ b/drivers/gpu/nvgpu/common/gr/gr.c @@ -817,7 +817,7 @@ static int gr_init_ctxsw_falcon_support(struct gk20a *g, struct nvgpu_gr *gr) err = nvgpu_gr_falcon_init_ctxsw(g, gr->falcon); if (err != 0) { - gr_intr_report_ctxsw_error(g, GPU_FECS_CTXSW_INIT_ERROR, 0, 0); + nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_INIT_ERROR); return err; } diff --git a/drivers/gpu/nvgpu/common/gr/gr_intr.c b/drivers/gpu/nvgpu/common/gr/gr_intr.c index 94c3a99ab..4e8ca3d0e 100644 --- a/drivers/gpu/nvgpu/common/gr/gr_intr.c +++ b/drivers/gpu/nvgpu/common/gr/gr_intr.c @@ -42,21 +42,6 @@ #include "gr_intr_priv.h" -void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid, - u32 mailbox_value) -{ - struct ctxsw_err_info err_info; - - err_info.curr_ctx = g->ops.gr.falcon.get_current_ctx(g); - err_info.ctxsw_status0 = g->ops.gr.falcon.read_fecs_ctxsw_status0(g); - err_info.ctxsw_status1 = g->ops.gr.falcon.read_fecs_ctxsw_status1(g); - err_info.mailbox_value = mailbox_value; - err_info.chid = chid; - - nvgpu_report_ctxsw_err(g, NVGPU_ERR_MODULE_FECS, - err_type, (void *)&err_info); -} - static int gr_intr_handle_pending_tpc_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, struct nvgpu_channel 
*fault_ch, u32 *hww_global_esr) @@ -201,41 +186,6 @@ static void gr_intr_handle_class_error(struct gk20a *g, NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY); } -static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, - u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc) -{ - struct gr_sm_mcerr_info err_info; - struct nvgpu_channel *ch; - struct gr_err_info info; - u32 tsgid, chid, curr_ctx, inst = 0; - - tsgid = NVGPU_INVALID_TSG_ID; - curr_ctx = g->ops.gr.falcon.get_current_ctx(g); - if (curr_ctx == 0U) { - return; - } - - ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid); - chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID; - if (ch != NULL) { - nvgpu_channel_put(ch); - } - - (void) memset(&err_info, 0, sizeof(err_info)); - (void) memset(&info, 0, sizeof(info)); - err_info.curr_ctx = curr_ctx; - err_info.chid = chid; - err_info.tsgid = tsgid; - err_info.hww_warp_esr_pc = hww_warp_esr_pc; - err_info.hww_warp_esr_status = hww_warp_esr_status; - err_info.gpc = gpc; - err_info.tpc = tpc; - err_info.sm = sm; - info.sm_mcerr_info = &err_info; - nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst, - GPU_SM_MACHINE_CHECK_ERROR, &info, 0U); -} - /* Used by sw interrupt thread to translate current ctx to chid. * Also used by regops to translate current ctx to chid and tsgid. * For performance, we don't want to go through 128 channels every time. @@ -318,35 +268,6 @@ unlock: return ret_ch; } -void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst, - u32 err_type, u32 status, u32 sub_err_type) -{ - struct nvgpu_channel *ch = NULL; - struct gr_exception_info err_info; - struct gr_err_info info; - u32 tsgid, chid, curr_ctx; - - tsgid = NVGPU_INVALID_TSG_ID; - curr_ctx = g->ops.gr.falcon.get_current_ctx(g); - if (curr_ctx != 0U) { - ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid); - } - chid = (ch != NULL) ? 
ch->chid : NVGPU_INVALID_CHANNEL_ID; - if (ch != NULL) { - nvgpu_channel_put(ch); - } - - (void) memset(&err_info, 0, sizeof(err_info)); - (void) memset(&info, 0, sizeof(info)); - err_info.curr_ctx = curr_ctx; - err_info.chid = chid; - err_info.tsgid = tsgid; - err_info.status = status; - info.exception_info = &err_info; - nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH, - inst, err_type, &info, sub_err_type); -} - void nvgpu_gr_intr_set_error_notifier(struct gk20a *g, struct nvgpu_gr_isr_data *isr_data, u32 error_notifier) { @@ -372,22 +293,6 @@ static bool is_global_esr_error(u32 global_esr, u32 global_mask) return ((global_esr & ~global_mask) != 0U) ? true: false; } -static void gr_intr_report_warp_error(struct gk20a *g, u32 gpc, u32 tpc, - u32 sm, u32 global_esr, u32 warp_esr, - u32 global_mask, u32 offset) -{ - u64 hww_warp_esr_pc = 0; - - if (is_global_esr_error(global_esr, global_mask)) { - if (g->ops.gr.intr.get_sm_hww_warp_esr_pc != NULL) { - hww_warp_esr_pc = g->ops.gr.intr.get_sm_hww_warp_esr_pc(g, - offset); - } - gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr, - hww_warp_esr_pc); - } -} - #ifdef CONFIG_NVGPU_DEBUGGER static int gr_intr_sm_exception_warp_sync(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, @@ -454,8 +359,11 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, /* * Check and report any fatal warp errors. */ - gr_intr_report_warp_error(g, gpc, tpc, sm, global_esr, warp_esr, - global_mask, offset); + if (is_global_esr_error(global_esr, global_mask)) { + nvgpu_report_err_to_sdl(g, GPU_SM_MACHINE_CHECK_ERROR); + nvgpu_err(g, "sm machine check err. 
gpc_id(%d), tpc_id(%d), " + "offset(%d)", gpc, tpc, offset); + } (void)nvgpu_pg_elpg_protected_call(g, nvgpu_safe_cast_u32_to_s32( @@ -570,9 +478,7 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch, && (mailbox_value == g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val())) { - gr_intr_report_ctxsw_error(g, - GPU_FECS_CTXSW_CRC_MISMATCH, - chid, mailbox_value); + nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_CRC_MISMATCH); nvgpu_err(g, "ctxsw intr0 set by ucode, " "ctxsw checksum mismatch"); ret = -1; @@ -582,9 +488,7 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch, * recovery is initiated and error is reported to * 3LSS. */ - gr_intr_report_ctxsw_error(g, - GPU_FECS_FAULT_DURING_CTXSW, - chid, mailbox_value); + nvgpu_report_err_to_sdl(g, GPU_FECS_FAULT_DURING_CTXSW); nvgpu_err(g, "ctxsw intr0 set by ucode, error_code: 0x%08x", mailbox_value); @@ -593,17 +497,13 @@ int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch, } if (fecs_host_intr->fault_during_ctxsw_active) { - gr_intr_report_ctxsw_error(g, - GPU_FECS_FAULT_DURING_CTXSW, - chid, 0); + nvgpu_report_err_to_sdl(g, GPU_FECS_FAULT_DURING_CTXSW); nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid); ret = -1; } if (fecs_host_intr->watchdog_active) { - gr_intr_report_ctxsw_error(g, - GPU_FECS_CTXSW_WATCHDOG_TIMEOUT, - chid, 0); + nvgpu_report_err_to_sdl(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT); /* currently, recovery is not initiated */ nvgpu_err(g, "fecs watchdog triggered for channel %u, " "cannot ctxsw anymore !!", chid); @@ -861,7 +761,7 @@ static u32 gr_intr_handle_exception_interrupts(struct gk20a *g, } static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g, - u32 gr_intr, u32 *clear_intr, + u32 *clear_intr, struct nvgpu_gr_intr_info *intr_info, struct nvgpu_gr_isr_data *isr_data) { @@ -870,9 +770,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g, if (intr_info->illegal_notify != 0U) { 
nvgpu_err(g, "illegal notify pending"); - nvgpu_gr_intr_report_exception(g, 0U, - GPU_PGRAPH_ILLEGAL_ERROR, gr_intr, - GPU_PGRAPH_ILLEGAL_NOTIFY); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR); nvgpu_gr_intr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY); do_reset = 1U; @@ -881,9 +779,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g, if (intr_info->illegal_method != 0U) { if (gr_intr_handle_illegal_method(g, isr_data) != 0) { - nvgpu_gr_intr_report_exception(g, 0U, - GPU_PGRAPH_ILLEGAL_ERROR, gr_intr, - GPU_PGRAPH_ILLEGAL_METHOD); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR); do_reset = 1U; } @@ -891,9 +787,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g, } if (intr_info->illegal_class != 0U) { - nvgpu_gr_intr_report_exception(g, 0U, - GPU_PGRAPH_ILLEGAL_ERROR, gr_intr, - GPU_PGRAPH_ILLEGAL_CLASS); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR); nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x", isr_data->class_num, isr_data->offset); @@ -906,7 +800,7 @@ static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g, } static u32 gr_intr_handle_error_interrupts(struct gk20a *g, - u32 gr_intr, u32 *clear_intr, + u32 *clear_intr, struct nvgpu_gr_intr_info *intr_info, struct nvgpu_gr_isr_data *isr_data) { @@ -923,9 +817,7 @@ static u32 gr_intr_handle_error_interrupts(struct gk20a *g, } if (intr_info->class_error != 0U) { - nvgpu_gr_intr_report_exception(g, 0U, - GPU_PGRAPH_ILLEGAL_ERROR, gr_intr, - GPU_PGRAPH_CLASS_ERROR); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_ILLEGAL_ERROR); gr_intr_handle_class_error(g, isr_data); do_reset = 1U; *clear_intr &= ~intr_info->class_error; @@ -1073,10 +965,10 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g) gr_intr_handle_pending_interrupts(g, &clear_intr, &intr_info, &isr_data); - need_reset |= gr_intr_handle_illegal_interrupts(g, gr_intr, + need_reset |= gr_intr_handle_illegal_interrupts(g, &clear_intr, &intr_info, &isr_data); - need_reset |= 
gr_intr_handle_error_interrupts(g, gr_intr, + need_reset |= gr_intr_handle_error_interrupts(g, &clear_intr, &intr_info, &isr_data); need_reset |= gr_intr_handle_exception_interrupts(g, &clear_intr, diff --git a/drivers/gpu/nvgpu/common/pmu/pmu.c b/drivers/gpu/nvgpu/common/pmu/pmu.c index 2718aa77c..2e8ec7e55 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu.c +++ b/drivers/gpu/nvgpu/common/pmu/pmu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -34,8 +34,9 @@ void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status, u32 error_type) { - nvgpu_report_pmu_err(g, NVGPU_ERR_MODULE_PMU, - GPU_PMU_BAR0_ERROR_TIMEOUT, error_type, bar0_status); + nvgpu_report_err_to_sdl(g, GPU_PMU_BAR0_ERROR_TIMEOUT); + nvgpu_err(g, "Falcon mem scrubbing timeout. status(0x%x), " + "error_type(0x%x)", bar0_status, error_type); } /* PMU engine reset functions */ diff --git a/drivers/gpu/nvgpu/hal/bus/bus_ga10b.c b/drivers/gpu/nvgpu/hal/bus/bus_ga10b.c index 7c7759914..1b1a0401b 100644 --- a/drivers/gpu/nvgpu/hal/bus/bus_ga10b.c +++ b/drivers/gpu/nvgpu/hal/bus/bus_ga10b.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -156,6 +156,6 @@ void ga10b_bus_isr(struct gk20a *g) bus_intr_0 & ~bus_intr_0_handled); } - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, err_type, bus_intr_0); + nvgpu_report_err_to_sdl(g, err_type); nvgpu_writel(g, bus_intr_0_r(), bus_intr_0); } diff --git a/drivers/gpu/nvgpu/hal/bus/bus_gk20a_fusa.c b/drivers/gpu/nvgpu/hal/bus/bus_gk20a_fusa.c index d043b16e0..98dea07d0 100644 --- a/drivers/gpu/nvgpu/hal/bus/bus_gk20a_fusa.c +++ b/drivers/gpu/nvgpu/hal/bus/bus_gk20a_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -89,7 +89,6 @@ void gk20a_bus_isr(struct gk20a *g) */ err_type = GPU_HOST_PBUS_TIMEOUT_ERROR; } - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, err_type, val); + nvgpu_report_err_to_sdl(g, err_type); nvgpu_writel(g, bus_intr_0_r(), val); } diff --git a/drivers/gpu/nvgpu/hal/ce/ce_gp10b_fusa.c b/drivers/gpu/nvgpu/hal/ce/ce_gp10b_fusa.c index 313daf861..e08902cb0 100644 --- a/drivers/gpu/nvgpu/hal/ce/ce_gp10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ce/ce_gp10b_fusa.c @@ -43,15 +43,13 @@ void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base) /* clear blocking interrupts: they exibit broken behavior */ if ((ce_intr & ce_intr_status_blockpipe_pending_f()) != 0U) { - nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, - GPU_CE_BLOCK_PIPE, ce_intr); + nvgpu_report_err_to_sdl(g, GPU_CE_BLOCK_PIPE); nvgpu_err(g, "ce blocking pipe interrupt"); clear_intr |= ce_intr_status_blockpipe_pending_f(); } if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) { - nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, - 
GPU_CE_LAUNCH_ERROR, ce_intr); + nvgpu_report_err_to_sdl(g, GPU_CE_LAUNCH_ERROR); nvgpu_err(g, "ce launch error interrupt"); clear_intr |= ce_intr_status_launcherr_pending_f(); } diff --git a/drivers/gpu/nvgpu/hal/ce/ce_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/ce/ce_gv11b_fusa.c index e4d614b3d..234b0c83f 100644 --- a/drivers/gpu/nvgpu/hal/ce/ce_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ce/ce_gv11b_fusa.c @@ -1,7 +1,7 @@ /* * Volta GPU series Copy Engine. * - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -61,8 +61,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base) * reset to get back to a working state. */ if ((ce_intr & ce_intr_status_invalid_config_pending_f()) != 0U) { - nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, - GPU_CE_INVALID_CONFIG, ce_intr); + nvgpu_report_err_to_sdl(g, GPU_CE_INVALID_CONFIG); nvgpu_err(g, "ce: inst %d: invalid config", inst_id); clear_intr |= ce_intr_status_invalid_config_reset_f(); } @@ -74,8 +73,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base) * reset before operations can start again, if not the entire GPU. 
*/ if ((ce_intr & ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) { - nvgpu_report_ce_err(g, NVGPU_ERR_MODULE_CE, inst_id, - GPU_CE_METHOD_BUFFER_FAULT, ce_intr); + nvgpu_report_err_to_sdl(g, GPU_CE_METHOD_BUFFER_FAULT); nvgpu_err(g, "ce: inst %d: mthd buffer fault", inst_id); clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f(); } diff --git a/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c index cba4a230d..7114051cb 100644 --- a/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c @@ -511,11 +511,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr) if ((niso_intr & fb_niso_intr_mmu_other_fault_notify_m()) != 0U) { - nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR, - NULL, - fault_status, - GPU_HUBMMU_OTHER_FAULT_NOTIFY); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR); + nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. " + "sub-err: OTHER_FAULT_NOTIFY. " + "fault_status(0x%x)", fault_status); gv11b_fb_handle_dropped_mmu_fault(g, fault_status); @@ -540,11 +539,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr) if ((niso_intr & fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) { - nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR, - NULL, - fault_status, - GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR); + nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. " + "sub-err: NONREPLAYABLE_FAULT_OVERFLOW. 
" + "fault_status(0x%x)", fault_status); gv11b_fb_handle_nonreplay_fault_overflow(g, fault_status); @@ -565,11 +563,10 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr) if ((niso_intr & fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) { - nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR, - NULL, - fault_status, - GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR); + nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. " + "sub-err: REPLAYABLE_FAULT_OVERFLOW. " + "fault_status(0x%x)", fault_status); gv11b_fb_handle_replay_fault_overflow(g, fault_status); diff --git a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b_fusa.c index d04d9c8f8..935f20a7a 100644 --- a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b_fusa.c @@ -1,7 +1,7 @@ /* * GV11B ECC INTR * - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -49,11 +49,9 @@ static void gv11b_fb_intr_handle_ecc_l2tlb_errs(struct gk20a *g, BUG(); } if ((ecc_status & uncorrected_error_mask) != 0U) { - nvgpu_report_fb_ecc_err(g, - GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, - ecc_addr, - g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc sa data error. 
" + "ecc_addr(0x%x)", ecc_addr); } } @@ -137,11 +135,9 @@ static void gv11b_fb_intr_handle_ecc_hubtlb_errs(struct gk20a *g, } if ((ecc_status & fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) { - nvgpu_report_fb_ecc_err(g, - GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, - ecc_addr, - g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc sa data error. " + "ecc_addr(0x%x)", ecc_addr); } } @@ -228,15 +224,14 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g, if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m()) != 0U) { - nvgpu_report_fb_ecc_err(g, - GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, - ecc_addr, - g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pte data error"); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc pte data error. " + "ecc_addr(0x%x)", ecc_addr); } if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) { - nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error"); + nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error" + "ecc_addr(0x%x)", ecc_addr); /* This error is not expected to occur in gv11b and hence, * this scenario is considered as a fatal error. */ @@ -246,11 +241,9 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g, if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m()) != 0U) { - nvgpu_report_fb_ecc_err(g, - GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, - ecc_addr, - g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pde0 data error"); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc pde0 data error. 
" + "ecc_addr(0x%x)", ecc_addr); } } diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_ga10b_fusa.c index d81042b46..d43d984ac 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_ga10b_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -272,8 +272,7 @@ void ga10b_fifo_ctxsw_timeout_isr(struct gk20a *g, continue; } - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR, tsgid); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR); #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms); diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c index c3fc57e80..d0585ddc5 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -215,9 +215,7 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g) continue; } - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR, - tsgid); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR); #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT recover = g->ops.tsg.check_ctxsw_timeout(tsg, diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c index 2f9f4086a..a977b3ac1 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c @@ -294,8 +294,7 @@ static void ga10b_fifo_handle_bad_tsg(struct gk20a *g, nvgpu_err(g, "runlist bad tsg error code not supported"); } - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, GPU_HOST_PFIFO_SCHED_ERROR, bad_tsg_code); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR); /* id is unknown, preempt all runlists and do recovery */ /* TBD: nvgpu_rc_sched_error_bad_tsg(g); */ diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c index 8becce208..ab038143a 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a.c @@ -142,8 +142,7 @@ static u32 gk20a_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr) } if ((fifo_intr & fifo_intr_0_fb_flush_timeout_pending_f()) != 0U) { - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR, 0); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR); nvgpu_err(g, "fifo fb flush timeout error"); handled |= fifo_intr_0_fb_flush_timeout_pending_f(); } diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c index 5c2ac20ee..ee44b2475 100644 --- 
a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gk20a_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -70,8 +70,7 @@ void gk20a_fifo_intr_handle_chsw_error(struct gk20a *g) u32 intr; intr = nvgpu_readl(g, fifo_intr_chsw_error_r()); - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, GPU_HOST_PFIFO_CHSW_ERROR, intr); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_CHSW_ERROR); nvgpu_err(g, "chsw: %08x", intr); g->ops.gr.falcon.dump_stats(g); nvgpu_writel(g, fifo_intr_chsw_error_r(), intr); diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c index f80660062..9c9815461 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c @@ -132,8 +132,7 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g) nvgpu_err(g, "fifo sched error code not supported"); } - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - 0, GPU_HOST_PFIFO_SCHED_ERROR, sched_error); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_SCHED_ERROR); if (sched_error == SCHED_ERROR_CODE_BAD_TSG) { /* id is unknown, preempt all runlists and do recovery */ @@ -151,8 +150,7 @@ static u32 gv11b_fifo_intr_handle_errors(struct gk20a *g, u32 fifo_intr) if ((fifo_intr & fifo_intr_0_bind_error_pending_f()) != 0U) { u32 bind_error = nvgpu_readl(g, fifo_intr_bind_error_r()); - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, - GPU_HOST_PFIFO_BIND_ERROR, bind_error); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_BIND_ERROR); nvgpu_err(g, "fifo bind error: 0x%08x", bind_error); handled |= fifo_intr_0_bind_error_pending_f(); } @@ -163,17 +161,13 @@ static u32 gv11b_fifo_intr_handle_errors(struct 
gk20a *g, u32 fifo_intr) } if ((fifo_intr & fifo_intr_0_memop_timeout_pending_f()) != 0U) { - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, - GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR, 0); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR); nvgpu_err(g, "fifo memop timeout error"); handled |= fifo_intr_0_memop_timeout_pending_f(); } if ((fifo_intr & fifo_intr_0_lb_error_pending_f()) != 0U) { - u32 lb_error = nvgpu_readl(g, fifo_intr_lb_error_r()); - - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, 0, - GPU_HOST_PFIFO_LB_ERROR, lb_error); + nvgpu_report_err_to_sdl(g, GPU_HOST_PFIFO_LB_ERROR); nvgpu_err(g, "fifo lb error"); handled |= fifo_intr_0_lb_error_pending_f(); } diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/pbdma_ga10b_fusa.c index 59f0f7297..a70926d9a 100644 --- a/drivers/gpu/nvgpu/hal/fifo/pbdma_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_ga10b_fusa.c @@ -326,8 +326,9 @@ static void report_pbdma_error(struct gk20a *g, u32 pbdma_id, err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR; } if (err_type != GPU_HOST_INVALID_ERROR) { - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - pbdma_id, err_type, pbdma_intr_0); + nvgpu_err(g, "pbdma_intr_0(%d)= 0x%08x ", + pbdma_id, pbdma_intr_0); + nvgpu_report_err_to_sdl(g, err_type); } return; } @@ -536,8 +537,7 @@ bool ga10b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1, recover = true; - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, pbdma_id, - GPU_HOST_PBDMA_HCE_ERROR, pbdma_intr_1); + nvgpu_report_err_to_sdl(g, GPU_HOST_PBDMA_HCE_ERROR); if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c index 2547477e2..11fd4529f 100644 --- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c @@ -87,8 +87,8 @@ static void 
report_pbdma_error(struct gk20a *g, u32 pbdma_id, err_type = GPU_HOST_PBDMA_SIGNATURE_ERROR; } if (err_type != GPU_HOST_INVALID_ERROR) { - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, - pbdma_id, err_type, pbdma_intr_0); + nvgpu_log_info(g, "pbdma id:%u", pbdma_id); + nvgpu_report_err_to_sdl(g, err_type); } return; } @@ -190,8 +190,7 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1, recover = true; - nvgpu_report_host_err(g, NVGPU_ERR_MODULE_HOST, pbdma_id, - GPU_HOST_PBDMA_HCE_ERROR, pbdma_intr_1); + nvgpu_report_err_to_sdl(g, GPU_HOST_PBDMA_HCE_ERROR); if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c index 0efbf6287..8ab02b88c 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c @@ -195,8 +195,7 @@ static u32 ga10b_gr_intr_check_gr_mme_fe1_exception(struct gk20a *g, info_mthd = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd_r()); info_mthd2 = nvgpu_readl(g, gr_mme_fe1_hww_esr_info_mthd2_r()); - nvgpu_gr_intr_report_exception(g, 0, GPU_PGRAPH_MME_FE1_EXCEPTION, - mme_fe1_hww_esr, 0U); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MME_FE1_EXCEPTION); nvgpu_err(g, "mme_fe1 exception: esr 0x%08x, info 0x%08x," "info_mthd 0x%08x, info_mthd2 0x%08x", mme_fe1_hww_esr, info, info_mthd, info_mthd2); @@ -366,31 +365,29 @@ void ga10b_gr_intr_enable_exceptions(struct gk20a *g, } static void ga10b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, - u32 ecc_status, u32 gpc, u32 correct_err, u32 uncorrect_err) + u32 ecc_status, u32 gpc) { - (void)correct_err; - if ((ecc_status & gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { - nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); + nvgpu_err(g, "corrected ecc sa data error. 
" + "gpc_id(%d)", gpc); } if ((ecc_status & gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, - 0U, uncorrect_err); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); + nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc sa data error" + "gpc_id(%d)", gpc); } if ((ecc_status & gr_gpc0_mmu0_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { - nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); + nvgpu_err(g, "corrected ecc fa data error" + "gpc_id(%d)", gpc); } if ((ecc_status & gr_gpc0_mmu0_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, - 0U, uncorrect_err); - nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); + nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc fa data error" + "gpc_id(%d)", gpc); } } @@ -467,9 +464,7 @@ void ga10b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, nvgpu_log(g, gpu_dbg_intr, "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); - ga10b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc, - (u32)*corrected_err, - (u32)*uncorrected_err); + ga10b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc); nvgpu_log(g, gpu_dbg_intr, "ecc error address: 0x%x", ecc_addr); @@ -747,15 +742,13 @@ static void ga10b_gr_intr_report_tpc_sm_rams_ecc_err(struct gk20a *g, for (i = 0U; i < ecc_status->err_count; i++) { if (ecc_status->err_id[i] == GPU_SM_RAMS_ECC_CORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_TAG_ECC_CORRECTED, 0, - g->ecc.gr.sm_rams_ecc_corrected_err_count[gpc][tpc].counter); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_CORRECTED); + nvgpu_err(g, "sm_l1_tag_ecc_corrected. 
" + "gpc_id(%d), tpc_id(%d)", gpc, tpc); } else { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_UNCORRECTED); + nvgpu_err(g, "sm_l1_tag_ecc_uncorrected. " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); } } } diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c index e3ffb7def..9b9f87726 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c @@ -106,9 +106,7 @@ u32 gm20b_gr_intr_check_gr_ssync_exception(struct gk20a *g, u32 exception) g->ops.gr.intr.handle_ssync_hww(g, &ssync_esr); reset_gpc = 1U; } - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_SSYNC_EXCEPTION, - ssync_esr, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SSYNC_EXCEPTION); } return reset_gpc; } @@ -119,9 +117,7 @@ u32 gm20b_gr_intr_check_gr_mme_exception(struct gk20a *g, u32 exception) u32 mme = nvgpu_readl(g, gr_mme_hww_esr_r()); u32 info = nvgpu_readl(g, gr_mme_hww_esr_info_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_MME_EXCEPTION, - mme, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MME_EXCEPTION); nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x", mme, info); #ifdef CONFIG_NVGPU_DGPU @@ -141,9 +137,7 @@ u32 gm20b_gr_intr_check_gr_sked_exception(struct gk20a *g, u32 exception) if ((exception & gr_exception_sked_m()) != 0U) { u32 sked = nvgpu_readl(g, gr_sked_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_SKED_EXCEPTION, - sked, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SKED_EXCEPTION); nvgpu_err(g, "sked exception: esr 0x%08x", sked); nvgpu_writel(g, gr_sked_hww_esr_r(), gr_sked_hww_esr_reset_active_f()); @@ -158,10 +152,8 @@ static u32 gr_gm20b_intr_check_gr_be_crop_exception(struct gk20a *g, if ((exception & gr_pri_be0_becs_be_exception_crop_m()) != 0U) { u32 
crop = nvgpu_readl(g, gr_crop_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_BE_EXCEPTION, - crop, GPU_PGRAPH_BE_EXCEPTION_CROP); - nvgpu_err(g, "crop exception: esr 0x%08x", crop); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_BE_EXCEPTION); + nvgpu_err(g, "BE exception: crop exception: esr 0x%08x", crop); nvgpu_writel(g, gr_crop_hww_esr_r(), gr_crop_hww_esr_reset_active_f()); return 1U; @@ -175,10 +167,8 @@ static u32 gr_gm20b_intr_check_gr_be_zrop_exception(struct gk20a *g, if ((exception & gr_pri_be0_becs_be_exception_zrop_m()) != 0U) { u32 zrop = nvgpu_readl(g, gr_zrop_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_BE_EXCEPTION, - zrop, GPU_PGRAPH_BE_EXCEPTION_ZROP); - nvgpu_err(g, "zrop exception: esr 0x%08x", zrop); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_BE_EXCEPTION); + nvgpu_err(g, "BE exception: zrop exception: esr 0x%08x", zrop); nvgpu_writel(g, gr_zrop_hww_esr_r(), gr_zrop_hww_esr_reset_active_f()); return 1U; @@ -192,9 +182,7 @@ u32 gm20b_gr_intr_check_gr_fe_exception(struct gk20a *g, u32 exception) u32 fe = nvgpu_readl(g, gr_fe_hww_esr_r()); u32 info = nvgpu_readl(g, gr_fe_hww_esr_info_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_FE_EXCEPTION, - fe, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_FE_EXCEPTION); nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x", fe, info); nvgpu_writel(g, gr_fe_hww_esr_r(), @@ -209,9 +197,7 @@ u32 gm20b_gr_intr_check_gr_memfmt_exception(struct gk20a *g, u32 exception) if ((exception & gr_exception_memfmt_m()) != 0U) { u32 memfmt = nvgpu_readl(g, gr_memfmt_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_MEMFMT_EXCEPTION, - memfmt, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MEMFMT_EXCEPTION); nvgpu_err(g, "memfmt exception: esr %08x", memfmt); nvgpu_writel(g, gr_memfmt_hww_esr_r(), gr_memfmt_hww_esr_reset_active_f()); @@ -225,9 +211,7 @@ u32 gm20b_gr_intr_check_gr_pd_exception(struct gk20a *g, u32 exception) if ((exception & gr_exception_pd_m()) != 0U) { u32 
pd = nvgpu_readl(g, gr_pd_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_PD_EXCEPTION, - pd, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_PD_EXCEPTION); nvgpu_err(g, "pd exception: esr 0x%08x", pd); nvgpu_writel(g, gr_pd_hww_esr_r(), gr_pd_hww_esr_reset_active_f()); @@ -241,9 +225,7 @@ u32 gm20b_gr_intr_check_gr_scc_exception(struct gk20a *g, u32 exception) if ((exception & gr_exception_scc_m()) != 0U) { u32 scc = nvgpu_readl(g, gr_scc_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_SCC_EXCEPTION, - scc, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_SCC_EXCEPTION); nvgpu_err(g, "scc exception: esr 0x%08x", scc); nvgpu_writel(g, gr_scc_hww_esr_r(), gr_scc_hww_esr_reset_active_f()); @@ -257,9 +239,7 @@ u32 gm20b_gr_intr_check_gr_ds_exception(struct gk20a *g, u32 exception) if ((exception & gr_exception_ds_m()) != 0U) { u32 ds = nvgpu_readl(g, gr_ds_hww_esr_r()); - nvgpu_gr_intr_report_exception(g, 0, - GPU_PGRAPH_DS_EXCEPTION, - ds, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_DS_EXCEPTION); nvgpu_err(g, "ds exception: esr: 0x%08x", ds); nvgpu_writel(g, gr_ds_hww_esr_r(), gr_ds_hww_esr_reset_task_f()); diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c index 8d118e3c8..a615f07ba 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c @@ -88,18 +88,12 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g) fecs_ecc_status.uncorrected_delta); if (fecs_ecc_status.imem_corrected_err) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, - GPU_FECS_FALCON_IMEM_ECC_CORRECTED, - fecs_ecc_status.ecc_addr, - g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); + nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_IMEM_ECC_CORRECTED); nvgpu_err(g, "imem ecc error corrected - error count:%d", g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); } if (fecs_ecc_status.imem_uncorrected_err) { - nvgpu_report_ecc_err(g, 
NVGPU_ERR_MODULE_FECS, 0, - GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, - fecs_ecc_status.ecc_addr, - g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); + nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED); nvgpu_err(g, "imem ecc error uncorrected - error count:%d", g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); } @@ -112,10 +106,7 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g) BUG(); } if (fecs_ecc_status.dmem_uncorrected_err) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, - GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, - fecs_ecc_status.ecc_addr, - g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); + nvgpu_report_err_to_sdl(g, GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED); nvgpu_err(g, "dmem ecc error uncorrected - error count %d", g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); } @@ -350,9 +341,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc, } *uncorrected_err = nvgpu_safe_add_u32(*uncorrected_err, gcc_l15_uncorrected_err_count_delta); - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc, - GPU_GCC_L15_ECC_UNCORRECTED, - 0, *uncorrected_err); + nvgpu_report_err_to_sdl(g, GPU_GCC_L15_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(), offset), 0); @@ -364,11 +353,8 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc, } static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, - u32 ecc_status, u32 gpc, - u32 correct_err, u32 uncorrect_err) + u32 ecc_status, u32 gpc) { - (void)correct_err; - if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { @@ -381,10 +367,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, - 0, uncorrect_err); - nvgpu_err(g, "uncorrected ecc sa data error"); + 
nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc); } if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != @@ -398,10 +382,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, - 0, uncorrect_err); - nvgpu_err(g, "uncorrected ecc fa data error"); + nvgpu_report_err_to_sdl(g, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc); } } @@ -482,8 +464,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, nvgpu_err(g, "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); - gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc, - (u32)*corrected_err, (u32)*uncorrected_err); + gv11b_gr_intr_report_gpcmmu_ecc_err(g, ecc_status, gpc); nvgpu_err(g, "ecc error address: 0x%x", ecc_addr); nvgpu_err(g, "ecc error count corrected: %d, uncorrected %d", @@ -491,22 +472,19 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, } static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g, - u32 ecc_status, u32 ecc_addr, u32 gpc, - u32 correct_err, u32 uncorrect_err) + u32 ecc_status, u32 ecc_addr, u32 gpc) { if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, - gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, - ecc_addr, correct_err); - nvgpu_err(g, "imem ecc error corrected"); + nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED); + nvgpu_err(g, "imem ecc error corrected" + "ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, - gpc, 
GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, - ecc_addr, uncorrect_err); - nvgpu_err(g, "imem ecc error uncorrected"); + nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED); + nvgpu_err(g, "imem ecc error uncorrected" + "ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { @@ -518,10 +496,9 @@ static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g, } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, - gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, - ecc_addr, uncorrect_err); - nvgpu_err(g, "dmem ecc error uncorrected"); + nvgpu_report_err_to_sdl(g, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED); + nvgpu_err(g, "dmem ecc error uncorrected" + "ecc_addr(0x%x), gpc_id(%d)", ecc_addr, gpc); } } @@ -538,9 +515,7 @@ void gv11b_gr_intr_handle_gpc_prop_exception(struct gk20a *g, u32 gpc, hww_esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_prop_hww_esr_r(), offset)); - nvgpu_gr_intr_report_exception(g, (gpc << 8U), - GPU_PGRAPH_GPC_GFX_EXCEPTION, - hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_PROP); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION); /* * print additional diagnostic information. 
@@ -584,9 +559,7 @@ void gv11b_gr_intr_handle_gpc_zcull_exception(struct gk20a *g, u32 gpc, hww_esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_zcull_hww_esr_r(), offset)); - nvgpu_gr_intr_report_exception(g, (gpc << 8U), - GPU_PGRAPH_GPC_GFX_EXCEPTION, - hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_ZCULL); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION); /* clear the interrupt */ nvgpu_writel(g, nvgpu_safe_add_u32( @@ -610,9 +583,7 @@ void gv11b_gr_intr_handle_gpc_setup_exception(struct gk20a *g, u32 gpc, hww_esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_setup_hww_esr_r(), offset)); - nvgpu_gr_intr_report_exception(g, (gpc << 8U), - GPU_PGRAPH_GPC_GFX_EXCEPTION, - hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_SETUP); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION); /* clear the interrupt */ nvgpu_writel(g, nvgpu_safe_add_u32( @@ -627,7 +598,7 @@ void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc, u32 gpc_exception) { u32 offset = nvgpu_gr_gpc_offset(g, gpc); - u32 hww_esr, sub_err_type; + u32 hww_esr; if (((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) == 0U) && ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m()) @@ -638,17 +609,7 @@ void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc, hww_esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_ppc0_pes_hww_esr_r(), offset)); - if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) != 0U) { - sub_err_type = GPU_PGRAPH_GPC_GFX_EXCEPTION_PES0; - } - - if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m()) != 0U) { - sub_err_type = GPU_PGRAPH_GPC_GFX_EXCEPTION_PES1; - } - - nvgpu_gr_intr_report_exception(g, (gpc << 8U), - GPU_PGRAPH_GPC_GFX_EXCEPTION, - hww_esr, sub_err_type); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION); /* clear the interrupt */ nvgpu_writel(g, nvgpu_safe_add_u32( @@ -725,8 +686,7 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, nvgpu_err(g, "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); - 
gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc, - (u32)*corrected_err, (u32)*uncorrected_err); + gv11b_gr_intr_report_gpccs_ecc_err(g, ecc_status, ecc_addr, gpc); if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) { nvgpu_err(g, "gpccs ecc counter overflow!"); @@ -753,9 +713,7 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc) offset)); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "mpc hww esr 0x%08x", esr); - nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc), - GPU_PGRAPH_MPC_EXCEPTION, - esr, 0); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_MPC_EXCEPTION); esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(), @@ -781,9 +739,7 @@ void gv11b_gr_intr_handle_tpc_pe_exception(struct gk20a *g, u32 gpc, u32 tpc) esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_pe_hww_esr_r(), offset)); - nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc), - GPU_PGRAPH_GPC_GFX_EXCEPTION, - esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_TPC_PE); + nvgpu_report_err_to_sdl(g, GPU_PGRAPH_GPC_GFX_EXCEPTION); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "pe hww esr 0x%08x", esr); @@ -938,24 +894,21 @@ static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g, for (i = 0U; i < ecc_status->err_count; i++) { if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_l1_tag_ecc_uncorrected " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected " + 
"gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED); } } } @@ -973,10 +926,9 @@ static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g, for (i = 0U; i < ecc_status->err_count; i++) { if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_TAG_ECC_CORRECTED, 0, - g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_l1_tag_ecc_corrected " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_TAG_ECC_CORRECTED); } } } @@ -1296,10 +1248,7 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc nvgpu_safe_add_u32( g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter, lrf_uncorrected_err_count_delta); - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_LRF_ECC_UNCORRECTED, 0, - g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); + nvgpu_report_err_to_sdl(g, GPU_SM_LRF_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset), 0U); @@ -1431,10 +1380,7 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc nvgpu_safe_add_u32( g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter, cbu_uncorrected_err_count_delta); - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_CBU_ECC_UNCORRECTED, - 0, 
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_report_err_to_sdl(g, GPU_SM_CBU_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset), 0U); @@ -1562,10 +1508,7 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 nvgpu_safe_add_u32( g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter, l1_data_uncorrected_err_count_delta); - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_L1_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_report_err_to_sdl(g, GPU_SM_L1_DATA_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset), 0U); @@ -1588,31 +1531,27 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g, for (i = 0U; i < ecc_status->err_count; i++) { if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. 
" + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected. " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, - (gpc << SHIFT_8_BITS) | tpc, - GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, - 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); + nvgpu_err(g, "sm_icache_l1_predecode_ecc_uncorrected. " + "gpc_id(%d), tpc_id(%d)", gpc, tpc); + nvgpu_report_err_to_sdl(g, GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED); } } } diff --git a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c index af8b79c4c..f09fed7f7 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c @@ -285,6 +285,9 @@ #include #endif +#include "hal/cic/mon/cic_gv11b.h" +#include + static int ga10b_init_gpu_characteristics(struct gk20a *g) { int err; @@ -1713,6 +1716,11 @@ static const struct gops_mssnvlink ga10b_ops_mssnvlink = { }; #endif +static const struct gops_cic_mon ga10b_ops_cic_mon = { + .init = gv11b_cic_mon_init, + .report_err = nvgpu_cic_mon_report_err_safety_services +}; + int ga10b_init_hal(struct gk20a *g) { struct gpu_ops *gops = &g->ops; @@ -1812,6 +1820,7 @@ int ga10b_init_hal(struct gk20a *g) gops->tpc_pg = ga10b_ops_tpc_pg; #endif gops->grmgr = ga10b_ops_grmgr; + gops->cic_mon = ga10b_ops_cic_mon; gops->chip_init_gpu_characteristics = ga10b_init_gpu_characteristics; gops->get_litter_value = ga10b_get_litter_value; gops->semaphore_wakeup = 
nvgpu_channel_semaphore_wakeup; diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c index 54936384b..12cb614d2 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_ga10b_fusa.c @@ -411,11 +411,7 @@ static void ga10b_ltc_intr_handle_rstg_ecc_interrupts(struct gk20a *g, nvgpu_wrapping_add_u32( g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr, - g->ecc.ltc.rstg_ecc_parity_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { @@ -446,11 +442,7 @@ static void ga10b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g, nvgpu_wrapping_add_u32( g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, - g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { @@ -516,11 +508,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.ecc_sec_count[ltc][slice].counter, corrected_delta); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr, - g->ecc.ltc.ecc_sec_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_CORRECTED); /* * Using a SEC code will allow correction of an SBE (Single Bit @@ -551,11 +539,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.ecc_ded_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_ecc_err(g, - 
NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr, - g->ecc.ltc.ecc_ded_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED); } else if (ga10b_ltc_intr_is_dstg_be_ram(ecc_addr)) { nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected"); @@ -564,11 +548,7 @@ static void ga10b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr, - g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED); } else { nvgpu_err(g, "unsupported uncorrected dstg ecc error"); BUG(); diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c index e7127531d..ab86a2a02 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c @@ -126,12 +126,9 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, - g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter); - nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED); + nvgpu_err(g, "tstg ecc error uncorrected. 
" + "ecc_addr(0x%x)", ecc_addr); } } @@ -148,12 +145,9 @@ void gv11b_ltc_intr_handle_dstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr, - g->ecc.ltc.dstg_be_ecc_parity_count[ltc][slice].counter); - nvgpu_log(g, gpu_dbg_intr, "dstg be ecc error uncorrected"); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED); + nvgpu_err(g, "dstg be ecc error uncorrected. " + "ecc_addr(0x%x)", ecc_addr); } } @@ -287,11 +281,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ecc_stats_reg_val); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_DSTG_ECC_CORRECTED, dstg_ecc_addr, - g->ecc.ltc.ecc_sec_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_CORRECTED); + nvgpu_err(g, "dstg ecc error corrected. " + "ecc_addr(0x%x)", dstg_ecc_addr); /* * Using a SEC code will allow correction of an SBE (Single Bit @@ -335,11 +327,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ecc_stats_reg_val); - nvgpu_report_ecc_err(g, - NVGPU_ERR_MODULE_LTC, - (ltc << 8U) | slice, - GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, dstg_ecc_addr, - g->ecc.ltc.ecc_ded_count[ltc][slice].counter); + nvgpu_report_err_to_sdl(g, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED); + nvgpu_err(g, "dstg ecc error uncorrected. 
" + "ecc_addr(0x%x)", dstg_ecc_addr); } nvgpu_writel(g, nvgpu_safe_add_u32(ltc_ltc0_lts0_intr_r(), offset), diff --git a/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c index 5f978f979..4542ca081 100644 --- a/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c @@ -521,11 +521,10 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g, } #endif - nvgpu_report_mmu_err(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR, - mmufault, - fault_status, - sub_err_type); + nvgpu_report_err_to_sdl(g, GPU_HUBMMU_PAGE_FAULT_ERROR); + nvgpu_err(g, "sub_er_type = 0x%x, " + "fault_status = 0x%x", + sub_err_type, fault_status); nvgpu_assert(get_indx < U32_MAX); nvgpu_assert(entries != 0U); diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c index d7b48bf2d..a4854c2ac 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b_fusa.c @@ -141,24 +141,20 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, - GPU_PMU_FALCON_IMEM_ECC_CORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); + nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_IMEM_ECC_CORRECTED); + nvgpu_err(g, "falcon imem ecc error corrected. 
" + "ecc_addr(0x%x)", ecc_addr); } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, - GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); + nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED); + nvgpu_err(g, "falcon imem ecc error uncorrected. " + "ecc_addr(0x%x)", ecc_addr); ret = -EFAULT; } if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); + nvgpu_err(g, "falcon dmem ecc error corrected"); /* This error is not expected to occur in gv11b and hence, * this scenario is considered as a fatal error. */ @@ -167,11 +163,9 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, - GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, - ecc_addr, - g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); - nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); + nvgpu_report_err_to_sdl(g, GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED); + nvgpu_err(g, "falcon dmem ecc error uncorrected. " + "ecc_addr(0x%x)", ecc_addr); ret = -EFAULT; } diff --git a/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c index 6c52be5c7..ae38ebb5d 100644 --- a/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_ga10b_fusa.c @@ -1,7 +1,7 @@ /* * GA10B priv ring * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -378,8 +378,7 @@ void ga10b_priv_ring_decode_error_code(struct gk20a *g, u32 error_code) size_t lookup_table_size = 1; size_t index = 0; - nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, 0, - GPU_PRI_ACCESS_VIOLATION, 0, error_code); + nvgpu_report_err_to_sdl(g, GPU_PRI_ACCESS_VIOLATION); err_code = pri_sys_pri_error_code_v(error_code); error_extra = pri_sys_pri_error_extra_v(error_code); diff --git a/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c b/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c index 95e75adc4..c304783a3 100644 --- a/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/priv_ring/priv_ring_gp10b_fusa.c @@ -1,7 +1,7 @@ /* * GP10B priv ring * - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -71,8 +71,7 @@ void gp10b_priv_ring_decode_error_code(struct gk20a *g, { u32 error_type_index; - nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, 0, - GPU_PRI_ACCESS_VIOLATION, 0, error_code); + nvgpu_report_err_to_sdl(g, GPU_PRI_ACCESS_VIOLATION); error_type_index = (error_code & 0x00000f00U) >> 8U; error_code = error_code & 0xBADFf000U; diff --git a/drivers/gpu/nvgpu/hal/ptimer/ptimer_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/ptimer/ptimer_ga10b_fusa.c index 07bdc7c7a..e7bef49e9 100644 --- a/drivers/gpu/nvgpu/hal/ptimer/ptimer_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ptimer/ptimer_ga10b_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -32,7 +32,6 @@ void ga10b_ptimer_isr(struct gk20a *g) { u32 save0, save1, fecs_errcode = 0; - u32 inst = 0U; u32 error_addr; save0 = nvgpu_readl(g, timer_pri_timeout_save_0_r()); @@ -62,13 +61,7 @@ void ga10b_ptimer_isr(struct gk20a *g) g->ops.priv_ring.decode_error_code(g, fecs_errcode); } - /* FECS was the target of PRI access */ - inst = 1U; - /* SAVE_0_ADDR cannot be used in this case */ - error_addr = 0U; } - nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, - inst, GPU_PRI_TIMEOUT_ERROR, - error_addr, fecs_errcode); + nvgpu_report_err_to_sdl(g, GPU_PRI_TIMEOUT_ERROR); } diff --git a/drivers/gpu/nvgpu/hal/ptimer/ptimer_gk20a_fusa.c b/drivers/gpu/nvgpu/hal/ptimer/ptimer_gk20a_fusa.c index 9d8e350a9..330aa761a 100644 --- a/drivers/gpu/nvgpu/hal/ptimer/ptimer_gk20a_fusa.c +++ b/drivers/gpu/nvgpu/hal/ptimer/ptimer_gk20a_fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -32,7 +32,6 @@ void gk20a_ptimer_isr(struct gk20a *g) { u32 save0, save1, fecs_errcode = 0; - u32 inst = 0U; u32 error_addr; save0 = gk20a_readl(g, timer_pri_timeout_save_0_r()); @@ -55,10 +54,6 @@ void gk20a_ptimer_isr(struct gk20a *g) g->ops.priv_ring.decode_error_code(g, fecs_errcode); } - /* FECS was the target of PRI access */ - inst = 1U; - /* SAVE_0_ADDR cannot be used in this case */ - error_addr = 0U; } nvgpu_err(g, "PRI timeout: ADR 0x%08x " @@ -70,9 +65,7 @@ void gk20a_ptimer_isr(struct gk20a *g) gk20a_writel(g, timer_pri_timeout_save_0_r(), 0); gk20a_writel(g, timer_pri_timeout_save_1_r(), 0); - nvgpu_report_pri_err(g, NVGPU_ERR_MODULE_PRI, - inst, GPU_PRI_TIMEOUT_ERROR, - error_addr, fecs_errcode); + nvgpu_report_err_to_sdl(g, GPU_PRI_TIMEOUT_ERROR); } #ifdef CONFIG_NVGPU_IOCTL_NON_FUSA diff --git a/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h b/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h index e6ebd0d0b..7dd780ef8 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h +++ b/drivers/gpu/nvgpu/include/nvgpu/cic_mon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -375,25 +375,14 @@ int nvgpu_cic_mon_get_err_desc(struct gk20a *g, u32 hw_unit_id, * used by sub-units in nvgpu-rm and SDL unit. * * @param g [in] - The GPU driver struct. - * @param err_info [in] - Error message. - * @param err_size [in] - Size of the error message. - * @param is_critical [in] - Criticality of the error being reported. + * @param err_id [in] - Error ID. * - * On QNX: - * - Checks whether SDL is initialized. - * - Enqueues \a err_info into error message queue. 
- * - Signals the workqueue condition variable. - * - If the reported error is critical, invokes #nvgpu_sw_quiesce() api. - * - * on Linux: - * - NOP currently as safety services are absent in Linux + * - Reports the errors to Safety_Services. * * @return 0 in case of success, <0 in case of failure. - * @retval -EAGAIN if SDL not initialized. - * @retval -ENOMEM if sufficient memory is not available. */ int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, - void *err_info, size_t err_size, bool is_critical); + u32 err_id); /** * @brief Get the number of HW modules supported by CIC. diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/cic_mon.h b/drivers/gpu/nvgpu/include/nvgpu/gops/cic_mon.h index 241817623..1387a9b73 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/cic_mon.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/cic_mon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -53,15 +53,11 @@ struct gops_cic_mon { * @brief Report error to safety services. * * @param g [in] Pointer to GPU driver struct. - * @param err_pkt [in] Pointer to struct holding err details. - * @param err_size [in] Size of err_pkt. - * @param is_critical [in] Flag indicating criticality of error. + * @param err_id [in] Error ID. * * @return 0 in case of success, < 0 in case of failure. 
*/ - int (*report_err)(struct gk20a *g, - void *err_pkt, size_t err_size, - bool is_critical); + int (*report_err)(struct gk20a *g, u32 err_id); }; #endif/*NVGPU_GOPS_CIC_MON_H*/ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h b/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h index ce3d63db3..66b1eb9f6 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -125,15 +125,11 @@ struct gops_ltc_intr { * -# Increment g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter * with uncorrected counter delta with * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". - * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err - * "nvgpu_report_ecc_err" with following parameters: + * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl + * "nvgpu_report_err_to_sdl" with following parameters: * -# \a g - * -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC" - * -# (\a ltc << 8U) | \a slice * -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED * "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED" - * -# ecc address read above - * -# g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter * -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is * set in ecc status, then it is considered as fatal error as it is not * expected and call \ref BUG "BUG()". @@ -143,15 +139,11 @@ struct gops_ltc_intr { * -# Increment g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter * with uncorrected counter delta with * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". 
- * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err - * "nvgpu_report_ecc_err" with following parameters: + * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl + * "nvgpu_report_err_to_sdl" with following parameters: * -# \a g - * -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC" - * -# (\a ltc << 8U) | \a slice * -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED * "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED" - * -# ecc address read above - * -# g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter * -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is * set in ecc status, then it is considered as fatal error as it is not * expected and call \ref BUG "BUG()". @@ -162,15 +154,11 @@ struct gops_ltc_intr { * -# Increment g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter * with corrected counter delta with * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". - * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err - * "nvgpu_report_ecc_err" with following parameters: + * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl + * "nvgpu_report_err_to_sdl" with following parameters: * -# \a g - * -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC" - * -# (\a ltc << 8U) | \a slice * -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED * "GPU_LTC_CACHE_DSTG_ECC_CORRECTED" - * -# ecc address read above. - * -# g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter * -# Flush the L2 cache by calling * \ref gops_mm_cache.l2_flush "gops_mm_cache.l2_flush". * -# If it fails then call \ref BUG "BUG()". @@ -182,28 +170,20 @@ struct gops_ltc_intr { * -# Increment g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter * with uncorrected counter delta with * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". 
- * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err - * "nvgpu_report_ecc_err" with following parameters: + * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl + * "nvgpu_report_err_to_sdl" with following parameters: * -# \a g - * -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC" - * -# (\a ltc << 8U) | \a slice * -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED * "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED" - * -# ecc address read above. - * -# g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter * -# Else if the ECC address correspongs to DSTG BE RAM: * -# Increment g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter * with uncorrected counter delta with * \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32". - * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err - * "nvgpu_report_ecc_err" with following parameters: + * -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_err_to_sdl + * "nvgpu_report_err_to_sdl" with following parameters: * -# \a g - * -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC" - * -# (\a ltc << 8U) | \a slice * -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED * "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED" - * -# ecc address read above - * -# g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter * -# Else call \ref BUG "BUG()" as this type of ECC error is not supported. * -# Clear the register ltc_ltc0_lts0_intr3_r() by writing the read value. * - return 0 diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/priv_ring.h b/drivers/gpu/nvgpu/include/nvgpu/gops/priv_ring.h index 50a9a31fb..a09a2ace1 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/priv_ring.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/priv_ring.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -284,8 +284,8 @@ struct gops_priv_ring { * "pri route error" * }; * \endcode - * - Invoke \ref #nvgpu_report_pri_err "nvgpu_report_pri_err" with parameters \a g, - * #NVGPU_ERR_MODULE_PRI, #GPU_PRI_ACCESS_VIOLATION, 0, error_code respectively. + * - Invoke \ref #nvgpu_report_err_to_sdl "nvgpu_report_err_to_sdl" with parameters \a g, + * #GPU_PRI_ACCESS_VIOLATION, respectively. * - Declare a variable error_type_index and store the bits [8-12] as below. * error_type_index will be used as an index to the above error tables. * error_code is also updated. diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/ptimer.h b/drivers/gpu/nvgpu/include/nvgpu/gops/ptimer.h index 8da04e8b6..c2aa14d62 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops/ptimer.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/ptimer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -91,14 +91,10 @@ struct gops_ptimer { * - Clear timer_pri_timeout_save_0_r() and timer_pri_timeout_save_1_r() * registers so that the next pri access error can be recorded. Write * 0 to these two registers to clear the previous error information. - * - Report the PRI_TIMEOUT_ERROR to SDL unit using \ref nvgpu_report_pri_err() - * API. The inputs to \ref nvgpu_report_pri_err() are - + * - Report the PRI_TIMEOUT_ERROR to SDL unit using \ref nvgpu_report_err_to_sdl() + * API. The inputs to \ref nvgpu_report_err_to_sdl() are - * - g, - * - NVGPU_ERR_MODULE_PRI, - * - inst, - * - GPU_PRI_TIMEOUT_ERROR, - * - error_addr, - * - fecs_errcode + * - GPU_PRI_TIMEOUT_ERROR. 
*/ void (*isr)(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h b/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h index ba4c1e7a3..f20824e94 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/gr_intr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -112,23 +112,6 @@ void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g, void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g, struct nvgpu_gr_isr_data *isr_data); -/** - * @brief Report GR exceptions to qnx.sdl unit. - * - * @param g [in] Pointer to GPU driver struct. - * @param inst [in] Unit instance ID. - * @param err_type [in] Error type. - * @param status [in] Exception status value. - * @param sub_err_type [in] Sub error type. - * - * This function reports all GR exceptions to qnx.sdl unit. - * - * Other interrupt handling functions like #nvgpu_gr_intr_handle_fecs_error() - * call this function to report exceptions to qnx.sdl. - */ -void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst, - u32 err_type, u32 status, u32 sub_err_type); - /** * @brief Translate context to channel ID. 
* @@ -223,7 +206,6 @@ int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, * @see nvgpu_gr_intr_handle_notify_pending * @see nvgpu_gr_intr_handle_semaphore_pending * @see nvgpu_gr_intr_handle_sm_exception - * @see nvgpu_gr_intr_report_exception * @see nvgpu_gr_intr_set_error_notifier */ int nvgpu_gr_intr_stall_isr(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index aba00e1cb..eaa70567c 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -1195,4 +1195,16 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid, u32 mailbox_value); +/** + * @brief This is a wrapper function to report errors from NvGPU units to SDL. + * + * @param g [in] - The GPU driver struct. + * @param err_id [in] - Error ID. + * + * Calls nvgpu_report_err_to_ss to report errors to Safety_Services. + * + * @return None + */ +void nvgpu_report_err_to_sdl(struct gk20a *g, u32 err_id); + #endif /* NVGPU_NVGPU_ERR_H */ diff --git a/drivers/gpu/nvgpu/os/linux/cic/cic_stub.c b/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c similarity index 80% rename from drivers/gpu/nvgpu/os/linux/cic/cic_stub.c rename to drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c index f9cae7884..8a5a06012 100644 --- a/drivers/gpu/nvgpu/os/linux/cic/cic_stub.c +++ b/drivers/gpu/nvgpu/os/linux/cic/cic_report_err.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. + * Copyright (c) 2021-2022, NVIDIA Corporation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -20,7 +20,7 @@ struct gk20a; int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, - void *err_info, size_t err_size, bool is_critical) + u32 err_id) { + /** + * ToDo: Add MISC_EC API to report error. + * Decide on triggering SW quiesce for UE. + */ return 0; } diff --git a/drivers/gpu/nvgpu/os/posix/stubs.c b/drivers/gpu/nvgpu/os/posix/stubs.c index 079dfa315..c3976e45a 100644 --- a/drivers/gpu/nvgpu/os/posix/stubs.c +++ b/drivers/gpu/nvgpu/os/posix/stubs.c @@ -50,11 +50,9 @@ void nvgpu_ecc_sysfs_remove(struct gk20a *g) #endif int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, - void *err_info, size_t err_size, bool is_critical) + u32 err_id) { (void)g; - (void)err_info; - (void)err_size; - (void)is_critical; + (void)err_id; return 0; }