From 37c6b8b1c361ba8a4cd1dea7901ca5fac4433ee6 Mon Sep 17 00:00:00 2001 From: Rajesh Devaraj Date: Tue, 22 Mar 2022 17:11:29 +0530 Subject: [PATCH] gpu: nvgpu: update reporting of errors to sdl In Drive 6.0, the error reporting is supported only for orin (ga10b) in dev-main. For this purpose, this patch does the following: - Removes the redundant reporting of following IDs from gv11b: - GPU_HOST_PFIFO_SCHED_ERROR - GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR - GPU_HOST_PBDMA_HCE_ERROR - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED - GPU_LTC_CACHE_DSTG_ECC_CORRECTED - GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED - Migrates the reporting of following IDs from gv11b to ga10b: - GPU_SM_L1_TAG_ECC_CORRECTED - GPU_SM_L1_TAG_ECC_UNCORRECTED - GPU_SM_CBU_ECC_UNCORRECTED - GPU_SM_LRF_ECC_UNCORRECTED - GPU_SM_L1_DATA_ECC_UNCORRECTED - GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED - GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED - GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED - Removes the unused ID that doesn't have any HSI related to it: - GPU_HOST_PBDMA_PREEMPT_ERROR In addition to the above, this patch does the following: - Updates error IDs related to page fault error. - Updates look-up table to remove unused error IDs. 
JIRA NVGPU-8094 Bug 200729736 Change-Id: Ifea76d38ba609c894560e61ff5a6e406290f919e Signed-off-by: Rajesh Devaraj Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2685249 Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-cert Reviewed-by: Dinesh T Reviewed-by: Vaibhav Kachore GVS: Gerrit_Virtual_Submit --- .../gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c | 118 +++++------------- .../nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c | 6 +- .../nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c | 3 - .../gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c | 3 - drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c | 3 - .../nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c | 24 ++++ .../nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c | 24 ---- .../nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c | 6 - .../hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c | 14 +-- drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 45 +++---- 10 files changed, 83 insertions(+), 163 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c index 2d17cc79f..e4a2d946a 100644 --- a/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/cic/mon/lut_ga10b_fusa.c @@ -45,7 +45,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "host", .hw_unit = (u32)NVGPU_ERR_MODULE_HOST, .num_instances = 1U, - .num_errs = 17U, + .num_errs = 16U, .errs = (struct nvgpu_err_desc[]) { GPU_CRITERR("pfifo_bind_error", GPU_HOST_PFIFO_BIND_ERROR, INJECT_SW, @@ -113,11 +113,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_CRITERR("pbdma_preempt_error", - GPU_HOST_PBDMA_PREEMPT_ERROR, - INJECT_SW, - NULL, NULL, - NULL, NULL, 0, 0), GPU_NONCRITERR("pfifo_ctxsw_timeout", GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR, INJECT_SW, @@ -134,7 +129,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "sm", .hw_unit = (u32)NVGPU_ERR_MODULE_SM, .num_instances = 8U, - .num_errs = 21U, + .num_errs = 12U, .errs = (struct nvgpu_err_desc[]) { 
GPU_NONCRITERR("l1_tag_ecc_corrected", GPU_SM_L1_TAG_ECC_CORRECTED, @@ -146,73 +141,41 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("cbu_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("cbu_ecc_uncorrected", GPU_SM_CBU_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("lrf_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("lrf_ecc_uncorrected", GPU_SM_LRF_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("l1_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("l1_data_ecc_uncorrected", GPU_SM_L1_DATA_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("icache_l0_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("icache_l0_data_ecc_uncorrected", GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("icache_l1_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("icache_l1_data_ecc_uncorrected", GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("icache_l0_predecode_ecc_uncorrected", GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected", GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected", GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, INJECT_SW, @@ -223,12 +186,9 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = 
{ INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), - GPU_CRITERR("icache_l1_predecode_ecc_uncorrected", - 0, INJECT_NONE, + GPU_CRITERR("rams_urf_ecc_uncorrected", + GPU_SM_RAMS_URF_ECC_UNCORRECTED, + INJECT_NONE, NULL, NULL, NULL, NULL, 0, 0), }, @@ -237,7 +197,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "fecs", .hw_unit = (u32)NVGPU_ERR_MODULE_FECS, .num_instances = 1U, - .num_errs = 8U, + .num_errs = 7U, .errs = (struct nvgpu_err_desc[]) { GPU_NONCRITERR("falcon_imem_ecc_corrected", GPU_FECS_FALCON_IMEM_ECC_CORRECTED, @@ -249,10 +209,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("falcon_dmem_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("falcon_dmem_ecc_uncorrected", GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, INJECT_SW, @@ -284,7 +240,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "gpccs", .hw_unit = (u32)NVGPU_ERR_MODULE_GPCCS, .num_instances = 1U, - .num_errs = 4U, + .num_errs = 3U, .errs = (struct nvgpu_err_desc[]) { GPU_NONCRITERR("falcon_imem_ecc_corrected", GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, @@ -296,10 +252,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("falcon_dmem_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("falcon_dmem_ecc_uncorrected", GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, INJECT_SW, @@ -311,21 +263,13 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "mmu", .hw_unit = (u32)NVGPU_ERR_MODULE_MMU, .num_instances = 1U, - .num_errs = 4U, + .num_errs = 2U, .errs = (struct nvgpu_err_desc[]) { - GPU_NONCRITERR("l1tlb_sa_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("l1tlb_sa_data_ecc_uncorrected", GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - 
GPU_NONCRITERR("l1tlb_fa_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("l1tlb_fa_data_ecc_uncorrected", GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, INJECT_SW, @@ -337,12 +281,8 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .name = "gcc", .hw_unit = (u32)NVGPU_ERR_MODULE_GCC, .num_instances = 1U, - .num_errs = 2U, + .num_errs = 1U, .errs = (struct nvgpu_err_desc[]) { - GPU_NONCRITERR("l15_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("l15_ecc_uncorrected", GPU_GCC_L15_ECC_UNCORRECTED, INJECT_TYPE, @@ -555,44 +495,48 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = { .num_instances = 1U, .num_errs = 9U, .errs = (struct nvgpu_err_desc[]) { - GPU_NONCRITERR("hubmmu_l2tlb_sa_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("hubmmu_l2tlb_sa_data_ecc_uncorrected", GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("hubmmu_tlb_sa_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("hubmmu_tlb_sa_data_ecc_uncorrected", GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("hubmmu_pte_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("hubmmu_pte_data_ecc_uncorrected", GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, INJECT_TYPE, NULL, NULL, NULL, NULL, 0, 0), - GPU_NONCRITERR("hubmmu_pde0_data_ecc_corrected", - 0, INJECT_NONE, - NULL, NULL, - NULL, NULL, 0, 0), GPU_CRITERR("hubmmu_pde0_data_ecc_uncorrected", GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), - GPU_CRITERR("hubmmu_page_fault_error", - GPU_HUBMMU_PAGE_FAULT_ERROR, + GPU_CRITERR("hubmmu_page_fault_other_fault_notify_error", + GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_page_fault_nonreplayable_fault_overflow_error", + 
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_page_fault_replayable_fault_overflow_error", + GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_page_fault_replayable_fault_notify_error", + GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR, + INJECT_SW, + NULL, NULL, + NULL, NULL, 0, 0), + GPU_CRITERR("hubmmu_page_fault_nonreplayable_fault_notify_error", + GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR, INJECT_SW, NULL, NULL, NULL, NULL, 0, 0), diff --git a/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c index 57bcc9a79..14fb8efea 100644 --- a/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fb/fb_mmu_fault_gv11b_fusa.c @@ -513,7 +513,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr) fb_niso_intr_mmu_other_fault_notify_m()) != 0U) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR); + GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR); nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. " "sub-err: OTHER_FAULT_NOTIFY. " "fault_status(0x%x)", fault_status); @@ -542,7 +542,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr) fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR); + GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR); nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. " "sub-err: NONREPLAYABLE_FAULT_OVERFLOW. " "fault_status(0x%x)", fault_status); @@ -567,7 +567,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr) fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) { nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR); + GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR); nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. 
" "sub-err: REPLAYABLE_FAULT_OVERFLOW. " "fault_status(0x%x)", fault_status); diff --git a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c index 5239ae4c4..95f969c74 100644 --- a/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/ctxsw_timeout_gv11b_fusa.c @@ -215,9 +215,6 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g) continue; } - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST, - GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR); - #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT recover = g->ops.tsg.check_ctxsw_timeout(tsg, &debug_dump, &ms); diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c index a6ce39fda..7b32e5c3d 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_gv11b_fusa.c @@ -132,9 +132,6 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g) nvgpu_err(g, "fifo sched error code not supported"); } - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST, - GPU_HOST_PFIFO_SCHED_ERROR); - if (sched_error == SCHED_ERROR_CODE_BAD_TSG) { /* id is unknown, preempt all runlists and do recovery */ nvgpu_rc_sched_error_bad_tsg(g); diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c index fa406a3c1..43408db27 100644 --- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gv11b_fusa.c @@ -195,9 +195,6 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1, recover = true; - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST, - GPU_HOST_PBDMA_HCE_ERROR); - if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) { nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d", pbdma_id); diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c index 8224580e2..995879ce7 100644 --- 
a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_ga10b_fusa.c @@ -483,6 +483,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g, if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_TAG_ECC_UNCORRECTED); + nvgpu_err(g, "sm_l1_tag_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_TAG_ECC_UNCORRECTED; ecc_status->err_count = @@ -491,6 +494,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g, if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED); + nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED; ecc_status->err_count = @@ -499,6 +505,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g, if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED); + nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED; ecc_status->err_count = @@ -513,6 +522,9 @@ static void ga10b_gr_intr_set_l1_tag_corrected_err(struct gk20a *g, if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m()) != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_TAG_ECC_CORRECTED); + nvgpu_err(g, "sm_l1_tag_ecc_corrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_TAG_ECC_CORRECTED; ecc_status->err_count = @@ -569,6 +581,9 @@ static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g, ecc_status->err_count = 0U; if (uncorr_err != 0U) { + 
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_LRF_ECC_UNCORRECTED); + nvgpu_err(g, "sm_lrf_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_LRF_ECC_UNCORRECTED; ecc_status->err_count = @@ -604,6 +619,9 @@ static bool ga10b_gr_intr_sm_cbu_ecc_status_errors(struct gk20a *g, ecc_status->err_count = 0; if (uncorr_err != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_CBU_ECC_UNCORRECTED); + nvgpu_err(g, "sm_cbu_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_CBU_ECC_UNCORRECTED; ecc_status->err_count = @@ -636,6 +654,9 @@ static bool ga10b_gr_intr_sm_l1_data_ecc_status_errors(struct gk20a *g, ecc_status->err_count = 0U; if (uncorr_err != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_L1_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "sm_l1_data_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_L1_DATA_ECC_UNCORRECTED; ecc_status->err_count = @@ -727,6 +748,9 @@ static bool ga10b_gr_intr_sm_icache_ecc_status_errors(struct gk20a *g, ecc_status->err_count = 0U; if (uncorr_err != 0U) { + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, + GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED); + nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected"); ecc_status->err_id[ecc_status->err_count] = GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED; ecc_status->err_count = diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c index db4aa11aa..07b7edc74 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c @@ -371,8 +371,6 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU, - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED); nvgpu_err(g, "uncorrected ecc sa data error. 
gpc_id(%d)", gpc); } if ((ecc_status & @@ -387,8 +385,6 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU, - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED); nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc); } } @@ -909,22 +905,16 @@ static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g, if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) { nvgpu_err(g, "sm_l1_tag_ecc_uncorrected " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_TAG_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) { nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) { nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED); } } } @@ -944,8 +934,6 @@ static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g, if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) { nvgpu_err(g, "sm_l1_tag_ecc_corrected " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_TAG_ECC_CORRECTED); } } } @@ -1265,8 +1253,6 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc nvgpu_safe_add_u32( g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter, lrf_uncorrected_err_count_delta); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_LRF_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset), 0U); @@ -1398,8 +1384,6 @@ static void 
gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc nvgpu_safe_add_u32( g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter, cbu_uncorrected_err_count_delta); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_CBU_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset), 0U); @@ -1527,8 +1511,6 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 nvgpu_safe_add_u32( g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter, l1_data_uncorrected_err_count_delta); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_L1_DATA_ECC_UNCORRECTED); nvgpu_writel(g, nvgpu_safe_add_u32( gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset), 0U); @@ -1553,22 +1535,16 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g, if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) { nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) { nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. " "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED); } if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) { nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected. 
" "gpc_id(%d), tpc_id(%d)", gpc, tpc); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM, - GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED); } } } diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c index ea4ee9b5f..55b4d762a 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b_fusa.c @@ -126,8 +126,6 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g, g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter, uncorrected_delta); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC, - GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED); nvgpu_err(g, "tstg ecc error uncorrected. " "ecc_addr(0x%x)", ecc_addr); } @@ -281,8 +279,6 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ecc_stats_reg_val); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC, - GPU_LTC_CACHE_DSTG_ECC_CORRECTED); nvgpu_err(g, "dstg ecc error corrected. " "ecc_addr(0x%x)", dstg_ecc_addr); @@ -328,8 +324,6 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt ltc_ltc0_lts0_dstg_ecc_report_r(), offset), ecc_stats_reg_val); - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC, - GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED); nvgpu_err(g, "dstg ecc error uncorrected. 
" "ecc_addr(0x%x)", dstg_ecc_addr); } diff --git a/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c index a5d913f3e..079b3d202 100644 --- a/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/mm/mmu_fault/mmu_fault_gv11b_fusa.c @@ -499,7 +499,7 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g, u32 *invalidate_replay_val_ptr, u32 rd32_val, u32 fault_status, u32 index, u32 get_indx, u32 offset, u32 entries) { - u32 sub_err_type = 0U; + u32 err_type = 0U; #ifdef CONFIG_NVGPU_REPLAYABLE_FAULT u64 prev_fault_addr = 0ULL; u64 next_fault_addr = 0ULL; @@ -513,19 +513,17 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g, #ifdef CONFIG_NVGPU_REPLAYABLE_FAULT if (index == NVGPU_MMU_FAULT_REPLAY_REG_INDX) { - sub_err_type = GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY; + err_type = GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR; } else { #endif - sub_err_type = GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY; + err_type = GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR; #ifdef CONFIG_NVGPU_REPLAYABLE_FAULT } #endif - nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, - GPU_HUBMMU_PAGE_FAULT_ERROR); - nvgpu_err(g, "page fault error: sub_er_type = 0x%x, " - "fault_status = 0x%x", - sub_err_type, fault_status); + nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, err_type); + nvgpu_err(g, "page fault error: err_type = 0x%x, " + "fault_status = 0x%x", err_type, fault_status); nvgpu_assert(get_indx < U32_MAX); nvgpu_assert(entries != 0U); diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index c46330682..9d6d397b5 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -78,10 +78,9 @@ struct mmu_fault_info; #define GPU_HOST_PBDMA_METHOD_ERROR (11U) #define GPU_HOST_PBDMA_SIGNATURE_ERROR (12U) #define GPU_HOST_PBDMA_HCE_ERROR (13U) 
-#define GPU_HOST_PBDMA_PREEMPT_ERROR (14U) -#define GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR (15U) -#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (16U) -#define GPU_HOST_INVALID_ERROR (17U) +#define GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR (14U) +#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (15U) +#define GPU_HOST_INVALID_ERROR (16U) /** * @} */ @@ -116,11 +115,11 @@ struct mmu_fault_info; */ #define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U) #define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U) -#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U) -#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT (4U) -#define GPU_FECS_CTXSW_CRC_MISMATCH (5U) -#define GPU_FECS_FAULT_DURING_CTXSW (6U) -#define GPU_FECS_CTXSW_INIT_ERROR (7U) +#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (2U) +#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT (3U) +#define GPU_FECS_CTXSW_CRC_MISMATCH (4U) +#define GPU_FECS_FAULT_DURING_CTXSW (5U) +#define GPU_FECS_CTXSW_INIT_ERROR (6U) /** * @} */ @@ -132,7 +131,7 @@ struct mmu_fault_info; */ #define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U) #define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U) -#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U) +#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (2U) /** * @} */ @@ -153,7 +152,7 @@ struct mmu_fault_info; * Macros used to assign unique index to errors reported from the GCC unit. * @{ */ -#define GPU_GCC_L15_ECC_UNCORRECTED (1U) +#define GPU_GCC_L15_ECC_UNCORRECTED (0U) /** * @} */ @@ -264,25 +263,19 @@ struct mmu_fault_info; * Macros used to assign unique index to errors reported from the HUBMMU unit. 
* @{ */ -#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (0U) -#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (1U) -#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (2U) -#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (3U) -#define GPU_HUBMMU_PAGE_FAULT_ERROR (4U) +#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (0U) +#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (1U) +#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (2U) +#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (3U) +#define GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR (4U) +#define GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR (5U) +#define GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR (6U) +#define GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR (7U) +#define GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR (8U) /** * @} */ -/** - * This assigns an unique index for sub-errors - * in GPU_HUBMMU_PAGE_FAULT_ERROR. - */ -#define GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW (0U) -#define GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY (1U) -#define GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW (2U) -#define GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY (3U) -#define GPU_HUBMMU_OTHER_FAULT_NOTIFY (4U) - /** * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PRI * Macros used to assign unique index to errors reported from the PRI unit.