mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 10:34:43 +03:00
gpu: nvgpu: update reporting of errors to sdl
In Drive 6.0, the error reporting is supported only for orin (ga10b) in dev-main. For this purpose, this patch does the following:
- Removes the redundant reporting of the following IDs from gv11b:
  - GPU_HOST_PFIFO_SCHED_ERROR
  - GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR
  - GPU_HOST_PBDMA_HCE_ERROR
  - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
  - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
  - GPU_LTC_CACHE_DSTG_ECC_CORRECTED
  - GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED
- Migrates the reporting of the following IDs from gv11b to ga10b:
  - GPU_SM_L1_TAG_ECC_CORRECTED
  - GPU_SM_L1_TAG_ECC_UNCORRECTED
  - GPU_SM_CBU_ECC_UNCORRECTED
  - GPU_SM_LRF_ECC_UNCORRECTED
  - GPU_SM_L1_DATA_ECC_UNCORRECTED
  - GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED
  - GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED
  - GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED
  - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED
- Removes the unused ID that doesn't have any HSI related to it:
  - GPU_HOST_PBDMA_PREEMPT_ERROR

In addition to the above, this patch does the following:
- Updates error IDs related to page fault error.
- Updates look-up table to remove unused error IDs.

JIRA NVGPU-8094
Bug 200729736

Change-Id: Ifea76d38ba609c894560e61ff5a6e406290f919e
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2685249
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Dinesh T <dt@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
committed by
mobile promotions
parent
7ff977063b
commit
37c6b8b1c3
@@ -45,7 +45,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "host",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_HOST,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 17U,
|
||||
.num_errs = 16U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_CRITERR("pfifo_bind_error",
|
||||
GPU_HOST_PFIFO_BIND_ERROR, INJECT_SW,
|
||||
@@ -113,11 +113,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("pbdma_preempt_error",
|
||||
GPU_HOST_PBDMA_PREEMPT_ERROR,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("pfifo_ctxsw_timeout",
|
||||
GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR,
|
||||
INJECT_SW,
|
||||
@@ -134,7 +129,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "sm",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_SM,
|
||||
.num_instances = 8U,
|
||||
.num_errs = 21U,
|
||||
.num_errs = 12U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("l1_tag_ecc_corrected",
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED,
|
||||
@@ -146,73 +141,41 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("cbu_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("cbu_ecc_uncorrected",
|
||||
GPU_SM_CBU_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("lrf_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("lrf_ecc_uncorrected",
|
||||
GPU_SM_LRF_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("l1_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("l1_data_ecc_uncorrected",
|
||||
GPU_SM_L1_DATA_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("icache_l0_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("icache_l0_data_ecc_uncorrected",
|
||||
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("icache_l1_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("icache_l1_data_ecc_uncorrected",
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("icache_l0_predecode_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
|
||||
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
|
||||
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
|
||||
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
@@ -223,12 +186,9 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("icache_l1_predecode_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
|
||||
0, INJECT_NONE,
|
||||
GPU_CRITERR("rams_urf_ecc_uncorrected",
|
||||
GPU_SM_RAMS_URF_ECC_UNCORRECTED,
|
||||
INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
},
|
||||
@@ -237,7 +197,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "fecs",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 8U,
|
||||
.num_errs = 7U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("falcon_imem_ecc_corrected",
|
||||
GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
|
||||
@@ -249,10 +209,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("falcon_dmem_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
|
||||
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
@@ -284,7 +240,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "gpccs",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_GPCCS,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 4U,
|
||||
.num_errs = 3U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("falcon_imem_ecc_corrected",
|
||||
GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
|
||||
@@ -296,10 +252,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("falcon_dmem_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
|
||||
GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
@@ -311,21 +263,13 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "mmu",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_MMU,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 4U,
|
||||
.num_errs = 2U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("l1tlb_sa_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("l1tlb_sa_data_ecc_uncorrected",
|
||||
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("l1tlb_fa_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("l1tlb_fa_data_ecc_uncorrected",
|
||||
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
@@ -337,12 +281,8 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.name = "gcc",
|
||||
.hw_unit = (u32)NVGPU_ERR_MODULE_GCC,
|
||||
.num_instances = 1U,
|
||||
.num_errs = 2U,
|
||||
.num_errs = 1U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("l15_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("l15_ecc_uncorrected",
|
||||
GPU_GCC_L15_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
@@ -555,44 +495,48 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
|
||||
.num_instances = 1U,
|
||||
.num_errs = 9U,
|
||||
.errs = (struct nvgpu_err_desc[]) {
|
||||
GPU_NONCRITERR("hubmmu_l2tlb_sa_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_l2tlb_sa_data_ecc_uncorrected",
|
||||
GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("hubmmu_tlb_sa_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_tlb_sa_data_ecc_uncorrected",
|
||||
GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("hubmmu_pte_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_pte_data_ecc_uncorrected",
|
||||
GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
|
||||
INJECT_TYPE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_NONCRITERR("hubmmu_pde0_data_ecc_corrected",
|
||||
0, INJECT_NONE,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_pde0_data_ecc_uncorrected",
|
||||
GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_page_fault_error",
|
||||
GPU_HUBMMU_PAGE_FAULT_ERROR,
|
||||
GPU_CRITERR("hubmmu_page_fault_other_fault_notify_error",
|
||||
GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_page_fault_nonreplayable_fault_overflow_error",
|
||||
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_page_fault_replayable_fault_overflow_error",
|
||||
GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_page_fault_replayable_fault_notify_error",
|
||||
GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
GPU_CRITERR("hubmmu_page_fault_nonreplayable_fault_notify_error",
|
||||
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR,
|
||||
INJECT_SW,
|
||||
NULL, NULL,
|
||||
NULL, NULL, 0, 0),
|
||||
|
||||
@@ -513,7 +513,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
|
||||
fb_niso_intr_mmu_other_fault_notify_m()) != 0U) {
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
|
||||
GPU_HUBMMU_PAGE_FAULT_ERROR);
|
||||
GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR);
|
||||
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
|
||||
"sub-err: OTHER_FAULT_NOTIFY. "
|
||||
"fault_status(0x%x)", fault_status);
|
||||
@@ -542,7 +542,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
|
||||
fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) {
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
|
||||
GPU_HUBMMU_PAGE_FAULT_ERROR);
|
||||
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR);
|
||||
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
|
||||
"sub-err: NONREPLAYABLE_FAULT_OVERFLOW. "
|
||||
"fault_status(0x%x)", fault_status);
|
||||
@@ -567,7 +567,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
|
||||
fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) {
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
|
||||
GPU_HUBMMU_PAGE_FAULT_ERROR);
|
||||
GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR);
|
||||
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
|
||||
"sub-err: REPLAYABLE_FAULT_OVERFLOW. "
|
||||
"fault_status(0x%x)", fault_status);
|
||||
|
||||
@@ -215,9 +215,6 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g)
|
||||
continue;
|
||||
}
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
|
||||
GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR);
|
||||
|
||||
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
|
||||
recover = g->ops.tsg.check_ctxsw_timeout(tsg,
|
||||
&debug_dump, &ms);
|
||||
|
||||
@@ -132,9 +132,6 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g)
|
||||
nvgpu_err(g, "fifo sched error code not supported");
|
||||
}
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
|
||||
GPU_HOST_PFIFO_SCHED_ERROR);
|
||||
|
||||
if (sched_error == SCHED_ERROR_CODE_BAD_TSG) {
|
||||
/* id is unknown, preempt all runlists and do recovery */
|
||||
nvgpu_rc_sched_error_bad_tsg(g);
|
||||
|
||||
@@ -195,9 +195,6 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
|
||||
|
||||
recover = true;
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
|
||||
GPU_HOST_PBDMA_HCE_ERROR);
|
||||
|
||||
if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
|
||||
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",
|
||||
pbdma_id);
|
||||
|
||||
@@ -483,6 +483,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
|
||||
|
||||
if ((l1_tag_ecc_status &
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_l1_tag_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_TAG_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -491,6 +494,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
|
||||
|
||||
if ((l1_tag_ecc_status &
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -499,6 +505,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
|
||||
|
||||
if ((l1_tag_ecc_status &
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -513,6 +522,9 @@ static void ga10b_gr_intr_set_l1_tag_corrected_err(struct gk20a *g,
|
||||
|
||||
if ((l1_tag_ecc_status &
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m()) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED);
|
||||
nvgpu_err(g, "sm_l1_tag_ecc_corrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -569,6 +581,9 @@ static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g,
|
||||
ecc_status->err_count = 0U;
|
||||
|
||||
if (uncorr_err != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_CBU_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_lrf_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_LRF_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -604,6 +619,9 @@ static bool ga10b_gr_intr_sm_cbu_ecc_status_errors(struct gk20a *g,
|
||||
ecc_status->err_count = 0;
|
||||
|
||||
if (uncorr_err != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_CBU_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_cbu_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_CBU_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -636,6 +654,9 @@ static bool ga10b_gr_intr_sm_l1_data_ecc_status_errors(struct gk20a *g,
|
||||
ecc_status->err_count = 0U;
|
||||
|
||||
if (uncorr_err != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_l1_data_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_DATA_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -727,6 +748,9 @@ static bool ga10b_gr_intr_sm_icache_ecc_status_errors(struct gk20a *g,
|
||||
ecc_status->err_count = 0U;
|
||||
|
||||
if (uncorr_err != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected");
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
|
||||
@@ -371,8 +371,6 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
|
||||
if ((ecc_status &
|
||||
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
|
||||
0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
|
||||
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc);
|
||||
}
|
||||
if ((ecc_status &
|
||||
@@ -387,8 +385,6 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
|
||||
if ((ecc_status &
|
||||
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
|
||||
0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
|
||||
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc);
|
||||
}
|
||||
}
|
||||
@@ -909,22 +905,16 @@ static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g,
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) {
|
||||
nvgpu_err(g, "sm_l1_tag_ecc_uncorrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) {
|
||||
nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) {
|
||||
nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -944,8 +934,6 @@ static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g,
|
||||
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) {
|
||||
nvgpu_err(g, "sm_l1_tag_ecc_corrected "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1265,8 +1253,6 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
|
||||
nvgpu_safe_add_u32(
|
||||
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
|
||||
lrf_uncorrected_err_count_delta);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_LRF_ECC_UNCORRECTED);
|
||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||
gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset),
|
||||
0U);
|
||||
@@ -1398,8 +1384,6 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
|
||||
nvgpu_safe_add_u32(
|
||||
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter,
|
||||
cbu_uncorrected_err_count_delta);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_CBU_ECC_UNCORRECTED);
|
||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||
gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset),
|
||||
0U);
|
||||
@@ -1527,8 +1511,6 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
|
||||
nvgpu_safe_add_u32(
|
||||
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter,
|
||||
l1_data_uncorrected_err_count_delta);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_writel(g, nvgpu_safe_add_u32(
|
||||
gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset),
|
||||
0U);
|
||||
@@ -1553,22 +1535,16 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g,
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) {
|
||||
nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) {
|
||||
nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) {
|
||||
nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected. "
|
||||
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,8 +126,6 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
|
||||
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
|
||||
uncorrected_delta);
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
|
||||
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "tstg ecc error uncorrected. "
|
||||
"ecc_addr(0x%x)", ecc_addr);
|
||||
}
|
||||
@@ -281,8 +279,6 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
|
||||
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
|
||||
ecc_stats_reg_val);
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
|
||||
GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
|
||||
nvgpu_err(g, "dstg ecc error corrected. "
|
||||
"ecc_addr(0x%x)", dstg_ecc_addr);
|
||||
|
||||
@@ -328,8 +324,6 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
|
||||
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
|
||||
ecc_stats_reg_val);
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
|
||||
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "dstg ecc error uncorrected. "
|
||||
"ecc_addr(0x%x)", dstg_ecc_addr);
|
||||
}
|
||||
|
||||
@@ -499,7 +499,7 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g,
|
||||
u32 *invalidate_replay_val_ptr, u32 rd32_val, u32 fault_status,
|
||||
u32 index, u32 get_indx, u32 offset, u32 entries)
|
||||
{
|
||||
u32 sub_err_type = 0U;
|
||||
u32 err_type = 0U;
|
||||
#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT
|
||||
u64 prev_fault_addr = 0ULL;
|
||||
u64 next_fault_addr = 0ULL;
|
||||
@@ -513,19 +513,17 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g,
|
||||
|
||||
#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT
|
||||
if (index == NVGPU_MMU_FAULT_REPLAY_REG_INDX) {
|
||||
sub_err_type = GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY;
|
||||
err_type = GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR;
|
||||
} else {
|
||||
#endif
|
||||
sub_err_type = GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY;
|
||||
err_type = GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR;
|
||||
#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT
|
||||
}
|
||||
#endif
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
|
||||
GPU_HUBMMU_PAGE_FAULT_ERROR);
|
||||
nvgpu_err(g, "page fault error: sub_er_type = 0x%x, "
|
||||
"fault_status = 0x%x",
|
||||
sub_err_type, fault_status);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, err_type);
|
||||
nvgpu_err(g, "page fault error: err_type = 0x%x, "
|
||||
"fault_status = 0x%x", err_type, fault_status);
|
||||
|
||||
nvgpu_assert(get_indx < U32_MAX);
|
||||
nvgpu_assert(entries != 0U);
|
||||
|
||||
@@ -78,10 +78,9 @@ struct mmu_fault_info;
|
||||
#define GPU_HOST_PBDMA_METHOD_ERROR (11U)
|
||||
#define GPU_HOST_PBDMA_SIGNATURE_ERROR (12U)
|
||||
#define GPU_HOST_PBDMA_HCE_ERROR (13U)
|
||||
#define GPU_HOST_PBDMA_PREEMPT_ERROR (14U)
|
||||
#define GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR (15U)
|
||||
#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (16U)
|
||||
#define GPU_HOST_INVALID_ERROR (17U)
|
||||
#define GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR (14U)
|
||||
#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (15U)
|
||||
#define GPU_HOST_INVALID_ERROR (16U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
@@ -116,11 +115,11 @@ struct mmu_fault_info;
|
||||
*/
|
||||
#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
|
||||
#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
|
||||
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
|
||||
#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT (4U)
|
||||
#define GPU_FECS_CTXSW_CRC_MISMATCH (5U)
|
||||
#define GPU_FECS_FAULT_DURING_CTXSW (6U)
|
||||
#define GPU_FECS_CTXSW_INIT_ERROR (7U)
|
||||
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (2U)
|
||||
#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT (3U)
|
||||
#define GPU_FECS_CTXSW_CRC_MISMATCH (4U)
|
||||
#define GPU_FECS_FAULT_DURING_CTXSW (5U)
|
||||
#define GPU_FECS_CTXSW_INIT_ERROR (6U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
@@ -132,7 +131,7 @@ struct mmu_fault_info;
|
||||
*/
|
||||
#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
|
||||
#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
|
||||
#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
|
||||
#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (2U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
@@ -153,7 +152,7 @@ struct mmu_fault_info;
|
||||
* Macros used to assign unique index to errors reported from the GCC unit.
|
||||
* @{
|
||||
*/
|
||||
#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
|
||||
#define GPU_GCC_L15_ECC_UNCORRECTED (0U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
@@ -264,25 +263,19 @@ struct mmu_fault_info;
|
||||
* Macros used to assign unique index to errors reported from the HUBMMU unit.
|
||||
* @{
|
||||
*/
|
||||
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (0U)
|
||||
#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (1U)
|
||||
#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (2U)
|
||||
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (3U)
|
||||
#define GPU_HUBMMU_PAGE_FAULT_ERROR (4U)
|
||||
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (0U)
|
||||
#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (1U)
|
||||
#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (2U)
|
||||
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (3U)
|
||||
#define GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR (4U)
|
||||
#define GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR (5U)
|
||||
#define GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR (6U)
|
||||
#define GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR (7U)
|
||||
#define GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR (8U)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* This assigns an unique index for sub-errors
|
||||
* in GPU_HUBMMU_PAGE_FAULT_ERROR.
|
||||
*/
|
||||
#define GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW (0U)
|
||||
#define GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY (1U)
|
||||
#define GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW (2U)
|
||||
#define GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY (3U)
|
||||
#define GPU_HUBMMU_OTHER_FAULT_NOTIFY (4U)
|
||||
|
||||
/**
|
||||
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_PRI
|
||||
* Macros used to assign unique index to errors reported from the PRI unit.
|
||||
|
||||
Reference in New Issue
Block a user