diff --git a/drivers/gpu/nvgpu/common/ecc.c b/drivers/gpu/nvgpu/common/ecc.c
index ab5895898..605ea3755 100644
--- a/drivers/gpu/nvgpu/common/ecc.c
+++ b/drivers/gpu/nvgpu/common/ecc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -22,6 +22,7 @@
 
 #include
 #include
+#include
 
 static void nvgpu_ecc_stat_add(struct gk20a *g, struct nvgpu_ecc_stat *stat)
 {
@@ -373,3 +374,122 @@ void nvgpu_ecc_remove_support(struct gk20a *g)
 	nvgpu_ecc_sysfs_remove(g);
 	nvgpu_ecc_free(g);
 }
+
+/*
+ * Report a HUBMMU ECC/parity error through the fb err_ops hook.
+ *
+ * Returns without reporting when the hook is not installed. A non-zero
+ * return from the hook is logged with nvgpu_err().
+ */
+void nvgpu_hubmmu_report_ecc_error(struct gk20a *g, u32 inst,
+		u32 err_type, u64 err_addr, u64 err_cnt)
+{
+	int ret = 0;
+
+	if (g->ops.fb.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	ret = g->ops.fb.err_ops.report_ecc_parity_err(g,
+			NVGPU_ERR_MODULE_HUBMMU, inst, err_type, err_addr,
+			err_cnt);
+	if (ret != 0) {
+		nvgpu_err(g, "Failed to report HUBMMU error: inst=%u, "
+				"err_type=%u, err_addr=%llu, err_cnt=%llu",
+				inst, err_type, err_addr, err_cnt);
+	}
+}
+
+/*
+ * Report an LTC ECC/parity error through the ltc err_ops hook.
+ *
+ * The instance id passed to the hook is (ltc << 8) | slice, so slice must
+ * be below 256; larger values are logged and nothing is reported. Returns
+ * without reporting when the hook is not installed.
+ */
+void nvgpu_ltc_report_ecc_error(struct gk20a *g, u32 ltc, u32 slice,
+		u32 err_type, u64 err_addr, u64 err_cnt)
+{
+	int ret = 0;
+	u32 inst = 0U;
+
+	if (g->ops.ltc.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	if (slice >= 256U) {
+		nvgpu_err(g, "Invalid slice id=%u", slice);
+		return;
+	}
+	inst = (ltc << 8U) | slice;
+	ret = g->ops.ltc.err_ops.report_ecc_parity_err(g,
+			NVGPU_ERR_MODULE_LTC, inst, err_type, err_addr,
+			err_cnt);
+	if (ret != 0) {
+		/*
+		 * Adjacent literals rather than a backslash continuation, so
+		 * the source indentation is not spliced into the message.
+		 */
+		nvgpu_err(g, "Failed to report LTC error: inst=%u, "
+				"err_type=%u, err_addr=%llu, err_cnt=%llu",
+				inst, err_type, err_addr, err_cnt);
+	}
+}
+
+/*
+ * Report a PMU ECC/parity error through the pmu err_ops hook.
+ *
+ * Returns without reporting when the hook is not installed. A non-zero
+ * return from the hook is logged with nvgpu_err().
+ */
+void nvgpu_pmu_report_ecc_error(struct gk20a *g, u32 inst,
+		u32 err_type, u64 err_addr, u64 err_cnt)
+{
+	int ret = 0;
+
+	if (g->ops.pmu.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	ret = g->ops.pmu.err_ops.report_ecc_parity_err(g,
+			NVGPU_ERR_MODULE_PWR, inst, err_type, err_addr,
+			err_cnt);
+	if (ret != 0) {
+		/* Literal concatenation keeps indentation out of the log. */
+		nvgpu_err(g, "Failed to report PMU error: inst=%u, "
+				"err_type=%u, err_addr=%llu, err_cnt=%llu",
+				inst, err_type, err_addr, err_cnt);
+	}
+}
+
+/*
+ * Report a GR ECC/parity error through the gr err_ops hook.
+ *
+ * hw_module is forwarded to the hook unchanged. The instance id passed to
+ * the hook is (gpc << 8) | tpc, so tpc must be below 256; larger values
+ * are logged and nothing is reported. Returns without reporting when the
+ * hook is not installed.
+ */
+void nvgpu_gr_report_ecc_error(struct gk20a *g, u32 hw_module,
+		u32 gpc, u32 tpc, u32 err_type,
+		u64 err_addr, u64 err_cnt)
+{
+	int ret = 0;
+	u32 inst = 0U;
+
+	if (g->ops.gr.err_ops.report_ecc_parity_err == NULL) {
+		return;
+	}
+	if (tpc >= 256U) {
+		nvgpu_err(g, "Invalid tpc id=%u", tpc);
+		return;
+	}
+	inst = (gpc << 8U) | tpc;
+	ret = g->ops.gr.err_ops.report_ecc_parity_err(g,
+			hw_module, inst, err_type,
+			err_addr, err_cnt);
+	if (ret != 0) {
+		/* Literal concatenation keeps indentation out of the log. */
+		nvgpu_err(g, "Failed to report GR error: hw_module=%u, "
+				"inst=%u, err_type=%u, err_addr=%llu, "
+				"err_cnt=%llu", hw_module, inst, err_type,
+				err_addr, err_cnt);
+	}
+}
diff --git a/drivers/gpu/nvgpu/common/fb/fb_gv11b.c b/drivers/gpu/nvgpu/common/fb/fb_gv11b.c
index c3d241a0a..eb6e6447e 100644
--- a/drivers/gpu/nvgpu/common/fb/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/common/fb/fb_gv11b.c
@@ -50,24 +50,6 @@
 #include
 #include
 
-static void gv11b_hubmmu_report_ecc_error(struct gk20a *g, u32 inst,
-		u32 err_type, u64 err_addr, u64 err_cnt)
-{
-	int ret = 0;
-
-	if (g->ops.fb.err_ops.report_ecc_parity_err == NULL) {
-		return;
-	}
-	ret = g->ops.fb.err_ops.report_ecc_parity_err(g,
-			NVGPU_ERR_MODULE_HUBMMU, inst, err_type, err_addr,
-			err_cnt);
-	if (ret != 0) {
-		nvgpu_err(g, "Failed to report HUBMMU error: inst=%u, "
-				"err_type=%u, err_addr=%llu, err_cnt=%llu",
-				inst, err_type, err_addr, err_cnt);
-	}
-}
-
 static int gv11b_fb_fix_page_fault(struct gk20a *g,
 		struct mmu_fault_info *mmfault);
 
@@ -514,7 +496,7 @@ void gv11b_handle_l2tlb_ecc_isr(struct gk20a *g, u32 ecc_status)
 
 	if ((ecc_status &
 		fb_mmu_l2tlb_ecc_status_corrected_err_l2tlb_sa_data_m()) != 0U) {
-
gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_l2tlb_ecc_corrected_err_count[0].counter); @@ -522,7 +504,7 @@ void gv11b_handle_l2tlb_ecc_isr(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_l2tlb_ecc_status_uncorrected_err_l2tlb_sa_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter); @@ -589,7 +571,7 @@ void gv11b_handle_hubtlb_ecc_isr(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_hubtlb_ecc_status_corrected_err_sa_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_hubtlb_ecc_corrected_err_count[0].counter); @@ -597,7 +579,7 @@ void gv11b_handle_hubtlb_ecc_isr(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter); @@ -664,7 +646,7 @@ void gv11b_handle_fillunit_ecc_isr(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pte_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_PTE_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter); @@ -672,7 +654,7 @@ void gv11b_handle_fillunit_ecc_isr(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); @@ -680,7 +662,7 @@ void 
gv11b_handle_fillunit_ecc_isr(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_PDE0_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter); @@ -688,7 +670,7 @@ void gv11b_handle_fillunit_ecc_isr(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m()) != 0U) { - gv11b_hubmmu_report_ecc_error(g, 0, + nvgpu_hubmmu_report_ecc_error(g, 0, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); diff --git a/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c b/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c index 201e81c91..54dbf1cdb 100644 --- a/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c +++ b/drivers/gpu/nvgpu/common/ltc/ltc_gv11b.c @@ -36,31 +36,6 @@ #include -static void gv11b_ltc_report_ecc_error(struct gk20a *g, u32 ltc, u32 slice, - u32 err_type, u64 err_addr, u64 err_cnt) -{ - int ret = 0; - u32 inst = 0U; - - if (g->ops.ltc.err_ops.report_ecc_parity_err == NULL) { - return; - } - if (slice < 256U) { - inst = (ltc << 8U) | slice; - } else { - nvgpu_err(g, "Invalid slice id=%u", slice); - return ; - } - ret = g->ops.ltc.err_ops.report_ecc_parity_err(g, - NVGPU_ERR_MODULE_LTC, inst, err_type, err_addr, - err_cnt); - if (ret != 0) { - nvgpu_err(g, "Failed to report LTC error: inst=%u, \ - err_type=%u, err_addr=%llu, err_cnt=%llu", - inst, err_type, err_addr, err_cnt); - } -} - /* * Sets the ZBC stencil for the passed index. 
*/ @@ -201,36 +176,36 @@ void gv11b_ltc_lts_isr(struct gk20a *g, unsigned int ltc, unsigned int slice) "ltc:%d lts: %d cache ecc interrupt intr: 0x%x", ltc, slice, ltc_intr3); if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_RSTG_ECC_CORRECTED, ecc_addr, g->ecc.ltc.ecc_sec_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr, g->ecc.ltc.ecc_ded_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_TSTG_ECC_CORRECTED, ecc_addr, g->ecc.ltc.ecc_sec_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, g->ecc.ltc.ecc_ded_count[ltc][slice].counter); nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) { if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr, g->ecc.ltc.ecc_sec_count[ltc][slice].counter); } else { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_DSTG_BE_ECC_CORRECTED, ecc_addr, 
g->ecc.ltc.ecc_sec_count[ltc][slice].counter); } @@ -238,11 +213,11 @@ void gv11b_ltc_lts_isr(struct gk20a *g, unsigned int ltc, unsigned int slice) } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) { if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr, g->ecc.ltc.ecc_ded_count[ltc][slice].counter); } else { - gv11b_ltc_report_ecc_error(g, ltc, slice, + nvgpu_ltc_report_ecc_error(g, ltc, slice, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr, g->ecc.ltc.ecc_ded_count[ltc][slice].counter); } diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c b/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c index ecaf34b7d..4fd1ffca8 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/common/pmu/pmu_gv11b.c @@ -116,24 +116,6 @@ static struct pg_init_sequence_list _pginitseq_gv11b[] = { {0x00020004U, 0x00000000U} , }; -static void gv11b_pmu_report_ecc_error(struct gk20a *g, u32 inst, - u32 err_type, u64 err_addr, u64 err_cnt) -{ - int ret = 0; - - if (g->ops.pmu.err_ops.report_ecc_parity_err == NULL) { - return ; - } - ret = g->ops.pmu.err_ops.report_ecc_parity_err(g, - NVGPU_ERR_MODULE_PWR, inst, err_type, err_addr, - err_cnt); - if (ret != 0) { - nvgpu_err(g, "Failed to report PMU error: inst=%u, \ - err_type=%u, err_addr=%llu, err_cnt=%llu", - inst, err_type, err_addr, err_cnt); - } -} - int gv11b_pmu_setup_elpg(struct gk20a *g) { int ret = 0; @@ -345,7 +327,7 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) "pmu ecc interrupt intr1: 0x%x", intr1); if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - gv11b_pmu_report_ecc_error(g, 0, + nvgpu_pmu_report_ecc_error(g, 0, GPU_PMU_FALCON_IMEM_ECC_CORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); @@ -353,7 +335,7 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 
intr0) "imem ecc error corrected"); } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - gv11b_pmu_report_ecc_error(g, 0, + nvgpu_pmu_report_ecc_error(g, 0, GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); @@ -361,7 +343,7 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) "imem ecc error uncorrected"); } if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - gv11b_pmu_report_ecc_error(g, 0, + nvgpu_pmu_report_ecc_error(g, 0, GPU_PMU_FALCON_DMEM_ECC_CORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); @@ -369,7 +351,7 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) "dmem ecc error corrected"); } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - gv11b_pmu_report_ecc_error(g, 0, + nvgpu_pmu_report_ecc_error(g, 0, GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 7c67a2d4e..d701bde67 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -77,33 +77,6 @@ */ #define GR_TPCS_INFO_FOR_MAPREGISTER 6U -static void gv11b_gr_report_ecc_error(struct gk20a *g, u32 hw_module, - u32 gpc, u32 tpc, u32 err_type, - u64 err_addr, u64 err_cnt) -{ - int ret = 0; - u32 inst = 0U; - - if (g->ops.gr.err_ops.report_ecc_parity_err == NULL) { - return; - } - if (tpc < 256U) { - inst = (gpc << 8) | tpc; - } else { - nvgpu_err(g, "Invalid tpc id=%u", tpc); - return ; - } - ret = g->ops.gr.err_ops.report_ecc_parity_err(g, - hw_module, inst, err_type, - err_addr, err_cnt); - if (ret != 0) { - nvgpu_err(g, "Failed to report GR error: hw_module=%u, \ - inst=%u, err_type=%u, err_addr=%llu, \ - err_cnt=%llu", hw_module, inst, err_type, - err_addr, err_cnt); - } -} - bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num) { bool valid = false; 
@@ -258,19 +231,19 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_TAG_ECC_CORRECTED, 0, g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0, g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0, g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); } @@ -293,19 +266,19 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); } if 
((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); } @@ -385,7 +358,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += lrf_corrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_LRF_ECC_CORRECTED, 0, g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter); gk20a_writel(g, @@ -404,7 +377,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += lrf_uncorrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_LRF_ECC_UNCORRECTED, 0, g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); gk20a_writel(g, @@ -551,7 +524,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += cbu_corrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_CBU_ECC_CORRECTED, 0, g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter); gk20a_writel(g, @@ -570,7 +543,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += cbu_uncorrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_CBU_ECC_UNCORRECTED, 0, 
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); gk20a_writel(g, @@ -637,7 +610,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += l1_data_corrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_DATA_ECC_CORRECTED, 0, g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter); gk20a_writel(g, @@ -656,7 +629,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += l1_data_uncorrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); gk20a_writel(g, @@ -731,25 +704,25 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, 0); if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED, 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED, 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, 
GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED, 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED, 0, g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter); } @@ -771,25 +744,25 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, 0); if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_SM, gpc, tpc, GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter); } @@ -878,7 +851,7 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, 
} g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter += gcc_l15_corrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc, GPU_GCC_L15_ECC_CORRECTED, 0, g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter); gk20a_writel(g, @@ -897,7 +870,7 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter += gcc_l15_uncorrected_err_count_delta; - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GCC, gpc, tpc, GPU_GCC_L15_ECC_UNCORRECTED, 0, g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter); gk20a_writel(g, @@ -983,28 +956,28 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED, 0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); } if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, 0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); } if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED, 0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); } if ((ecc_status & 
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, 0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); @@ -1084,28 +1057,28 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED, ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error 
uncorrected"); @@ -2398,14 +2371,14 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) "fecs ecc interrupt intr: 0x%x", intr); if ((ecc_status & gr_fecs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, GPU_FECS_FALCON_IMEM_ECC_CORRECTED, ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, @@ -2413,14 +2386,14 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) } if ((ecc_status & gr_fecs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, GPU_FECS_FALCON_DMEM_ECC_CORRECTED, ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - gv11b_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, + nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_FECS, 0, 0, GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 7c0716d50..728202482 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -155,4 +155,17 @@ void nvgpu_report_gr_exception(struct gk20a *g, u32 inst, void nvgpu_report_ce_error(struct gk20a *g, u32 inst, u32 err_type, u32 
status); + +void nvgpu_hubmmu_report_ecc_error(struct gk20a *g, u32 inst, + u32 err_type, u64 err_addr, u64 err_cnt); + +void nvgpu_ltc_report_ecc_error(struct gk20a *g, u32 ltc, u32 slice, + u32 err_type, u64 err_addr, u64 err_cnt); + +void nvgpu_pmu_report_ecc_error(struct gk20a *g, u32 inst, + u32 err_type, u64 err_addr, u64 err_cnt); + +void nvgpu_gr_report_ecc_error(struct gk20a *g, u32 hw_module, + u32 gpc, u32 tpc, u32 err_type, + u64 err_addr, u64 err_cnt); #endif