mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 09:12:24 +03:00
gpu: nvgpu: add missing error reporting for GV11B HALs
Error reporting was removed from the following functions in GV11B:
1. gp10b_priv_ring_decode_error_code
2. gv11b_gr_intr_report_gpcmmu_ecc_err
3. gv11b_gr_intr_report_icache_uncorrected_err -> Duplicate
4. gv11b_gr_intr_report_l1_tag_corrected_err -> Duplicate
5. gv11b_gr_intr_report_l1_tag_uncorrected_err -> Duplicate
6. gv11b_ltc_intr_handle_dstg_ecc_interrupts
7. gv11b_ltc_intr_handle_ecc_sec_ded_interrupts
8. gv11b_ltc_intr_handle_tstg_ecc_interrupts
9. gv11b_pbdma_handle_intr_1

The ones marked "Duplicate" are the only ones which are used for both gv11b and ga10b. The others are invoked only for gv11b and not for ga10b.

a) For gv11b_gr_intr_report_l1_tag_corrected_err and gv11b_gr_intr_report_l1_tag_uncorrected_err, the errors are handled by moving the reporting into the gv11b_gr_intr_set_l1_tag_corrected_err and gv11b_gr_intr_set_l1_tag_uncorrected_err functions respectively. These functions are invoked only from GV11B.

b) For gv11b_gr_intr_report_icache_uncorrected_err, the errors are handled by adding the reporting to gv11b_set_icache_ecc_status_uncorrected_errors, which is specific to gv11b.

Bug 200588528

Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Change-Id: I581bdfec8f996643d6af63b2b80a135e7d715b89
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2770836
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
1274f25dda
commit
00f4dbf9aa
@@ -195,6 +195,9 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
|
||||
|
||||
recover = true;
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
|
||||
GPU_HOST_PBDMA_HCE_ERROR);
|
||||
|
||||
if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
|
||||
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",
|
||||
pbdma_id);
|
||||
|
||||
@@ -376,6 +376,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
|
||||
if ((ecc_status &
|
||||
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
|
||||
0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
|
||||
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc);
|
||||
}
|
||||
if ((ecc_status &
|
||||
@@ -390,6 +392,8 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
|
||||
if ((ecc_status &
|
||||
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
|
||||
0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
|
||||
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc);
|
||||
}
|
||||
}
|
||||
@@ -978,6 +982,8 @@ static void gv11b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
|
||||
if ((l1_tag_ecc_status &
|
||||
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_ECC_UNCORRECTED);
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_TAG_ECC_UNCORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -1009,6 +1015,8 @@ static void gv11b_gr_intr_set_l1_tag_corrected_err(struct gk20a *g,
|
||||
if ((l1_tag_ecc_status &
|
||||
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
|
||||
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) {
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED);
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_L1_TAG_ECC_CORRECTED;
|
||||
ecc_status->err_count =
|
||||
@@ -1592,6 +1600,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED;
|
||||
ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if ((icache_ecc_status &
|
||||
@@ -1599,6 +1609,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED;
|
||||
ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
|
||||
}
|
||||
|
||||
if ((icache_ecc_status &
|
||||
@@ -1606,6 +1618,8 @@ static void gv11b_set_icache_ecc_status_uncorrected_errors(struct gk20a *g,
|
||||
ecc_status->err_id[ecc_status->err_count] =
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED;
|
||||
ecc_status->err_count = nvgpu_safe_add_u32(ecc_status->err_count, 1U);
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
|
||||
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -126,6 +126,8 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
|
||||
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
|
||||
uncorrected_delta);
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
|
||||
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "tstg ecc error uncorrected. "
|
||||
"ecc_addr(0x%x)", ecc_addr);
|
||||
}
|
||||
@@ -279,6 +281,8 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
|
||||
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
|
||||
ecc_stats_reg_val);
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
|
||||
GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
|
||||
nvgpu_err(g, "dstg ecc error corrected. "
|
||||
"ecc_addr(0x%x)", dstg_ecc_addr);
|
||||
|
||||
@@ -324,6 +328,9 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
|
||||
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
|
||||
ecc_stats_reg_val);
|
||||
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
|
||||
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
|
||||
nvgpu_err(g, "dstg ecc error uncorrected. "
|
||||
"ecc_addr(0x%x)", dstg_ecc_addr);
|
||||
}
|
||||
|
||||
@@ -71,6 +71,9 @@ void gp10b_priv_ring_decode_error_code(struct gk20a *g,
|
||||
{
|
||||
u32 error_type_index;
|
||||
|
||||
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_PRI,
|
||||
GPU_PRI_ACCESS_VIOLATION);
|
||||
|
||||
error_type_index = (error_code & 0x00000f00U) >> 8U;
|
||||
error_code = error_code & 0xBADFf000U;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user