diff --git a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c index be1d26126..8ad2f9a42 100644 --- a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c @@ -31,6 +31,46 @@ #include +static struct nvgpu_hw_err_inject_info hubmmu_ecc_err_desc[] = { /* HUBMMU ECC error-injection table. Each NVGPU_ECC_ERR entry gives an error name string, the injection callback, and two hardware accessor names (a control register ..._r and an inject field ..._f). The macro is defined outside this patch, so the exact field mapping is presumed - confirm. */ + NVGPU_ECC_ERR("hubmmu_l2tlb_sa_data_ecc_uncorrected", + gv11b_fb_intr_inject_hubmmu_ecc_error, + fb_mmu_l2tlb_ecc_control_r, + fb_mmu_l2tlb_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("hubmmu_tlb_sa_data_ecc_uncorrected", + gv11b_fb_intr_inject_hubmmu_ecc_error, + fb_mmu_hubtlb_ecc_control_r, + fb_mmu_hubtlb_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("hubmmu_pte_data_ecc_uncorrected", + gv11b_fb_intr_inject_hubmmu_ecc_error, + fb_mmu_fillunit_ecc_control_r, + fb_mmu_fillunit_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc hubmmu_err_desc; /* File-static descriptor that gv11b_fb_intr_get_hubmmu_err_desc() fills in and returns. */ + +struct nvgpu_hw_err_inject_info_desc * +gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g) /* Points the file-static descriptor at hubmmu_ecc_err_desc, stores the table element count in info_size, and returns the descriptor address. g is unused. */ +{ + hubmmu_err_desc.info_ptr = hubmmu_ecc_err_desc; + hubmmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(hubmmu_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &hubmmu_err_desc; +} + +int gv11b_fb_intr_inject_hubmmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, + u32 error_info) /* Logs the error name, then writes err->get_reg_val(1U) to the register address returned by err->get_reg_addr(). error_info is unused. Always returns 0. */ +{ + unsigned int reg_addr = err->get_reg_addr(); + + nvgpu_info(g, "Injecting HUBMMU fault %s", err->name); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status) { u32 ecc_addr, corrected_cnt, uncorrected_cnt; @@ -83,7 +123,7 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_l2tlb_ecc_status_corrected_err_l2tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, 
NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_l2tlb_ecc_corrected_err_count[0].counter); @@ -92,7 +132,7 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_l2tlb_ecc_status_uncorrected_err_l2tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter); @@ -161,7 +201,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_hubtlb_ecc_status_corrected_err_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_hubtlb_ecc_corrected_err_count[0].counter); @@ -169,7 +209,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter); @@ -239,7 +279,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pte_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PTE_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter); @@ -248,7 +288,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m()) != 0U) { - (void) 
nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); @@ -256,7 +296,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PDE0_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter); @@ -265,7 +305,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); diff --git a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h index 9ecdce587..bc73a6148 100644 --- a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h @@ -25,8 +25,17 @@ #ifndef NVGPU_FB_INTR_ECC_GV11B_H #define NVGPU_FB_INTR_ECC_GV11B_H +#include +#include + struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; void gv11b_fb_intr_handle_ecc(struct gk20a *g); +struct nvgpu_hw_err_inject_info_desc * + gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g); +int gv11b_fb_intr_inject_hubmmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); #endif /* NVGPU_FB_INTR_ECC_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c index 0c14be62c..d708c23c5 100644 --- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c +++ 
b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c @@ -28,6 +28,203 @@ #include "ecc_gv11b.h" +static struct nvgpu_hw_err_inject_info fecs_ecc_err_desc[] = { /* FECS falcon ECC error-injection table. Each NVGPU_ECC_ERR entry gives an error name string, the injection callback, and two hardware accessor names (a control register ..._r and an inject field ..._f). The macro is defined outside this patch, so the exact field mapping is presumed - confirm. */ + NVGPU_ECC_ERR("falcon_imem_ecc_corrected", + gv11b_gr_intr_inject_fecs_ecc_error, + gr_fecs_falcon_ecc_control_r, + gr_fecs_falcon_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected", + gv11b_gr_intr_inject_fecs_ecc_error, + gr_fecs_falcon_ecc_control_r, + gr_fecs_falcon_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc fecs_err_desc; /* File-static descriptor that gv11b_gr_intr_get_fecs_err_desc() fills in and returns. */ + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_fecs_err_desc(struct gk20a *g) /* Points the file-static descriptor at fecs_ecc_err_desc, stores the table element count in info_size, and returns the descriptor address. g is unused. */ +{ + fecs_err_desc.info_ptr = fecs_ecc_err_desc; + fecs_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(fecs_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &fecs_err_desc; +} + +int gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) /* Logs the error name, then writes err->get_reg_val(1U) to the register address returned by err->get_reg_addr(). error_info is unused. Always returns 0. */ +{ + nvgpu_info(g, "Injecting FECS fault %s", err->name); + nvgpu_writel(g, err->get_reg_addr(), err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info gpccs_ecc_err_desc[] = { /* GPCCS falcon ECC error-injection table; same NVGPU_ECC_ERR entry layout as fecs_ecc_err_desc. */ + NVGPU_ECC_ERR("falcon_imem_ecc_corrected", + gv11b_gr_intr_inject_gpccs_ecc_error, + gr_gpccs_falcon_ecc_control_r, + gr_gpccs_falcon_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected", + gv11b_gr_intr_inject_gpccs_ecc_error, + gr_gpccs_falcon_ecc_control_r, + gr_gpccs_falcon_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc gpccs_err_desc; /* File-static descriptor that gv11b_gr_intr_get_gpccs_err_desc() fills in and returns. */ + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gpccs_err_desc(struct gk20a *g) /* Points the file-static descriptor at gpccs_ecc_err_desc, stores the table element count in info_size, and returns the descriptor address. g is unused. */ +{ + gpccs_err_desc.info_ptr = gpccs_ecc_err_desc; + gpccs_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(gpccs_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &gpccs_err_desc; +} + +int gv11b_gr_intr_inject_gpccs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info 
*err, u32 error_info) /* error_info bits [7:0] select the GPC. Writes err->get_reg_val(1U) to err->get_reg_addr() + gpc * GPU_LIT_GPC_STRIDE. NOTE(review): gpc is not range-checked here. Always returns 0. */ +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride; + + nvgpu_info(g, "Injecting GPCCS fault %s for gpc: %d", err->name, gpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info sm_ecc_err_desc[] = { /* SM ECC error-injection table (L1 tag, CBU, LRF, L1 data, icache L0 data); same NVGPU_ECC_ERR entry layout as fecs_ecc_err_desc. Register accessors are the gpc0/tpc0 instances. */ + NVGPU_ECC_ERR("l1_tag_ecc_corrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("l1_tag_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("cbu_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_r, + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("lrf_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_r, + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("l1_data_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_r, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("icache_l0_data_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_icache_ecc_control_r, + gr_pri_gpc0_tpc0_sm_icache_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc sm_err_desc; /* File-static descriptor that gv11b_gr_intr_get_sm_err_desc() fills in and returns. */ + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_sm_err_desc(struct gk20a *g) /* Points the file-static descriptor at sm_ecc_err_desc, stores the table element count in info_size, and returns the descriptor address. g is unused. */ +{ + sm_err_desc.info_ptr = sm_ecc_err_desc; + sm_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(sm_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &sm_err_desc; +} + +int gv11b_gr_intr_inject_sm_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info 
*err, + u32 error_info) /* error_info bits [15:8] select the GPC and bits [7:0] the TPC. Writes err->get_reg_val(1U) to err->get_reg_addr() + gpc * GPU_LIT_GPC_STRIDE + tpc * GPU_LIT_TPC_IN_GPC_STRIDE. NOTE(review): gpc and tpc are not range-checked here. Always returns 0. */ +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + unsigned int tpc_stride = + nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFF00U) >> 8U; + unsigned int tpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride + + tpc * tpc_stride; + + nvgpu_info(g, "Injecting SM fault %s for gpc: %d, tpc: %d", + err->name, gpc, tpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info mmu_ecc_err_desc[] = { /* GPC MMU (L1 TLB) ECC error-injection table; same NVGPU_ECC_ERR entry layout as fecs_ecc_err_desc. */ + NVGPU_ECC_ERR("l1tlb_sa_data_ecc_uncorrected", + gv11b_gr_intr_inject_mmu_ecc_error, + gr_gpc0_mmu_l1tlb_ecc_control_r, + gr_gpc0_mmu_l1tlb_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc mmu_err_desc; /* File-static descriptor that gv11b_gr_intr_get_mmu_err_desc() fills in and returns. */ + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_mmu_err_desc(struct gk20a *g) /* Points the file-static descriptor at mmu_ecc_err_desc, stores the table element count in info_size, and returns the descriptor address. g is unused. */ +{ + mmu_err_desc.info_ptr = mmu_ecc_err_desc; + mmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(mmu_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &mmu_err_desc; +} + +int gv11b_gr_intr_inject_mmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) /* error_info bits [7:0] select the GPC. Writes err->get_reg_val(1U) to err->get_reg_addr() + gpc * GPU_LIT_GPC_STRIDE. NOTE(review): gpc is not range-checked here. Always returns 0. */ +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride; + + nvgpu_info(g, "Injecting MMU fault %s for gpc: %d", err->name, gpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info gcc_ecc_err_desc[] = { /* GCC L1.5 ECC error-injection table; same NVGPU_ECC_ERR entry layout as fecs_ecc_err_desc. */ + NVGPU_ECC_ERR("l15_ecc_uncorrected", + gv11b_gr_intr_inject_gcc_ecc_error, + gr_pri_gpc0_gcc_l15_ecc_control_r, + gr_pri_gpc0_gcc_l15_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc gcc_err_desc; /* File-static descriptor that gv11b_gr_intr_get_gcc_err_desc() fills in and returns. */ + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gcc_err_desc(struct gk20a *g) /* Points the file-static descriptor at gcc_ecc_err_desc, stores the table element count in info_size, and returns the descriptor address. g is unused. */ +{ + gcc_err_desc.info_ptr = 
gcc_ecc_err_desc; + gcc_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(gcc_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &gcc_err_desc; +} + +int gv11b_gr_intr_inject_gcc_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) /* error_info bits [7:0] select the GPC. Writes err->get_reg_val(1U) to err->get_reg_addr() + gpc * GPU_LIT_GPC_STRIDE. NOTE(review): gpc is not range-checked here. Always returns 0. */ +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, + GPU_LIT_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + + gpc * gpc_stride; + + nvgpu_info(g, "Injecting GCC fault %s for gpc: %d", err->name, gpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + void gv11b_ecc_detect_enabled_units(struct gk20a *g) { bool opt_ecc_en = g->ops.fuse.is_opt_ecc_enable(g); diff --git a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h index 40cc9ac2e..3c868cdec 100644 --- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h @@ -23,9 +23,34 @@ #ifndef NVGPU_ECC_GV11B_H #define NVGPU_ECC_GV11B_H +#include +#include + struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; void gv11b_ecc_detect_enabled_units(struct gk20a *g); int gv11b_ecc_init(struct gk20a *g); +int gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_fecs_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_gpccs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gpccs_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_sm_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_sm_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_mmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * 
+gv11b_gr_intr_get_mmu_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_gcc_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gcc_err_desc(struct gk20a *g); #endif /* NVGPU_ECC_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c index 913633774..6be068e2e 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c @@ -154,7 +154,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_ECC_CORRECTED, 0, @@ -162,7 +162,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0, @@ -170,7 +170,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0, @@ -195,7 +195,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, 
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, @@ -203,7 +203,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, @@ -211,7 +211,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, @@ -293,7 +293,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += lrf_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_LRF_ECC_CORRECTED, 0, @@ -314,7 +314,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += lrf_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_LRF_ECC_UNCORRECTED, 0, @@ -387,7 +387,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += cbu_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_CBU_ECC_CORRECTED, @@ -408,7 +408,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += cbu_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) 
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_CBU_ECC_UNCORRECTED, @@ -477,7 +477,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += l1_data_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_DATA_ECC_CORRECTED, @@ -498,7 +498,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += l1_data_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_DATA_ECC_UNCORRECTED, @@ -575,7 +575,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, 0); if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED, @@ -583,7 +583,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED, @@ -591,7 +591,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED, @@ -599,7 +599,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & 
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED, @@ -623,7 +623,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, 0); if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, @@ -631,7 +631,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, @@ -639,7 +639,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, @@ -647,7 +647,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c index e0ed351c1..8d4f1f4a3 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c @@ -50,28 +50,28 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a 
*g) fecs_ecc_status.uncorrected_delta; if (fecs_ecc_status.imem_corrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_IMEM_ECC_CORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if (fecs_ecc_status.imem_uncorrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if (fecs_ecc_status.dmem_corrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_DMEM_ECC_CORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if (fecs_ecc_status.dmem_uncorrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); @@ -320,7 +320,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc, ); } *corrected_err += gcc_l15_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GCC, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc, GPU_GCC_L15_ECC_CORRECTED, 0, *corrected_err); nvgpu_writel(g, @@ -342,7 +342,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc, ); } *uncorrected_err += gcc_l15_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GCC, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc, GPU_GCC_L15_ECC_UNCORRECTED, 0, 
*uncorrected_err); nvgpu_writel(g, @@ -430,7 +430,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED, 0, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); @@ -438,7 +438,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, 0, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); @@ -446,7 +446,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED, 0, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); @@ -454,7 +454,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, 0, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); @@ -537,28 +537,28 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, 
NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, ecc_addr, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED, ecc_addr, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 119299494..7e02cb401 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -64,6 +64,7 @@ #include "hal/fb/fb_gv11b.h" #include "hal/fb/fb_mmu_fault_gv11b.h" #include "hal/fb/intr/fb_intr_gv11b.h" +#include "hal/fb/intr/fb_intr_ecc_gv11b.h" #include "hal/fuse/fuse_gm20b.h" #include "hal/fuse/fuse_gp10b.h" #include "hal/ptimer/ptimer_gk20a.h" @@ -179,6 +180,8 @@ static void gv11b_init_gpu_characteristics(struct gk20a *g) static const struct gpu_ops gv11b_ops = { .ltc = { + .get_ltc_err_desc = + gv11b_ltc_get_err_desc, .determine_L2_size_bytes = gp10b_determine_L2_size_bytes, #ifdef NVGPU_GRAPHICS .set_zbc_s_entry 
= gv11b_ltc_set_zbc_stencil_entry, @@ -295,6 +298,16 @@ static const struct gpu_ops gv11b_ops = { .ecc = { .detect = gv11b_ecc_detect_enabled_units, .init = gv11b_ecc_init, + .get_mmu_err_desc = + gv11b_gr_intr_get_mmu_err_desc, + .get_gcc_err_desc = + gv11b_gr_intr_get_gcc_err_desc, + .get_sm_err_desc = + gv11b_gr_intr_get_sm_err_desc, + .get_gpccs_err_desc = + gv11b_gr_intr_get_gpccs_err_desc, + .get_fecs_err_desc = + gv11b_gr_intr_get_fecs_err_desc, }, .ctxsw_prog = { .hw_get_fecs_header_size = @@ -651,6 +664,8 @@ static const struct gpu_ops gv11b_ops = { .is_valid_compute = gv11b_class_is_valid_compute, }, .fb = { + .get_hubmmu_err_desc = + gv11b_fb_intr_get_hubmmu_err_desc, .init_hw = gv11b_fb_init_hw, .init_fs_state = gv11b_fb_init_fs_state, .cbc_configure = gv11b_fb_cbc_configure, @@ -998,6 +1013,8 @@ static const struct gpu_ops gv11b_ops = { .elcg_init_idle_filters = gv11b_elcg_init_idle_filters, }, .pmu = { + .get_pmu_err_desc = + gv11b_pmu_intr_get_err_desc, /* * Basic init ops are must, as PMU engine used by ACR to * load & bootstrap GR LS falcons without LS PMU, remaining diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c index 85c87ae16..2381ead6d 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c @@ -154,7 +154,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_RSTG_ECC_CORRECTED, ecc_addr, @@ -163,7 +163,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, 
GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr, @@ -172,7 +172,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_TSTG_ECC_CORRECTED, ecc_addr, @@ -181,7 +181,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, @@ -193,13 +193,13 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) { if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr, g->ecc.ltc.ecc_sec_count[ltc][slice].counter); } else { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_BE_ECC_CORRECTED, ecc_addr, @@ -209,13 +209,13 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) { if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr, g->ecc.ltc.ecc_ded_count[ltc][slice].counter); } else { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr, diff --git 
a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c index 9ba7b8479..b656bd05c 100644 --- a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c @@ -33,6 +33,46 @@ #include +static struct nvgpu_hw_err_inject_info ltc_ecc_err_desc[] = { + NVGPU_ECC_ERR("cache_rstg_ecc_corrected", + gv11b_ltc_inject_ecc_error, + ltc_ltc0_lts0_l1_cache_ecc_control_r, + ltc_ltc0_lts0_l1_cache_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("cache_rstg_ecc_uncorrected", + gv11b_ltc_inject_ecc_error, + ltc_ltc0_lts0_l1_cache_ecc_control_r, + ltc_ltc0_lts0_l1_cache_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc ltc_err_desc; + +struct nvgpu_hw_err_inject_info_desc * gv11b_ltc_get_err_desc(struct gk20a *g) +{ + ltc_err_desc.info_ptr = ltc_ecc_err_desc; + ltc_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(ltc_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return <c_err_desc; +} + +int gv11b_ltc_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + unsigned int ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE); + unsigned int lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE); + unsigned int ltc = (error_info & 0xFF00U) >> 8U; + unsigned int lts = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + ltc * ltc_stride + + lts * lts_stride; + + nvgpu_info(g, "Injecting LTC fault %s for ltc: %d, lts: %d", + err->name, ltc, lts); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + #ifdef NVGPU_GRAPHICS /* * Sets the ZBC stencil for the passed index. 
diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h index 5e2e73584..ea66e066b 100644 --- a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h @@ -22,7 +22,13 @@ #ifndef LTC_GV11B_H #define LTC_GV11B_H + +#include +#include + struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; #ifdef NVGPU_GRAPHICS void gv11b_ltc_set_zbc_stencil_entry(struct gk20a *g, @@ -30,5 +36,8 @@ void gv11b_ltc_set_zbc_stencil_entry(struct gk20a *g, u32 index); #endif /* NVGPU_GRAPHICS */ void gv11b_ltc_init_fs_state(struct gk20a *g); +struct nvgpu_hw_err_inject_info_desc * gv11b_ltc_get_err_desc(struct gk20a *g); +int gv11b_ltc_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); #endif diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c index ce76202e6..38297e346 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c @@ -39,6 +39,39 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr); #define ALIGN_4KB 12 +static struct nvgpu_hw_err_inject_info pmu_ecc_err_desc[] = { + NVGPU_ECC_ERR("falcon_imem_ecc_corrected", + gv11b_pmu_inject_ecc_error, + pwr_pmu_falcon_ecc_control_r, + pwr_pmu_falcon_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected", + gv11b_pmu_inject_ecc_error, + pwr_pmu_falcon_ecc_control_r, + pwr_pmu_falcon_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc pmu_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_pmu_intr_get_err_desc(struct gk20a *g) +{ + pmu_err_desc.info_ptr = pmu_ecc_err_desc; + pmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(pmu_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &pmu_err_desc; +} + +int gv11b_pmu_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) 
+{ + nvgpu_info(g, "Injecting PMU fault %s", err->name); + nvgpu_writel(g, err->get_reg_addr(), err->get_reg_val(1U)); + + return 0; +} + #ifdef NVGPU_FEATURE_LS_PMU /* PROD settings for ELPG sequencing registers*/ static struct pg_init_sequence_list _pginitseq_gv11b[] = { @@ -443,7 +476,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_IMEM_ECC_CORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); @@ -451,7 +484,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); @@ -460,7 +493,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_DMEM_ECC_CORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); @@ -468,7 +501,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h index 9acec46da..2676a4a33 
100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h @@ -23,9 +23,12 @@ #ifndef PMU_GV11B_H #define PMU_GV11B_H +#include #include struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; bool gv11b_pmu_is_debug_mode_en(struct gk20a *g); void gv11b_pmu_flcn_setup_boot_config(struct gk20a *g); @@ -43,5 +46,8 @@ void gv11b_clear_pmu_bar0_host_err_status(struct gk20a *g); int gv11b_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status, u32 *etype); bool gv11b_pmu_validate_mem_integrity(struct gk20a *g); +struct nvgpu_hw_err_inject_info_desc * gv11b_pmu_intr_get_err_desc(struct gk20a *g); +int gv11b_pmu_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); #endif /* PMU_GV11B_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index bc72fb0db..d99a86719 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -214,6 +214,8 @@ enum nvgpu_event_id_type { struct gpu_ops { struct { u64 (*determine_L2_size_bytes)(struct gk20a *gk20a); + struct nvgpu_hw_err_inject_info_desc * (*get_ltc_err_desc) + (struct gk20a *g); #ifdef NVGPU_GRAPHICS void (*set_zbc_color_entry)(struct gk20a *g, u32 *color_val_l2, @@ -411,6 +413,16 @@ struct gpu_ops { struct { void (*detect)(struct gk20a *g); int (*init)(struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_mmu_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_gcc_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_sm_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_gpccs_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_fecs_err_desc) + (struct gk20a *g); } ecc; struct { u32 (*hw_get_fecs_header_size)(void); @@ -847,6 +859,8 @@ struct gpu_ops { } gpu_class; struct { + struct nvgpu_hw_err_inject_info_desc * 
(*get_hubmmu_err_desc) + (struct gk20a *g); void (*init_hw)(struct gk20a *g); void (*cbc_configure)(struct gk20a *g, struct nvgpu_cbc *cbc); void (*init_fs_state)(struct gk20a *g); @@ -1317,6 +1331,8 @@ struct gpu_ops { u32 (*idle_slowdown_disable)(struct gk20a *g); } therm; struct { + struct nvgpu_hw_err_inject_info_desc * (*get_pmu_err_desc) + (struct gk20a *g); bool (*is_pmu_supported)(struct gk20a *g); u32 (*falcon_base_addr)(void); /* reset */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 5f46b76d7..8ae307675 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -175,6 +175,27 @@ struct gr_err_info { struct gr_exception_info *exception_info; }; +#define NVGPU_ECC_ERR(err_name, inject_fn, addr, val) \ +{ \ + .name = (err_name), \ + .inject_hw_fault = (inject_fn), \ + .get_reg_addr = (addr), \ + .get_reg_val = (val) \ +} + +struct nvgpu_hw_err_inject_info { + const char *name; + int (*inject_hw_fault)(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 err_info); + u32 (*get_reg_addr)(void); + u32 (*get_reg_val)(u32 val); +}; + +struct nvgpu_hw_err_inject_info_desc { + struct nvgpu_hw_err_inject_info *info_ptr; + u32 info_size; +}; + /* Functions to report errors to 3LSS */ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_id, u32 intr_info); @@ -182,7 +203,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_id, u32 intr_info); -int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst, +int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_type, u64 err_addr, u64 err_count); int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, diff --git a/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c b/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c index 9249e5988..caf73778e 100644 --- 
a/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c +++ b/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c @@ -30,7 +30,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, return 0; } -int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst, +int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_type, u64 err_addr, u64 err_count) { return 0; diff --git a/drivers/gpu/nvgpu/os/posix/stubs.c b/drivers/gpu/nvgpu/os/posix/stubs.c index a771965ce..5956d0539 100644 --- a/drivers/gpu/nvgpu/os/posix/stubs.c +++ b/drivers/gpu/nvgpu/os/posix/stubs.c @@ -50,7 +50,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, return 0; } -int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst, +int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_type, u64 err_addr, u64 err_count) { return 0;