From 05ed37ae3a7e4c389656f7374d2d0ed845043126 Mon Sep 17 00:00:00 2001 From: Rajesh Devaraj Date: Mon, 6 May 2019 15:46:40 +0530 Subject: [PATCH] gpu: nvgpu: remove usage of hw headers from SDL This patch does the following: (1) Removes the usage of hw headers in the SDL unit. For this purpose, it moves the initialization of the errors that can be injected with hw support, along with the error injection functions, into the corresponding HAL units. Further, it passes the required information to SDL via the HAL layer. (2) Renames (i) PWR to PMU, (ii) nvgpu_report_ecc_parity_err to nvgpu_report_ecc_err. Jira NVGPU-3235 Change-Id: I69290af78c09fbb5b792058e7bc6cc8b6ba340c9 Signed-off-by: Rajesh Devaraj Reviewed-on: https://git-master.nvidia.com/r/2112837 Reviewed-by: Raghuram Kothakota Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-misra GVS: Gerrit_Virtual_Submit Reviewed-by: Vaibhav Kachore Reviewed-by: mobile promotions Tested-by: mobile promotions --- .../gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c | 56 ++++- .../gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h | 9 + drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c | 197 ++++++++++++++++++ drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h | 25 +++ drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c | 40 ++-- drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c | 28 +-- drivers/gpu/nvgpu/hal/init/hal_gv11b.c | 17 ++ .../gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c | 16 +- drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c | 40 ++++ drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h | 9 + drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c | 41 +++- drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h | 6 + drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 16 ++ drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 23 +- drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c | 2 +- drivers/gpu/nvgpu/os/posix/stubs.c | 2 +- 16 files changed, 470 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c index be1d26126..8ad2f9a42 100644 --- a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.c @@ -31,6 +31,46 @@ #include +static struct nvgpu_hw_err_inject_info hubmmu_ecc_err_desc[] = { + NVGPU_ECC_ERR("hubmmu_l2tlb_sa_data_ecc_uncorrected", + gv11b_fb_intr_inject_hubmmu_ecc_error, + fb_mmu_l2tlb_ecc_control_r, + fb_mmu_l2tlb_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("hubmmu_tlb_sa_data_ecc_uncorrected", + gv11b_fb_intr_inject_hubmmu_ecc_error, + fb_mmu_hubtlb_ecc_control_r, + fb_mmu_hubtlb_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("hubmmu_pte_data_ecc_uncorrected", + gv11b_fb_intr_inject_hubmmu_ecc_error, + fb_mmu_fillunit_ecc_control_r, + fb_mmu_fillunit_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc hubmmu_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g) +{ + hubmmu_err_desc.info_ptr = hubmmu_ecc_err_desc; + hubmmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(hubmmu_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &hubmmu_err_desc; +} + +int gv11b_fb_intr_inject_hubmmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, + u32 error_info) +{ + unsigned int reg_addr = err->get_reg_addr(); + + nvgpu_info(g, "Injecting HUBMMU fault %s", err->name); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status) { u32 ecc_addr, corrected_cnt, uncorrected_cnt; @@ -83,7 +123,7 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct
gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_l2tlb_ecc_status_corrected_err_l2tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_l2tlb_ecc_corrected_err_count[0].counter); @@ -92,7 +132,7 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_l2tlb_ecc_status_uncorrected_err_l2tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter); @@ -161,7 +201,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_hubtlb_ecc_status_corrected_err_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_hubtlb_ecc_corrected_err_count[0].counter); @@ -169,7 +209,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter); @@ -239,7 +279,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pte_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PTE_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter); @@ -248,7 +288,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); @@ -256,7 +296,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) } if ((ecc_status & fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PDE0_DATA_ECC_CORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter); @@ -265,7 +305,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status) if ((ecc_status & fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, 0, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED, ecc_addr, g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter); diff --git a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h index 9ecdce587..bc73a6148 100644 --- a/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/fb/intr/fb_intr_ecc_gv11b.h @@ -25,8 +25,17 @@ #ifndef 
NVGPU_FB_INTR_ECC_GV11B_H #define NVGPU_FB_INTR_ECC_GV11B_H +#include +#include + struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; void gv11b_fb_intr_handle_ecc(struct gk20a *g); +struct nvgpu_hw_err_inject_info_desc * + gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g); +int gv11b_fb_intr_inject_hubmmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); #endif /* NVGPU_FB_INTR_ECC_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c index 0c14be62c..d708c23c5 100644 --- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.c @@ -28,6 +28,203 @@ #include "ecc_gv11b.h" +static struct nvgpu_hw_err_inject_info fecs_ecc_err_desc[] = { + NVGPU_ECC_ERR("falcon_imem_ecc_corrected", + gv11b_gr_intr_inject_fecs_ecc_error, + gr_fecs_falcon_ecc_control_r, + gr_fecs_falcon_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected", + gv11b_gr_intr_inject_fecs_ecc_error, + gr_fecs_falcon_ecc_control_r, + gr_fecs_falcon_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc fecs_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_fecs_err_desc(struct gk20a *g) +{ + fecs_err_desc.info_ptr = fecs_ecc_err_desc; + fecs_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(fecs_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &fecs_err_desc; +} + +int gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + nvgpu_info(g, "Injecting FECS fault %s", err->name); + nvgpu_writel(g, err->get_reg_addr(), err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info gpccs_ecc_err_desc[] = { + NVGPU_ECC_ERR("falcon_imem_ecc_corrected", + gv11b_gr_intr_inject_gpccs_ecc_error, + gr_gpccs_falcon_ecc_control_r, + gr_gpccs_falcon_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected", + gv11b_gr_intr_inject_gpccs_ecc_error, + gr_gpccs_falcon_ecc_control_r, + gr_gpccs_falcon_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc gpccs_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gpccs_err_desc(struct gk20a *g) +{ + gpccs_err_desc.info_ptr = gpccs_ecc_err_desc; + gpccs_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(gpccs_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &gpccs_err_desc; +} + +int gv11b_gr_intr_inject_gpccs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride; + + nvgpu_info(g, "Injecting GPCCS fault %s for gpc: %d", err->name, gpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info sm_ecc_err_desc[] = { + NVGPU_ECC_ERR("l1_tag_ecc_corrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("l1_tag_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("cbu_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + 
gr_pri_gpc0_tpc0_sm_cbu_ecc_control_r, + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("lrf_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_r, + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("l1_data_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_r, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_inject_uncorrected_err_f), + NVGPU_ECC_ERR("icache_l0_data_ecc_uncorrected", + gv11b_gr_intr_inject_sm_ecc_error, + gr_pri_gpc0_tpc0_sm_icache_ecc_control_r, + gr_pri_gpc0_tpc0_sm_icache_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc sm_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_sm_err_desc(struct gk20a *g) +{ + sm_err_desc.info_ptr = sm_ecc_err_desc; + sm_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(sm_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &sm_err_desc; +} + +int gv11b_gr_intr_inject_sm_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, + u32 error_info) +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + unsigned int tpc_stride = + nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFF00U) >> 8U; + unsigned int tpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride + + tpc * tpc_stride; + + nvgpu_info(g, "Injecting SM fault %s for gpc: %d, tpc: %d", + err->name, gpc, tpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info mmu_ecc_err_desc[] = { + NVGPU_ECC_ERR("l1tlb_sa_data_ecc_uncorrected", + gv11b_gr_intr_inject_mmu_ecc_error, + gr_gpc0_mmu_l1tlb_ecc_control_r, + gr_gpc0_mmu_l1tlb_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc mmu_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_mmu_err_desc(struct gk20a *g) +{ + mmu_err_desc.info_ptr = mmu_ecc_err_desc; + mmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(mmu_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &mmu_err_desc; +} + +int gv11b_gr_intr_inject_mmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride; + + nvgpu_info(g, "Injecting MMU fault %s for gpc: %d", err->name, gpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + +static struct nvgpu_hw_err_inject_info gcc_ecc_err_desc[] = { + NVGPU_ECC_ERR("l15_ecc_uncorrected", + gv11b_gr_intr_inject_gcc_ecc_error, + gr_pri_gpc0_gcc_l15_ecc_control_r, + gr_pri_gpc0_gcc_l15_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc gcc_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gcc_err_desc(struct gk20a *g) +{ + gcc_err_desc.info_ptr = gcc_ecc_err_desc; + gcc_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(gcc_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &gcc_err_desc; +} + +int gv11b_gr_intr_inject_gcc_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + unsigned int gpc_stride = nvgpu_get_litter_value(g, + GPU_LIT_GPC_STRIDE); + unsigned int gpc = (error_info & 0xFFU); + unsigned int 
reg_addr = err->get_reg_addr() + + gpc * gpc_stride; + + nvgpu_info(g, "Injecting GCC fault %s for gpc: %d", err->name, gpc); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + void gv11b_ecc_detect_enabled_units(struct gk20a *g) { bool opt_ecc_en = g->ops.fuse.is_opt_ecc_enable(g); diff --git a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h index 40cc9ac2e..3c868cdec 100644 --- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h @@ -23,9 +23,34 @@ #ifndef NVGPU_ECC_GV11B_H #define NVGPU_ECC_GV11B_H +#include +#include + struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; void gv11b_ecc_detect_enabled_units(struct gk20a *g); int gv11b_ecc_init(struct gk20a *g); +int gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_fecs_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_gpccs_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gpccs_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_sm_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_sm_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_mmu_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_mmu_err_desc(struct gk20a *g); +int gv11b_gr_intr_inject_gcc_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); +struct nvgpu_hw_err_inject_info_desc * +gv11b_gr_intr_get_gcc_err_desc(struct gk20a *g); #endif /* NVGPU_ECC_GV11B_H */ diff --git a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c index 913633774..6be068e2e 100644 --- a/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/gr/gr_gv11b.c @@ -154,7 +154,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_ECC_CORRECTED, 0, @@ -162,7 +162,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0, @@ -170,7 +170,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0, @@ -195,7 +195,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, if ((l1_tag_ecc_status & (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, 
(gpc << 8) | tpc, GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, @@ -203,7 +203,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, @@ -211,7 +211,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((l1_tag_ecc_status & gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, @@ -293,7 +293,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter += lrf_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_LRF_ECC_CORRECTED, 0, @@ -314,7 +314,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += lrf_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_LRF_ECC_UNCORRECTED, 0, @@ -387,7 +387,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter += cbu_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_CBU_ECC_CORRECTED, @@ -408,7 +408,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += cbu_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_CBU_ECC_UNCORRECTED, @@ -477,7 +477,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter += l1_data_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_DATA_ECC_CORRECTED, @@ -498,7 +498,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, } g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += l1_data_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_L1_DATA_ECC_UNCORRECTED, @@ -575,7 +575,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, 0); if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED, @@ -583,7 +583,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED, @@ -591,7 +591,7 @@ static void 
gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED, @@ -599,7 +599,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED, @@ -623,7 +623,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, 0); if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, @@ -631,7 +631,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, @@ -639,7 +639,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, @@ -647,7 +647,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, } if ((icache_ecc_status & gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, (gpc << 8) | tpc, GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c index e0ed351c1..8d4f1f4a3 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.c @@ -50,28 +50,28 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g) fecs_ecc_status.uncorrected_delta; if (fecs_ecc_status.imem_corrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_IMEM_ECC_CORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if (fecs_ecc_status.imem_uncorrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if (fecs_ecc_status.dmem_corrected_err) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_DMEM_ECC_CORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if (fecs_ecc_status.dmem_uncorrected_err) { - 
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, fecs_ecc_status.ecc_addr, g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); @@ -320,7 +320,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc, ); } *corrected_err += gcc_l15_corrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GCC, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc, GPU_GCC_L15_ECC_CORRECTED, 0, *corrected_err); nvgpu_writel(g, @@ -342,7 +342,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc, ); } *uncorrected_err += gcc_l15_uncorrected_err_count_delta; - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GCC, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc, GPU_GCC_L15_ECC_UNCORRECTED, 0, *uncorrected_err); nvgpu_writel(g, @@ -430,7 +430,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED, 0, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); @@ -438,7 +438,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED, 0, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); @@ -446,7 +446,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED, 0, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); @@ -454,7 +454,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc, GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED, 0, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); @@ -537,28 +537,28 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED, ecc_addr, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, 
NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED, ecc_addr, (u32)*corrected_err); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected"); } if ((ecc_status & gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS, gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, (u32)*uncorrected_err); nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 119299494..7e02cb401 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -64,6 +64,7 @@ #include "hal/fb/fb_gv11b.h" #include "hal/fb/fb_mmu_fault_gv11b.h" #include "hal/fb/intr/fb_intr_gv11b.h" +#include "hal/fb/intr/fb_intr_ecc_gv11b.h" #include "hal/fuse/fuse_gm20b.h" #include "hal/fuse/fuse_gp10b.h" #include "hal/ptimer/ptimer_gk20a.h" @@ -179,6 +180,8 @@ static void gv11b_init_gpu_characteristics(struct gk20a *g) static const struct gpu_ops gv11b_ops = { .ltc = { + .get_ltc_err_desc = + gv11b_ltc_get_err_desc, .determine_L2_size_bytes = gp10b_determine_L2_size_bytes, #ifdef NVGPU_GRAPHICS .set_zbc_s_entry = gv11b_ltc_set_zbc_stencil_entry, @@ -295,6 +298,16 @@ static const struct gpu_ops gv11b_ops = { .ecc = { .detect = gv11b_ecc_detect_enabled_units, .init = gv11b_ecc_init, + .get_mmu_err_desc = + gv11b_gr_intr_get_mmu_err_desc, + .get_gcc_err_desc = + gv11b_gr_intr_get_gcc_err_desc, + .get_sm_err_desc = + gv11b_gr_intr_get_sm_err_desc, + .get_gpccs_err_desc = + gv11b_gr_intr_get_gpccs_err_desc, + .get_fecs_err_desc = + gv11b_gr_intr_get_fecs_err_desc, }, .ctxsw_prog = { .hw_get_fecs_header_size = @@ -651,6 +664,8 @@ static const struct gpu_ops gv11b_ops = { .is_valid_compute = gv11b_class_is_valid_compute, }, .fb = { + .get_hubmmu_err_desc = + gv11b_fb_intr_get_hubmmu_err_desc, .init_hw = gv11b_fb_init_hw, .init_fs_state = gv11b_fb_init_fs_state, .cbc_configure = gv11b_fb_cbc_configure, @@ -998,6 +1013,8 @@ static const struct gpu_ops gv11b_ops = { .elcg_init_idle_filters = gv11b_elcg_init_idle_filters, }, .pmu = { + .get_pmu_err_desc = + gv11b_pmu_intr_get_err_desc, /* * Basic init ops are must, as PMU engine used by ACR to * load & bootstrap GR LS falcons without LS PMU, remaining diff --git a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c index 85c87ae16..2381ead6d 100644 --- a/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c +++ b/drivers/gpu/nvgpu/hal/ltc/intr/ltc_intr_gv11b.c @@ -154,7 +154,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_RSTG_ECC_CORRECTED, ecc_addr, @@ -163,7 +163,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr, @@ -172,7 +172,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, 
+ (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_TSTG_ECC_CORRECTED, ecc_addr, @@ -181,7 +181,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr, @@ -193,13 +193,13 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) { if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr, g->ecc.ltc.ecc_sec_count[ltc][slice].counter); } else { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_BE_ECC_CORRECTED, ecc_addr, @@ -209,13 +209,13 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g, } if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) { if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr, g->ecc.ltc.ecc_ded_count[ltc][slice].counter); } else { - (void) nvgpu_report_ecc_parity_err(g, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_LTC, (ltc << 8U) | slice, GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr, diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c index 9ba7b8479..b656bd05c 100644 --- a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c +++ b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.c @@ -33,6 +33,46 @@ #include +static struct nvgpu_hw_err_inject_info ltc_ecc_err_desc[] = { + NVGPU_ECC_ERR("cache_rstg_ecc_corrected", + gv11b_ltc_inject_ecc_error, + ltc_ltc0_lts0_l1_cache_ecc_control_r, + ltc_ltc0_lts0_l1_cache_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("cache_rstg_ecc_uncorrected", + gv11b_ltc_inject_ecc_error, + ltc_ltc0_lts0_l1_cache_ecc_control_r, + ltc_ltc0_lts0_l1_cache_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc ltc_err_desc; + +struct nvgpu_hw_err_inject_info_desc * gv11b_ltc_get_err_desc(struct gk20a *g) +{ + ltc_err_desc.info_ptr = ltc_ecc_err_desc; + ltc_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(ltc_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &ltc_err_desc; +} + +int gv11b_ltc_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + unsigned int ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE); + unsigned int lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE); + unsigned int ltc = (error_info & 0xFF00U) >> 8U; + unsigned int lts = (error_info & 0xFFU); + unsigned int reg_addr = err->get_reg_addr() + ltc * ltc_stride + + lts * lts_stride; + + nvgpu_info(g, "Injecting LTC fault %s for ltc: %d, lts: %d", + err->name, ltc, lts); + nvgpu_writel(g, reg_addr, err->get_reg_val(1U)); + + return 0; +} + #ifdef NVGPU_GRAPHICS /* * Sets the ZBC stencil for the passed index.
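The inject functions added in this change all share one convention: error_info packs the target instance, with the outer unit index (gpc or ltc) in bits 15:8 and the inner index (tpc or lts) in bits 7:0, and the register address is formed by offsetting the unit-0 register with the per-instance strides returned by nvgpu_get_litter_value(). Below is a minimal standalone sketch of that packing; the helper names and the base/stride values are made up purely for illustration and are not part of this change.

/* Illustrative only: encode/decode of error_info as used by the HAL inject functions. */
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_error_info(uint32_t outer, uint32_t inner)
{
	/* outer (gpc/ltc) in bits 15:8, inner (tpc/lts) in bits 7:0 */
	return ((outer & 0xFFU) << 8U) | (inner & 0xFFU);
}

static uint32_t instance_reg_addr(uint32_t unit0_reg, uint32_t error_info,
		uint32_t outer_stride, uint32_t inner_stride)
{
	uint32_t outer = (error_info & 0xFF00U) >> 8U;
	uint32_t inner = error_info & 0xFFU;

	/* same arithmetic as gv11b_ltc_inject_ecc_error()/gv11b_gr_intr_inject_sm_ecc_error() */
	return unit0_reg + (outer * outer_stride) + (inner * inner_stride);
}

int main(void)
{
	uint32_t info = pack_error_info(1U, 2U); /* e.g. ltc 1, lts 2 */

	/* 0x140000/0x2000/0x200 are example values, not real gv11b offsets */
	printf("error_info=0x%x reg=0x%x\n", (unsigned int)info,
		(unsigned int)instance_reg_addr(0x140000U, info, 0x2000U, 0x200U));
	return 0;
}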
diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h index 5e2e73584..ea66e066b 100644 --- a/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/ltc/ltc_gv11b.h @@ -22,7 +22,13 @@ #ifndef LTC_GV11B_H #define LTC_GV11B_H + +#include +#include + struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; #ifdef NVGPU_GRAPHICS void gv11b_ltc_set_zbc_stencil_entry(struct gk20a *g, @@ -30,5 +36,8 @@ void gv11b_ltc_set_zbc_stencil_entry(struct gk20a *g, u32 index); #endif /* NVGPU_GRAPHICS */ void gv11b_ltc_init_fs_state(struct gk20a *g); +struct nvgpu_hw_err_inject_info_desc * gv11b_ltc_get_err_desc(struct gk20a *g); +int gv11b_ltc_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); #endif diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c index ce76202e6..38297e346 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.c @@ -39,6 +39,39 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr); #define ALIGN_4KB 12 +static struct nvgpu_hw_err_inject_info pmu_ecc_err_desc[] = { + NVGPU_ECC_ERR("falcon_imem_ecc_corrected", + gv11b_pmu_inject_ecc_error, + pwr_pmu_falcon_ecc_control_r, + pwr_pmu_falcon_ecc_control_inject_corrected_err_f), + NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected", + gv11b_pmu_inject_ecc_error, + pwr_pmu_falcon_ecc_control_r, + pwr_pmu_falcon_ecc_control_inject_uncorrected_err_f), +}; + +static struct nvgpu_hw_err_inject_info_desc pmu_err_desc; + +struct nvgpu_hw_err_inject_info_desc * +gv11b_pmu_intr_get_err_desc(struct gk20a *g) +{ + pmu_err_desc.info_ptr = pmu_ecc_err_desc; + pmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32( + sizeof(pmu_ecc_err_desc) / + sizeof(struct nvgpu_hw_err_inject_info)); + + return &pmu_err_desc; +} + +int gv11b_pmu_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info) +{ + nvgpu_info(g, "Injecting PMU fault %s", err->name); + nvgpu_writel(g, err->get_reg_addr(), err->get_reg_val(1U)); + + return 0; +} + #ifdef NVGPU_FEATURE_LS_PMU /* PROD settings for ELPG sequencing registers*/ static struct pg_init_sequence_list _pginitseq_gv11b[] = { @@ -443,7 +476,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_IMEM_ECC_CORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); @@ -451,7 +484,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); @@ -460,7 +493,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_DMEM_ECC_CORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); @@ -468,7 +501,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 
ecc_status, u32 ecc_addr) } if ((ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) { - (void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0, + (void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, ecc_addr, g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); diff --git a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h index 9acec46da..2676a4a33 100644 --- a/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h +++ b/drivers/gpu/nvgpu/hal/pmu/pmu_gv11b.h @@ -23,9 +23,12 @@ #ifndef PMU_GV11B_H #define PMU_GV11B_H +#include #include struct gk20a; +struct nvgpu_hw_err_inject_info; +struct nvgpu_hw_err_inject_info_desc; bool gv11b_pmu_is_debug_mode_en(struct gk20a *g); void gv11b_pmu_flcn_setup_boot_config(struct gk20a *g); @@ -43,5 +46,8 @@ void gv11b_clear_pmu_bar0_host_err_status(struct gk20a *g); int gv11b_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status, u32 *etype); bool gv11b_pmu_validate_mem_integrity(struct gk20a *g); +struct nvgpu_hw_err_inject_info_desc * gv11b_pmu_intr_get_err_desc(struct gk20a *g); +int gv11b_pmu_inject_ecc_error(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 error_info); #endif /* PMU_GV11B_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index bc72fb0db..d99a86719 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -214,6 +214,8 @@ enum nvgpu_event_id_type { struct gpu_ops { struct { u64 (*determine_L2_size_bytes)(struct gk20a *gk20a); + struct nvgpu_hw_err_inject_info_desc * (*get_ltc_err_desc) + (struct gk20a *g); #ifdef NVGPU_GRAPHICS void (*set_zbc_color_entry)(struct gk20a *g, u32 *color_val_l2, @@ -411,6 +413,16 @@ struct gpu_ops { struct { void (*detect)(struct gk20a *g); int (*init)(struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_mmu_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_gcc_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_sm_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_gpccs_err_desc) + (struct gk20a *g); + struct nvgpu_hw_err_inject_info_desc * (*get_fecs_err_desc) + (struct gk20a *g); } ecc; struct { u32 (*hw_get_fecs_header_size)(void); @@ -847,6 +859,8 @@ struct gpu_ops { } gpu_class; struct { + struct nvgpu_hw_err_inject_info_desc * (*get_hubmmu_err_desc) + (struct gk20a *g); void (*init_hw)(struct gk20a *g); void (*cbc_configure)(struct gk20a *g, struct nvgpu_cbc *cbc); void (*init_fs_state)(struct gk20a *g); @@ -1317,6 +1331,8 @@ struct gpu_ops { u32 (*idle_slowdown_disable)(struct gk20a *g); } therm; struct { + struct nvgpu_hw_err_inject_info_desc * (*get_pmu_err_desc) + (struct gk20a *g); bool (*is_pmu_supported)(struct gk20a *g); u32 (*falcon_base_addr)(void); /* reset */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 5f46b76d7..8ae307675 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -175,6 +175,27 @@ struct gr_err_info { struct gr_exception_info *exception_info; }; +#define NVGPU_ECC_ERR(err_name, inject_fn, addr, val) \ +{ \ + .name = (err_name), \ + .inject_hw_fault = (inject_fn), \ + .get_reg_addr = (addr), \ + .get_reg_val = (val) \ +} + +struct nvgpu_hw_err_inject_info { + const char *name; + int (*inject_hw_fault)(struct gk20a *g, + struct nvgpu_hw_err_inject_info *err, u32 err_info); + u32 
(*get_reg_addr)(void); + u32 (*get_reg_val)(u32 val); +}; + +struct nvgpu_hw_err_inject_info_desc { + struct nvgpu_hw_err_inject_info *info_ptr; + u32 info_size; +}; + /* Functions to report errors to 3LSS */ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_id, u32 intr_info); @@ -182,7 +203,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_id, u32 intr_info); -int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst, +int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_type, u64 err_addr, u64 err_count); int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, diff --git a/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c b/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c index 9249e5988..caf73778e 100644 --- a/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c +++ b/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c @@ -30,7 +30,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, return 0; } -int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst, +int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_type, u64 err_addr, u64 err_count) { return 0; diff --git a/drivers/gpu/nvgpu/os/posix/stubs.c b/drivers/gpu/nvgpu/os/posix/stubs.c index a771965ce..5956d0539 100644 --- a/drivers/gpu/nvgpu/os/posix/stubs.c +++ b/drivers/gpu/nvgpu/os/posix/stubs.c @@ -50,7 +50,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, return 0; } -int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst, +int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, u32 err_type, u64 err_addr, u64 err_count) { return 0;
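With the descriptor tables and inject functions now living in the HAL, a unit such as SDL can trigger error injection purely through gpu_ops and the nvgpu_hw_err_inject_info interface above, without including any hw headers itself. The sketch below shows how such a lookup might work; it is illustrative only, the helper name nvgpu_ecc_inject_err_by_name() is hypothetical, and it assumes the SM descriptor table plus a kernel strcmp() are available.

/* Illustrative only -- not part of this patch. */
static int nvgpu_ecc_inject_err_by_name(struct gk20a *g,
		const char *err_name, u32 error_info)
{
	struct nvgpu_hw_err_inject_info_desc *desc;
	struct nvgpu_hw_err_inject_info *err;
	u32 i;

	if (g->ops.ecc.get_sm_err_desc == NULL) {
		return -EINVAL;
	}

	/* Other units expose analogous ops: get_fecs_err_desc, get_hubmmu_err_desc, ... */
	desc = g->ops.ecc.get_sm_err_desc(g);

	for (i = 0U; i < desc->info_size; i++) {
		err = &desc->info_ptr[i];
		if (strcmp(err->name, err_name) == 0) {
			/* error_info carries the instance, e.g. (gpc << 8U) | tpc */
			return err->inject_hw_fault(g, err, error_info);
		}
	}

	return -EINVAL;
}

A caller would then pass, for example, "lrf_ecc_uncorrected" together with (gpc << 8U) | tpc, and the HAL resolves the control register address and injection value from its own table.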