From 2793e76c0642f17ebd35c78257238930d8e85222 Mon Sep 17 00:00:00 2001
From: Rajesh Devaraj
Date: Mon, 22 Jul 2019 16:35:13 +0530
Subject: [PATCH] gpu: nvgpu: handle and report graphics exceptions

This patch adds the support to handle and report graphics related
exceptions to 3LSS. Specifically, it adds the following exceptions:
NV_PGRAPH_PRI_BE0_BECS_BE_EXCEPTION_CROP
NV_PGRAPH_PRI_BE0_BECS_BE_EXCEPTION_ZROP
NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION_PROP
NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION_ZCULL
NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION_SETUP
NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION_PES0
NV_PGRAPH_PRI_GPC0_GPCCS_GPC_EXCEPTION_PES1
NV_PGRAPH_PRI_GPC0_TPC0_TPCCS_TPC_EXCEPTION_PE

JIRA NVGPU-3457

Change-Id: Ib24b67ed33ae139317ec85bba3fbb80ba51fd384
Signed-off-by: Rajesh Devaraj
Reviewed-on: https://git-master.nvidia.com/r/2158609
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
Review note: gv11b_gr_intr_handle_gpc_pes_exception() reports PES0 and PES1
individually, so a PES0 exception is not dropped when both are pending, and
the conditionally-assigned sub_err_type local is no longer needed.

 drivers/gpu/nvgpu/common/gr/gr_intr.c         |  33 +++++
 drivers/gpu/nvgpu/common/gr/gr_intr_priv.h    |   1 +
 .../nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c    |  40 +++++
 drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h |   9 ++
 .../nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c    | 142 +++++++++++++++++-
 drivers/gpu/nvgpu/hal/init/hal_gv11b.c        |  10 ++
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h       |  10 ++
 drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h   |  13 ++
 8 files changed, 257 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/nvgpu/common/gr/gr_intr.c b/drivers/gpu/nvgpu/common/gr/gr_intr.c
index 432fd673c..578d64f36 100644
--- a/drivers/gpu/nvgpu/common/gr/gr_intr.c
+++ b/drivers/gpu/nvgpu/common/gr/gr_intr.c
@@ -129,6 +129,15 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
 		}
 	}
 
+	/* check if a pe exception is pending */
+	if (pending_tpc.pe_exception) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			  "GPC%d TPC%d: PE exception pending", gpc, tpc);
+		if (g->ops.gr.intr.handle_tpc_pe_exception != NULL) {
+			g->ops.gr.intr.handle_tpc_pe_exception(g, gpc, tpc);
+		}
+	}
+
 	return ret;
 }
 
@@ -612,6 +621,30 @@ int nvgpu_gr_intr_handle_gpc_exception(struct gk20a *g, bool *post_event,
 				&g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
 		}
 
+		/* Handle PROP exception */
+		if (g->ops.gr.intr.handle_gpc_prop_exception != NULL) {
+			g->ops.gr.intr.handle_gpc_prop_exception(g, gpc,
+				gpc_exception);
+		}
+
+		/* Handle ZCULL exception */
+		if (g->ops.gr.intr.handle_gpc_zcull_exception != NULL) {
+			g->ops.gr.intr.handle_gpc_zcull_exception(g, gpc,
+				gpc_exception);
+		}
+
+		/* Handle SETUP exception */
+		if (g->ops.gr.intr.handle_gpc_setup_exception != NULL) {
+			g->ops.gr.intr.handle_gpc_setup_exception(g, gpc,
+				gpc_exception);
+		}
+
+		/* Handle PES exception */
+		if (g->ops.gr.intr.handle_gpc_pes_exception != NULL) {
+			g->ops.gr.intr.handle_gpc_pes_exception(g, gpc,
+				gpc_exception);
+		}
+
 	}
 
 	return ret;
diff --git a/drivers/gpu/nvgpu/common/gr/gr_intr_priv.h b/drivers/gpu/nvgpu/common/gr/gr_intr_priv.h
index 74ef82c77..9abc2c2d2 100644
--- a/drivers/gpu/nvgpu/common/gr/gr_intr_priv.h
+++ b/drivers/gpu/nvgpu/common/gr/gr_intr_priv.h
@@ -44,6 +44,7 @@ struct nvgpu_gr_tpc_exception {
 	bool tex_exception;
 	bool sm_exception;
 	bool mpc_exception;
+	bool pe_exception;
 };
 
 struct nvgpu_gr_isr_data {
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c
index ad088cf8d..79e46f893 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b_fusa.c
@@ -184,6 +184,40 @@ static u32 gr_gm20b_intr_check_gr_sked_exception(struct gk20a *g,
 	return 0U;
 }
 
+static u32 gr_gm20b_intr_check_gr_be_crop_exception(struct gk20a *g,
+						u32 exception)
+{
+	if ((exception & gr_pri_be0_becs_be_exception_crop_m()) != 0U) {
+		u32 crop = nvgpu_readl(g, gr_crop_hww_esr_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_BE_EXCEPTION,
+				crop, GPU_PGRAPH_BE_EXCEPTION_CROP);
+		nvgpu_err(g, "crop exception: esr 0x%08x", crop);
+		nvgpu_writel(g, gr_crop_hww_esr_r(),
+			gr_crop_hww_esr_reset_active_f());
+		return 1U;
+	}
+	return 0U;
+}
+
+static u32 gr_gm20b_intr_check_gr_be_zrop_exception(struct gk20a *g,
+						u32 exception)
+{
+	if ((exception & gr_pri_be0_becs_be_exception_zrop_m()) != 0U) {
+		u32 zrop = nvgpu_readl(g, gr_zrop_hww_esr_r());
+
+		nvgpu_gr_intr_report_exception(g, 0,
+				GPU_PGRAPH_BE_EXCEPTION,
+				zrop, GPU_PGRAPH_BE_EXCEPTION_ZROP);
+		nvgpu_err(g, "zrop exception: esr 0x%08x", zrop);
+		nvgpu_writel(g, gr_zrop_hww_esr_r(),
+			gr_zrop_hww_esr_reset_active_f());
+		return 1U;
+	}
+	return 0U;
+}
+
 static u32 gr_gm20b_intr_check_gr_fe_exception(struct gk20a *g, u32 exception)
 {
 	if ((exception & gr_exception_fe_m()) != 0U) {
@@ -286,6 +320,8 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
 	gpc_reset |= gr_gm20b_intr_check_gr_ssync_exception(g, exception);
 	gpc_reset |= gr_gm20b_intr_check_gr_mme_exception(g, exception);
 	gpc_reset |= gr_gm20b_intr_check_gr_sked_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_be_crop_exception(g, exception);
+	gpc_reset |= gr_gm20b_intr_check_gr_be_zrop_exception(g, exception);
 
 	/* check if a gpc exception has occurred */
 	if ((exception & gr_exception_gpc_m()) != 0U) {
@@ -353,6 +389,10 @@ u32 gm20b_gr_intr_get_tpc_exception(struct gk20a *g, u32 offset,
 		pending_tpc->mpc_exception = true;
 	}
 
+	if ((tpc_exception & gr_gpc0_tpc0_tpccs_tpc_exception_pe_m()) != 0U) {
+		pending_tpc->pe_exception = true;
+	}
+
 	return tpc_exception;
 }
 
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h
index 8518434d4..fe9c53567 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b.h
@@ -71,9 +71,18 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
 				u32 *corrected_err, u32 *uncorrected_err);
 void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
 		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err);
+void gv11b_gr_intr_handle_gpc_prop_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception);
+void gv11b_gr_intr_handle_gpc_zcull_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception);
+void gv11b_gr_intr_handle_gpc_setup_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception);
+void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception);
 void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
 		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err);
 void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc);
+void gv11b_gr_intr_handle_tpc_pe_exception(struct gk20a *g, u32 gpc, u32 tpc);
 void gv11b_gr_intr_enable_hww_exceptions(struct gk20a *g);
 void gv11b_gr_intr_enable_exceptions(struct gk20a *g,
 			struct nvgpu_gr_config *gr_config,
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
index ed48c9bea..9678b7c4f 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
@@ -526,6 +526,125 @@ static void gv11b_gr_intr_report_gpccs_ecc_err(struct gk20a *g,
 		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
 	}
 }
+
+void gv11b_gr_intr_handle_gpc_prop_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception)
+{
+	u32 offset = nvgpu_gr_gpc_offset(g, gpc);
+	u32 hww_esr;
+
+	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_prop_m()) == 0U) {
+		return;
+	}
+
+	hww_esr = nvgpu_readl(g,
+			nvgpu_safe_add_u32(gr_prop_hww_esr_r(), offset));
+
+	nvgpu_gr_intr_report_exception(g, (gpc << 8U),
+			GPU_PGRAPH_GPC_GFX_EXCEPTION,
+			hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_PROP);
+
+	/* clear the interrupt */
+	nvgpu_writel(g, nvgpu_safe_add_u32(
+				gr_prop_hww_esr_r(), offset),
+				gr_prop_hww_esr_reset_active_f());
+
+	nvgpu_log(g, gpu_dbg_intr,
+			"gpc:%d prop interrupt intr: 0x%x", gpc, hww_esr);
+}
+
+void gv11b_gr_intr_handle_gpc_zcull_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception)
+{
+	u32 offset = nvgpu_gr_gpc_offset(g, gpc);
+	u32 hww_esr;
+
+	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_zcull_m()) == 0U) {
+		return;
+	}
+
+	hww_esr = nvgpu_readl(g,
+			nvgpu_safe_add_u32(gr_zcull_hww_esr_r(), offset));
+
+	nvgpu_gr_intr_report_exception(g, (gpc << 8U),
+			GPU_PGRAPH_GPC_GFX_EXCEPTION,
+			hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_ZCULL);
+
+	/* clear the interrupt */
+	nvgpu_writel(g, nvgpu_safe_add_u32(
+				gr_zcull_hww_esr_r(), offset),
+				gr_zcull_hww_esr_reset_active_f());
+
+	nvgpu_log(g, gpu_dbg_intr,
+			"gpc:%d zcull interrupt intr: 0x%x", gpc, hww_esr);
+}
+
+void gv11b_gr_intr_handle_gpc_setup_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception)
+{
+	u32 offset = nvgpu_gr_gpc_offset(g, gpc);
+	u32 hww_esr;
+
+	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_setup_m()) == 0U) {
+		return;
+	}
+
+	hww_esr = nvgpu_readl(g,
+			nvgpu_safe_add_u32(gr_setup_hww_esr_r(), offset));
+
+	nvgpu_gr_intr_report_exception(g, (gpc << 8U),
+			GPU_PGRAPH_GPC_GFX_EXCEPTION,
+			hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_SETUP);
+
+	/* clear the interrupt */
+	nvgpu_writel(g, nvgpu_safe_add_u32(
+				gr_setup_hww_esr_r(), offset),
+				gr_setup_hww_esr_reset_active_f());
+
+	nvgpu_log(g, gpu_dbg_intr,
+			"gpc:%d setup interrupt intr: 0x%x", gpc, hww_esr);
+}
+
+/*
+ * Report and clear a pending PES exception for the given GPC. PES0 and PES1
+ * are reported separately so that neither is lost when both are pending.
+ */
+void gv11b_gr_intr_handle_gpc_pes_exception(struct gk20a *g, u32 gpc,
+		u32 gpc_exception)
+{
+	u32 offset = nvgpu_gr_gpc_offset(g, gpc);
+	u32 hww_esr;
+
+	if (((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) == 0U) &&
+	    ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m())
+							== 0U)) {
+		return;
+	}
+
+	hww_esr = nvgpu_readl(g,
+			nvgpu_safe_add_u32(gr_pes_hww_esr_r(), offset));
+
+	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes0_m()) != 0U) {
+		nvgpu_gr_intr_report_exception(g, (gpc << 8U),
+				GPU_PGRAPH_GPC_GFX_EXCEPTION,
+				hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_PES0);
+	}
+
+	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_pes1_m()) != 0U) {
+		nvgpu_gr_intr_report_exception(g, (gpc << 8U),
+				GPU_PGRAPH_GPC_GFX_EXCEPTION,
+				hww_esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_PES1);
+	}
+
+	/* clear the interrupt */
+	nvgpu_writel(g, nvgpu_safe_add_u32(
+				gr_pes_hww_esr_r(), offset),
+				gr_pes_hww_esr_reset_task_f());
+
+	nvgpu_log(g, gpu_dbg_intr,
+			"gpc:%d pes interrupt intr: 0x%x", gpc, hww_esr);
+}
+
 void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
 		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err)
 {
@@ -637,6 +756,24 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
 			gr_gpc0_tpc0_mpc_hww_esr_reset_trigger_f());
 }
 
+void gv11b_gr_intr_handle_tpc_pe_exception(struct gk20a *g, u32 gpc, u32 tpc)
+{
+	u32 esr;
+	u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
+					nvgpu_gr_tpc_offset(g, tpc));
+
+	esr = nvgpu_readl(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_pe_hww_esr_r(),
+				offset));
+	nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
+			GPU_PGRAPH_GPC_GFX_EXCEPTION,
+			esr, GPU_PGRAPH_GPC_GFX_EXCEPTION_TPC_PE);
+
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "pe hww esr 0x%08x", esr);
+
+	nvgpu_writel(g, nvgpu_safe_add_u32(gr_gpc0_tpc0_pe_hww_esr_r(), offset),
+			gr_gpc0_tpc0_pe_hww_esr_reset_task_f());
+}
+
 void gv11b_gr_intr_enable_hww_exceptions(struct gk20a *g)
 {
 	/* enable exceptions */
@@ -688,7 +825,9 @@ void gv11b_gr_intr_enable_exceptions(struct gk20a *g,
 	 */
 
 	/* enable exceptions */
-	nvgpu_writel(g, gr_exception2_en_r(), 0x0U); /* BE not enabled */
+	reg_val = gr_exception2_en_be_enabled_f();
+	nvgpu_log(g, gpu_dbg_info, "gr_exception2_en 0x%08x", reg_val);
+	nvgpu_writel(g, gr_exception2_en_r(), reg_val);
 
 	reg_val = (u32)BIT32(nvgpu_gr_config_get_gpc_count(gr_config));
 	nvgpu_writel(g, gr_exception1_en_r(),
@@ -716,6 +855,7 @@ void gv11b_gr_intr_enable_gpc_exceptions(struct gk20a *g,
 
 	nvgpu_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(),
 			gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f() |
+			gr_gpcs_tpcs_tpccs_tpc_exception_en_pe_enabled_f() |
 			gr_gpcs_tpcs_tpccs_tpc_exception_en_mpc_enabled_f());
 
 	tpc_mask_calc = (u32)BIT32(
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
index 0635b398f..2bb5ab974 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -586,11 +586,21 @@ static const struct gpu_ops gv11b_ops = {
 				gv11b_gr_intr_handle_gcc_exception,
 			.handle_gpc_gpcmmu_exception =
 				gv11b_gr_intr_handle_gpc_gpcmmu_exception,
+			.handle_gpc_prop_exception =
+				gv11b_gr_intr_handle_gpc_prop_exception,
+			.handle_gpc_zcull_exception =
+				gv11b_gr_intr_handle_gpc_zcull_exception,
+			.handle_gpc_setup_exception =
+				gv11b_gr_intr_handle_gpc_setup_exception,
+			.handle_gpc_pes_exception =
+				gv11b_gr_intr_handle_gpc_pes_exception,
 			.handle_gpc_gpccs_exception =
 				gv11b_gr_intr_handle_gpc_gpccs_exception,
 			.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,
 			.handle_tpc_mpc_exception =
 					gv11b_gr_intr_handle_tpc_mpc_exception,
+			.handle_tpc_pe_exception =
+					gv11b_gr_intr_handle_tpc_pe_exception,
 			.handle_tex_exception = NULL,
 			.enable_hww_exceptions =
 					gv11b_gr_intr_enable_hww_exceptions,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 6eee3d011..6d21603c6 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -850,6 +850,14 @@ struct gpu_ops {
 			void (*handle_gpc_gpcmmu_exception)(struct gk20a *g,
 				u32 gpc, u32 gpc_exception,
 				u32 *corrected_err, u32 *uncorrected_err);
+			void (*handle_gpc_prop_exception)(struct gk20a *g,
+				u32 gpc, u32 gpc_exception);
+			void (*handle_gpc_zcull_exception)(struct gk20a *g,
+				u32 gpc, u32 gpc_exception);
+			void (*handle_gpc_setup_exception)(struct gk20a *g,
+				u32 gpc, u32 gpc_exception);
+			void (*handle_gpc_pes_exception)(struct gk20a *g,
+				u32 gpc, u32 gpc_exception);
 			void (*handle_gpc_gpccs_exception)(struct gk20a *g,
 				u32 gpc, u32 gpc_exception,
 				u32 *corrected_err, u32 *uncorrected_err);
@@ -857,6 +865,8 @@ struct gpu_ops {
 				struct nvgpu_gr_tpc_exception *pending_tpc);
 			void (*handle_tpc_mpc_exception)(struct gk20a *g,
 						u32 gpc, u32 tpc);
+			void (*handle_tpc_pe_exception)(struct gk20a *g,
+						u32 gpc, u32 tpc);
 			void (*handle_tex_exception)(struct gk20a *g,
 						u32 gpc, u32 tpc);
 			void (*enable_hww_exceptions)(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
index 4a1947db0..731a074df 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -192,6 +192,19 @@ struct ctxsw_err_info {
 #define GPU_PGRAPH_BE_EXCEPTION			(8U)
 #define GPU_PGRAPH_MPC_EXCEPTION		(9U)
 #define GPU_PGRAPH_ILLEGAL_ERROR		(10U)
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION		(11U)
+
+/** Sub-errors in GPU_PGRAPH_BE_EXCEPTION. */
+#define GPU_PGRAPH_BE_EXCEPTION_CROP		(0U)
+#define GPU_PGRAPH_BE_EXCEPTION_ZROP		(1U)
+
+/** Sub-errors in GPU_PGRAPH_GPC_GFX_EXCEPTION. */
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION_PROP	(0U)
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION_ZCULL	(1U)
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION_SETUP	(2U)
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION_PES0	(3U)
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION_PES1	(4U)
+#define GPU_PGRAPH_GPC_GFX_EXCEPTION_TPC_PE	(5U)
 
 /** Sub-errors in GPU_PGRAPH_ILLEGAL_ERROR. */
 #define GPU_PGRAPH_ILLEGAL_NOTIFY		(0U)