From 1b1ebb0a8d89a46eacb7512335e59a86e37eeb47 Mon Sep 17 00:00:00 2001 From: Vinod G Date: Mon, 28 Jan 2019 13:04:46 -0800 Subject: [PATCH] gpu: nvgpu: log mme esr register information Add new hal to log the mme exception register information. Support added for Turing only. On mme exception interrupt, read the mme_hww_esr register and log the error based on esr register bits. JIRA NVGPU-1241 Change-Id: Ied3db0cc8fe6e2a82ecafc9964875e2686ca0d72 Signed-off-by: Vinod G Reviewed-on: https://git-master.nvidia.com/r/2005807 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 4 + drivers/gpu/nvgpu/gm20b/hal_gm20b.c | 1 + drivers/gpu/nvgpu/gp10b/hal_gp10b.c | 1 + drivers/gpu/nvgpu/gv100/hal_gv100.c | 1 + drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 1 + drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 1 + .../include/nvgpu/hw/tu104/hw_gr_tu104.h | 62 ++++++++++++++- drivers/gpu/nvgpu/tu104/gr_tu104.c | 75 +++++++++++++++++++ drivers/gpu/nvgpu/tu104/gr_tu104.h | 1 + drivers/gpu/nvgpu/tu104/hal_tu104.c | 1 + 10 files changed, 147 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 2454c1339..fb0897b87 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -5427,6 +5427,10 @@ int gk20a_gr_isr(struct gk20a *g) nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x", mme, info); + if (g->ops.gr.log_mme_exception != NULL) { + g->ops.gr.log_mme_exception(g); + } + gk20a_writel(g, gr_mme_hww_esr_r(), gr_mme_hww_esr_reset_active_f()); need_reset = true; diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c index 6a637e1d9..d8d637da1 100644 --- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c @@ -338,6 +338,7 @@ static const struct gpu_ops gm20b_ops = { gk20a_gr_get_fecs_ctx_state_store_major_rev_id, .init_gfxp_rtv_cb = NULL, .commit_gfxp_rtv_cb = NULL, + .log_mme_exception = NULL, .ctxsw_prog = { .hw_get_fecs_header_size = gm20b_ctxsw_prog_hw_get_fecs_header_size, diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index 533dad51b..8d241b2be 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -376,6 +376,7 @@ static const struct gpu_ops gp10b_ops = { gk20a_gr_get_fecs_ctx_state_store_major_rev_id, .init_gfxp_rtv_cb = NULL, .commit_gfxp_rtv_cb = NULL, + .log_mme_exception = NULL, .ctxsw_prog = { .hw_get_fecs_header_size = gm20b_ctxsw_prog_hw_get_fecs_header_size, diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index bbf1181fc..aa5ef80ae 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -502,6 +502,7 @@ static const struct gpu_ops gv100_ops = { gk20a_gr_get_fecs_ctx_state_store_major_rev_id, .init_gfxp_rtv_cb = NULL, .commit_gfxp_rtv_cb = NULL, + .log_mme_exception = NULL, .ctxsw_prog = { .hw_get_fecs_header_size = gm20b_ctxsw_prog_hw_get_fecs_header_size, diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index ce2d40f9f..747e0d21d 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -457,6 +457,7 @@ static const struct gpu_ops gv11b_ops = { gk20a_gr_get_fecs_ctx_state_store_major_rev_id, .init_gfxp_rtv_cb = NULL, .commit_gfxp_rtv_cb = NULL, + .log_mme_exception = NULL, .ctxsw_prog = { .hw_get_fecs_header_size = gm20b_ctxsw_prog_hw_get_fecs_header_size, diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index ff3575834..330004cba 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -530,6 +530,7 @@ struct gpu_ops { struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm); void (*commit_gfxp_rtv_cb)(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, bool patch); + void (*log_mme_exception)(struct gk20a *g); struct { u32 (*hw_get_fecs_header_size)(void); u32 (*hw_get_gpccs_header_size)(void); diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h index 83fa99889..2c9e898b1 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -1050,6 +1050,50 @@ static inline u32 gr_mme_hww_esr_r(void) { return 0x00404490U; } +static inline u32 gr_mme_hww_esr_missing_macro_data_pending_f(void) +{ + return 0x1U; +} +static inline u32 gr_mme_hww_esr_illegal_opcode_pending_f(void) +{ + return 0x4U; +} +static inline u32 gr_mme_hww_esr_branch_in_delay_pending_f(void) +{ + return 0x8U; +} +static inline u32 gr_mme_hww_esr_inst_ram_acess_pending_f(void) +{ + return 0x20U; +} +static inline u32 gr_mme_hww_esr_data_ram_access_pending_f(void) +{ + return 0x40U; +} +static inline u32 gr_mme_hww_esr_illegal_mme_method_pending_f(void) +{ + return 0x80U; +} +static inline u32 gr_mme_hww_esr_dma_dram_access_pending_f(void) +{ + return 0x10000U; +} +static inline u32 gr_mme_hww_esr_dma_read_pb_pending_f(void) +{ + return 0x20000U; +} +static inline u32 gr_mme_hww_esr_dma_illegal_fifo_pending_f(void) +{ + return 0x40000U; +} +static inline u32 gr_mme_hww_esr_dma_read_overflow_pending_f(void) +{ + return 0x80000U; +} +static inline u32 gr_mme_hww_esr_dma_fifo_resized_pending_f(void) +{ + return 0x100000U; +} static inline u32 gr_mme_hww_esr_reset_active_f(void) { return 0x40000000U; @@ -1062,6 +1106,22 @@ static inline u32 gr_mme_hww_esr_info_r(void) { return 0x00404494U; } +static inline u32 gr_mme_hww_esr_info_pc_valid_v(u32 r) +{ + return (r >> 28U) & 0x1U; +} +static inline u32 gr_mme_hww_esr_info2_r(void) +{ + return 0x0040449cU; +} +static inline u32 gr_mme_hww_esr_info3_r(void) +{ + return 0x004044a8U; +} +static inline u32 gr_mme_hww_esr_info4_r(void) +{ + return 0x004044acU; +} static inline u32 gr_memfmt_hww_esr_r(void) { return 0x00404600U; diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.c b/drivers/gpu/nvgpu/tu104/gr_tu104.c index 966faec79..86d5f2ab7 100644 --- a/drivers/gpu/nvgpu/tu104/gr_tu104.c +++ b/drivers/gpu/nvgpu/tu104/gr_tu104.c @@ -491,3 +491,78 @@ void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g, *sm_dsm_perf_ctrl_regs = NULL; *ctrl_register_stride = 0; } + +void gr_tu104_log_mme_exception(struct gk20a *g) +{ + u32 mme_hww_esr = nvgpu_readl(g, gr_mme_hww_esr_r()); + u32 mme_hww_info = nvgpu_readl(g, gr_mme_hww_esr_info_r()); + + if ((mme_hww_esr & + gr_mme_hww_esr_missing_macro_data_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: MISSING_MACRO_DATA"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_illegal_mme_method_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: ILLEGAL_MME_METHOD"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_dram_access_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_DRAM_ACCESS_OUT_OF_BOUNDS"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_illegal_fifo_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_ILLEGAL_FIFO_CONFIG"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_read_overflow_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_READ_FIFOED_OVERFLOW"); + } + + if ((mme_hww_esr & + gr_mme_hww_esr_dma_fifo_resized_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_FIFO_RESIZED_WHEN_NONIDLE"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_illegal_opcode_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: ILLEGAL_OPCODE"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_branch_in_delay_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: BRANCH_IN_DELAY_SHOT"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_inst_ram_acess_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: INSTR_RAM_ACCESS_OUT_OF_BOUNDS"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_data_ram_access_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DATA_RAM_ACCESS_OUT_OF_BOUNDS"); + } + + if ((mme_hww_esr & gr_mme_hww_esr_dma_read_pb_pending_f()) != 0U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: DMA_READ_FIFOED_FROM_PB"); + } + + if (gr_mme_hww_esr_info_pc_valid_v(mme_hww_info) == 0x1U) { + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, + "GR MME EXCEPTION: INFO2 0x%x, INFO3 0x%x, INFO4 0x%x", + nvgpu_readl(g, gr_mme_hww_esr_info2_r()), + nvgpu_readl(g, gr_mme_hww_esr_info3_r()), + nvgpu_readl(g, gr_mme_hww_esr_info4_r())); + } +} diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.h b/drivers/gpu/nvgpu/tu104/gr_tu104.h index 868636664..10e1139ae 100644 --- a/drivers/gpu/nvgpu/tu104/gr_tu104.h +++ b/drivers/gpu/nvgpu/tu104/gr_tu104.h @@ -92,4 +92,5 @@ void gr_tu104_init_sm_dsm_reg_info(void); void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g, u32 *num_sm_dsm_perf_ctrl_regs, u32 **sm_dsm_perf_ctrl_regs, u32 *ctrl_register_stride); +void gr_tu104_log_mme_exception(struct gk20a *g); #endif /* NVGPU_GR_TU104_H */ diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c index 9da1eab90..237cabe6f 100644 --- a/drivers/gpu/nvgpu/tu104/hal_tu104.c +++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c @@ -527,6 +527,7 @@ static const struct gpu_ops tu104_ops = { .dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats, .get_fecs_ctx_state_store_major_rev_id = gk20a_gr_get_fecs_ctx_state_store_major_rev_id, + .log_mme_exception = gr_tu104_log_mme_exception, .ctxsw_prog = { .hw_get_fecs_header_size = gm20b_ctxsw_prog_hw_get_fecs_header_size,