gpu: nvgpu: log mme esr register information

Add a new HAL op to log MME exception register information; the op is
implemented for Turing only. On an MME exception interrupt, read the
mme_hww_esr register and log each error indicated by the ESR status
bits. When the ESR info register reports a valid PC, also log the
info2/info3/info4 registers.

JIRA NVGPU-1241

Change-Id: Ied3db0cc8fe6e2a82ecafc9964875e2686ca0d72
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2005807
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Vinod G
2019-01-28 13:04:46 -08:00
committed by mobile promotions
parent 0f84c9024f
commit 1b1ebb0a8d
10 changed files with 147 additions and 1 deletion
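
For orientation before the per-file diffs, here is a minimal, self-contained sketch of the decode-and-log pattern the new HAL op follows: read the ESR value, test each pending bit, and emit one message per error found. This is an illustrative example only, not the committed nvgpu code; the table-driven layout, the names mme_esr_bits and decode_mme_esr, and the standalone main() are assumptions made for the sketch, while the bit masks and message strings are taken from the hw_gr_tu104.h and gr_tu104.c diffs below.

/*
 * Illustrative sketch only -- not the committed nvgpu code.  It mirrors
 * the decode-and-log pattern of gr_tu104_log_mme_exception() with a
 * table-driven loop.  Bit masks and message strings come from the
 * hw_gr_tu104.h and gr_tu104.c diffs below; everything else (the table,
 * decode_mme_esr(), main()) is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct mme_esr_bit {
	uint32_t mask;
	const char *msg;
};

static const struct mme_esr_bit mme_esr_bits[] = {
	{ 0x1U,      "MISSING_MACRO_DATA" },
	{ 0x4U,      "ILLEGAL_OPCODE" },
	{ 0x8U,      "BRANCH_IN_DELAY_SHOT" },
	{ 0x20U,     "INSTR_RAM_ACCESS_OUT_OF_BOUNDS" },
	{ 0x40U,     "DATA_RAM_ACCESS_OUT_OF_BOUNDS" },
	{ 0x80U,     "ILLEGAL_MME_METHOD" },
	{ 0x10000U,  "DMA_DRAM_ACCESS_OUT_OF_BOUNDS" },
	{ 0x20000U,  "DMA_READ_FIFOED_FROM_PB" },
	{ 0x40000U,  "DMA_ILLEGAL_FIFO_CONFIG" },
	{ 0x80000U,  "DMA_READ_FIFOED_OVERFLOW" },
	{ 0x100000U, "DMA_FIFO_RESIZED_WHEN_NONIDLE" },
};

/* Log one line per pending error bit in the MME HWW ESR value. */
static void decode_mme_esr(uint32_t esr)
{
	size_t i;

	for (i = 0; i < sizeof(mme_esr_bits) / sizeof(mme_esr_bits[0]); i++) {
		if ((esr & mme_esr_bits[i].mask) != 0U) {
			printf("GR MME EXCEPTION: %s\n", mme_esr_bits[i].msg);
		}
	}
}

int main(void)
{
	/* Example ESR with two bits set: illegal opcode + DMA read overflow. */
	decode_mme_esr(0x4U | 0x80000U);
	return 0;
}

The committed implementation in gr_tu104.c below uses an explicit if-chain per bit and routes the messages through nvgpu_log() with the gpu_dbg_intr | gpu_dbg_gpu_dbg flags instead of printf().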

@@ -5427,6 +5427,10 @@ int gk20a_gr_isr(struct gk20a *g)
 			nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
 					mme, info);
+			if (g->ops.gr.log_mme_exception != NULL) {
+				g->ops.gr.log_mme_exception(g);
+			}
+
 			gk20a_writel(g, gr_mme_hww_esr_r(),
 					gr_mme_hww_esr_reset_active_f());
 			need_reset = true;

@@ -338,6 +338,7 @@ static const struct gpu_ops gm20b_ops = {
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
 		.init_gfxp_rtv_cb = NULL,
 		.commit_gfxp_rtv_cb = NULL,
+		.log_mme_exception = NULL,
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
 				gm20b_ctxsw_prog_hw_get_fecs_header_size,

@@ -376,6 +376,7 @@ static const struct gpu_ops gp10b_ops = {
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
 		.init_gfxp_rtv_cb = NULL,
 		.commit_gfxp_rtv_cb = NULL,
+		.log_mme_exception = NULL,
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
 				gm20b_ctxsw_prog_hw_get_fecs_header_size,

@@ -502,6 +502,7 @@ static const struct gpu_ops gv100_ops = {
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
 		.init_gfxp_rtv_cb = NULL,
 		.commit_gfxp_rtv_cb = NULL,
+		.log_mme_exception = NULL,
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
 				gm20b_ctxsw_prog_hw_get_fecs_header_size,

@@ -457,6 +457,7 @@ static const struct gpu_ops gv11b_ops = {
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
 		.init_gfxp_rtv_cb = NULL,
 		.commit_gfxp_rtv_cb = NULL,
+		.log_mme_exception = NULL,
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
 				gm20b_ctxsw_prog_hw_get_fecs_header_size,

@@ -530,6 +530,7 @@ struct gpu_ops {
 				struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm);
 		void (*commit_gfxp_rtv_cb)(struct gk20a *g,
 				struct nvgpu_gr_ctx *gr_ctx, bool patch);
+		void (*log_mme_exception)(struct gk20a *g);
 		struct {
 			u32 (*hw_get_fecs_header_size)(void);
 			u32 (*hw_get_gpccs_header_size)(void);

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -1050,6 +1050,50 @@ static inline u32 gr_mme_hww_esr_r(void)
 {
 	return 0x00404490U;
 }
+static inline u32 gr_mme_hww_esr_missing_macro_data_pending_f(void)
+{
+	return 0x1U;
+}
+static inline u32 gr_mme_hww_esr_illegal_opcode_pending_f(void)
+{
+	return 0x4U;
+}
+static inline u32 gr_mme_hww_esr_branch_in_delay_pending_f(void)
+{
+	return 0x8U;
+}
+static inline u32 gr_mme_hww_esr_inst_ram_acess_pending_f(void)
+{
+	return 0x20U;
+}
+static inline u32 gr_mme_hww_esr_data_ram_access_pending_f(void)
+{
+	return 0x40U;
+}
+static inline u32 gr_mme_hww_esr_illegal_mme_method_pending_f(void)
+{
+	return 0x80U;
+}
+static inline u32 gr_mme_hww_esr_dma_dram_access_pending_f(void)
+{
+	return 0x10000U;
+}
+static inline u32 gr_mme_hww_esr_dma_read_pb_pending_f(void)
+{
+	return 0x20000U;
+}
+static inline u32 gr_mme_hww_esr_dma_illegal_fifo_pending_f(void)
+{
+	return 0x40000U;
+}
+static inline u32 gr_mme_hww_esr_dma_read_overflow_pending_f(void)
+{
+	return 0x80000U;
+}
+static inline u32 gr_mme_hww_esr_dma_fifo_resized_pending_f(void)
+{
+	return 0x100000U;
+}
 static inline u32 gr_mme_hww_esr_reset_active_f(void)
 {
 	return 0x40000000U;
@@ -1062,6 +1106,22 @@ static inline u32 gr_mme_hww_esr_info_r(void)
 {
 	return 0x00404494U;
 }
+static inline u32 gr_mme_hww_esr_info_pc_valid_v(u32 r)
+{
+	return (r >> 28U) & 0x1U;
+}
+static inline u32 gr_mme_hww_esr_info2_r(void)
+{
+	return 0x0040449cU;
+}
+static inline u32 gr_mme_hww_esr_info3_r(void)
+{
+	return 0x004044a8U;
+}
+static inline u32 gr_mme_hww_esr_info4_r(void)
+{
+	return 0x004044acU;
+}
 static inline u32 gr_memfmt_hww_esr_r(void)
 {
 	return 0x00404600U;

@@ -491,3 +491,78 @@ void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
 	*sm_dsm_perf_ctrl_regs = NULL;
 	*ctrl_register_stride = 0;
 }
+
+void gr_tu104_log_mme_exception(struct gk20a *g)
+{
+	u32 mme_hww_esr = nvgpu_readl(g, gr_mme_hww_esr_r());
+	u32 mme_hww_info = nvgpu_readl(g, gr_mme_hww_esr_info_r());
+
+	if ((mme_hww_esr &
+			gr_mme_hww_esr_missing_macro_data_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: MISSING_MACRO_DATA");
+	}
+
+	if ((mme_hww_esr &
+			gr_mme_hww_esr_illegal_mme_method_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: ILLEGAL_MME_METHOD");
+	}
+
+	if ((mme_hww_esr &
+			gr_mme_hww_esr_dma_dram_access_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DMA_DRAM_ACCESS_OUT_OF_BOUNDS");
+	}
+
+	if ((mme_hww_esr &
+			gr_mme_hww_esr_dma_illegal_fifo_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DMA_ILLEGAL_FIFO_CONFIG");
+	}
+
+	if ((mme_hww_esr &
+			gr_mme_hww_esr_dma_read_overflow_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DMA_READ_FIFOED_OVERFLOW");
+	}
+
+	if ((mme_hww_esr &
+			gr_mme_hww_esr_dma_fifo_resized_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DMA_FIFO_RESIZED_WHEN_NONIDLE");
+	}
+
+	if ((mme_hww_esr & gr_mme_hww_esr_illegal_opcode_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: ILLEGAL_OPCODE");
+	}
+
+	if ((mme_hww_esr & gr_mme_hww_esr_branch_in_delay_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: BRANCH_IN_DELAY_SHOT");
+	}
+
+	if ((mme_hww_esr & gr_mme_hww_esr_inst_ram_acess_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: INSTR_RAM_ACCESS_OUT_OF_BOUNDS");
+	}
+
+	if ((mme_hww_esr & gr_mme_hww_esr_data_ram_access_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DATA_RAM_ACCESS_OUT_OF_BOUNDS");
+	}
+
+	if ((mme_hww_esr & gr_mme_hww_esr_dma_read_pb_pending_f()) != 0U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: DMA_READ_FIFOED_FROM_PB");
+	}
+
+	if (gr_mme_hww_esr_info_pc_valid_v(mme_hww_info) == 0x1U) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+			"GR MME EXCEPTION: INFO2 0x%x, INFO3 0x%x, INFO4 0x%x",
+			nvgpu_readl(g, gr_mme_hww_esr_info2_r()),
+			nvgpu_readl(g, gr_mme_hww_esr_info3_r()),
+			nvgpu_readl(g, gr_mme_hww_esr_info4_r()));
+	}
+}

@@ -92,4 +92,5 @@ void gr_tu104_init_sm_dsm_reg_info(void);
 void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
 	u32 *num_sm_dsm_perf_ctrl_regs, u32 **sm_dsm_perf_ctrl_regs,
 	u32 *ctrl_register_stride);
+void gr_tu104_log_mme_exception(struct gk20a *g);
 #endif /* NVGPU_GR_TU104_H */

@@ -527,6 +527,7 @@ static const struct gpu_ops tu104_ops = {
 		.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
 		.get_fecs_ctx_state_store_major_rev_id =
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
+		.log_mme_exception = gr_tu104_log_mme_exception,
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
 				gm20b_ctxsw_prog_hw_get_fecs_header_size,