gpu: nvgpu: handle SM reported MMU_NACK exception

Upon receiving an MMU_FAULT error, the MMU forwards an MMU_NACK to the SM.
If MMU_NACK is masked out, the SM simply releases the semaphores.

If the semaphores are released before the MMU fault is handled, user space
could incorrectly see the operation as successful.

Fix this by handling the SM-reported MMU_NACK exception.

Enable MMU_NACK reporting in gv11b_gr_set_hww_esr_report_mask.

In the MMU_NACK handling path, we just set the error notifier and clear
the interrupt so that user space sees the error as soon as the
semaphores are released by the SM.
The MMU_FAULT handling path will take care of triggering RC recovery
anyway.
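
To make that flow concrete, here is a minimal host-side sketch of the decision the SM exception pre-processing now makes. It only mirrors the shape of the gr_gv11b.c change below; the helper functions, names, and the example warp ESR value are illustrative stand-ins, while 0x20U is the MMU_NACK value added to the hw headers in this patch.

#include <stdbool.h>
#include <stdio.h>

/* Warp ESR error value for MMU_NACK, as added to the hw headers by this patch. */
#define HWW_WARP_ESR_ERROR_MMU_NACK_F 0x20U

/* Illustrative stand-ins for the driver's notifier-set and interrupt-clear steps. */
static void set_mmu_error_notifier(void) { printf("error notifier set for TSG\n"); }
static void clear_warp_esr(void) { printf("SM warp ESR cleared\n"); }

/*
 * Mirrors the early-out added to gr_gv11b_pre_process_sm_exception(): on
 * MMU_NACK, notify user space and clear the interrupt, then return without
 * triggering CILP; RC recovery is left to the MMU_FAULT handling path.
 */
static bool handle_sm_exception(unsigned int warp_esr)
{
	if (warp_esr & HWW_WARP_ESR_ERROR_MMU_NACK_F) {
		set_mmu_error_notifier();
		clear_warp_esr();
		return true;	/* handled; skip CILP processing */
	}
	return false;	/* fall through to the normal CILP/SM handling */
}

int main(void)
{
	handle_sm_exception(0x20U);	/* example warp ESR reporting MMU_NACK */
	return 0;
}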

Also add the necessary h/w accessors for mmu_nack.

Bug 2040594
Jira NVGPU-473

Change-Id: Ic925c2d3f3069016c57d177713066c29ab39dc3d
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1631708
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Deepak Nibade authored 2018-01-04 05:02:46 -08:00; committed by mobile promotions
commit 5b10690479, parent 6170f1eed5
3 changed files with 61 additions and 3 deletions


@@ -1994,6 +1994,39 @@ void gr_gv11b_get_access_map(struct gk20a *g,
 	*num_entries = ARRAY_SIZE(wl_addr_gv11b);
 }
 
+static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
+			u32 gpc, u32 tpc, u32 sm,
+			u32 warp_esr,
+			struct channel_gk20a *fault_ch)
+{
+	struct tsg_gk20a *tsg;
+	u32 offset;
+
+	if (fault_ch) {
+		tsg = &g->fifo.tsg[fault_ch->tsgid];
+
+		/*
+		 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK
+		 * to SM. So MMU_FAULT handling path will take care of
+		 * triggering RC recovery
+		 *
+		 * In MMU_NACK handling path, we just set the error notifier
+		 * and clear the interrupt so that the User Space sees the error
+		 * as soon as semaphores are released by SM
+		 */
+		gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+	}
+
+	/* clear interrupt */
+	offset = gk20a_gr_gpc_offset(g, gpc) +
+			gk20a_gr_tpc_offset(g, tpc) +
+			gv11b_gr_sm_offset(g, sm);
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0);
+
+	return 0;
+}
+
 /* @brief pre-process work on the SM exceptions to determine if we clear them or not.
  *
  * On Pascal, if we are in CILP preemtion mode, preempt the channel and handle errors with special processing
@@ -2013,6 +2046,14 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
 	*early_exit = false;
 	*ignore_debugger = false;
 
+	/*
+	 * We don't need to trigger CILP in case of MMU_NACK
+	 * So just handle MMU_NACK and return
+	 */
+	if (warp_esr & gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f())
+		return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm,
+				warp_esr, fault_ch);
+
 	if (fault_ch)
 		cilp_enabled = (fault_ch->ch_ctx.gr_ctx->compute_preempt_mode ==
 				NVGPU_PREEMPTION_MODE_COMPUTE_CILP);
@@ -2992,7 +3033,8 @@ void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g)
 		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
 		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
 		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f() |
-		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_fault_report_f());
+		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_fault_report_f() |
+		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f());
 
 	/* setup sm global esr report mask. vat_alarm_report is not enabled */
 	gk20a_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r(),
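
The interrupt-clear write in the new handler composes a per-SM register address from the broadcast gr_gpc0_tpc0_sm0_hww_warp_esr_r() register plus GPC, TPC and SM offsets. The sketch below only illustrates that composition; the base address and stride values are made-up placeholders, not gv11b values (the driver obtains the real offsets from gk20a_gr_gpc_offset(), gk20a_gr_tpc_offset() and gv11b_gr_sm_offset()).

#include <stdio.h>

/* Placeholder strides and base address for illustration only; the real
 * per-chip values come from the driver's offset helpers, not this sketch. */
#define EXAMPLE_GPC_STRIDE	0x8000U
#define EXAMPLE_TPC_STRIDE	0x0800U
#define EXAMPLE_SM_STRIDE	0x0080U
#define EXAMPLE_WARP_ESR_BASE	0x00504730U

int main(void)
{
	unsigned int gpc = 1, tpc = 2, sm = 0;

	/* Same shape as the handler: base + per-GPC + per-TPC + per-SM offset. */
	unsigned int offset = gpc * EXAMPLE_GPC_STRIDE +
			      tpc * EXAMPLE_TPC_STRIDE +
			      sm * EXAMPLE_SM_STRIDE;

	printf("would clear warp ESR at 0x%08x\n", EXAMPLE_WARP_ESR_BASE + offset);
	return 0;
}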


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -3344,6 +3344,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f(void)
 {
 	return 0x400000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f(void)
+{
+	return 0x4000000U;
+}
 static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_r(void)
 {
 	return 0x00419d0cU;
@@ -3552,6 +3556,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_v(void)
 {
 	return 0x00000000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
+{
+	return 0x20U;
+}
 static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void)
 {
 	return 0x0U;


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -3940,6 +3940,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f(void)
 {
 	return 0x400000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f(void)
+{
+	return 0x4000000U;
+}
 static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_r(void)
 {
 	return 0x00419d0cU;
@@ -4240,6 +4244,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void)
 {
 	return 0x0U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
+{
+	return 0x20U;
+}
 static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_wrap_id_m(void)
 {
 	return 0xffU << 16U;
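
For reference, the two constants added to both hw headers correspond to single bits when used with a bitwise AND/OR: the report-mask value 0x4000000U is bit 26, and the warp ESR value 0x20U is bit 5. The standalone self-check below (not driver code) also shows how gv11b_gr_set_hww_esr_report_mask() simply ORs the new report bit into the existing mask; the 0x400000U starting value is the stack_overflow report bit visible in the context lines above.

#include <assert.h>
#include <stdio.h>

/* Values added to the hw headers by this patch. */
#define WARP_ESR_REPORT_MASK_MMU_NACK_REPORT_F	0x4000000U
#define WARP_ESR_ERROR_MMU_NACK_F		0x20U

int main(void)
{
	/* Bit positions implied by the new values. */
	assert(WARP_ESR_REPORT_MASK_MMU_NACK_REPORT_F == (1U << 26));
	assert(WARP_ESR_ERROR_MMU_NACK_F == (1U << 5));

	/* The report-mask change ORs the new bit on top of the bits that were
	 * already enabled (0x400000U is the stack_overflow report bit). */
	unsigned int report_mask = 0x400000U | WARP_ESR_REPORT_MASK_MMU_NACK_REPORT_F;

	printf("report mask with mmu_nack enabled: 0x%x\n", report_mask);
	return 0;
}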