mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 09:12:24 +03:00
gpu: nvgpu: Exit early on VAB_ERROR MMU fault
This patch updates the interaction between the VAB packet polling code and the VAB_ERROR MMU fault handling code. A shared atomic flag is used to determine whether a VAB_ERROR MMU fault has happened while polling, in which case polling is terminated immediately instead of waiting for a timeout. This allows testing VAB_ERROR MMU fault handling in environments where a timeout may never happen or may take very long. The sequence for this to work is the following: 1) Before requesting a VAB dump, which may trigger a fault, the atomic flag is atomically reset to 0. 2) Polling eventually starts, which atomically checks the flag in its loop. If the flag is set, polling exits because the VAB result will never be available. 3) If a VAB_ERROR MMU fault is raised, the fault handler atomically sets the flag to 1. Note that while there could be a race in this sequence if the VAB_ERROR MMU fault handling is somehow delayed, the chance is extremely slim because: 1) the race could only happen if the VAB dump code is re-entered while the earlier VAB_ERROR MMU fault is still pending; 2) the polling code has a large timeout; 3) re-entering requires a new ioctl/devctl. Bug 3425981 Change-Id: I422b15b581b0c3417abd4c66fbcdde9a0ff8cd9b Signed-off-by: Martin Radev <mradev@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2664103 Reviewed-by: svcacv <svcacv@nvidia.com> Reviewed-by: Vedashree Vidwans <vvidwans@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com> GVS: Gerrit_Virtual_Submit
This commit is contained in:
committed by
mobile promotions
parent
852717ccc1
commit
3e4fb49270
@@ -153,6 +153,18 @@ void ga10b_fb_vab_recover(struct gk20a *g)
|
|||||||
*/
|
*/
|
||||||
struct nvgpu_mem *vab_buf = &g->vab.buffer;
|
struct nvgpu_mem *vab_buf = &g->vab.buffer;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Share with polling thread that a VAB_ERROR MMU fault has happened.
|
||||||
|
* When this flag is set, either the other thread is still polling or
|
||||||
|
* polling has already timed out. This should be safe because when a
|
||||||
|
* new VAB dump request would be triggered, the flag would be reset.
|
||||||
|
* The chance of the problematic sequence (enter trigger (vab mmu fault
|
||||||
|
* raised) -> timeout -> enter new trigger -> just then set flag) is
|
||||||
|
* incredibly slim due to timing. Each trigger is a new ioctl with polling
|
||||||
|
* having a large timeout.
|
||||||
|
*/
|
||||||
|
nvgpu_atomic_set(&g->vab.mmu_vab_error_flag, 1U);
|
||||||
|
|
||||||
ga10b_fb_vab_enable(g, false);
|
ga10b_fb_vab_enable(g, false);
|
||||||
|
|
||||||
if (nvgpu_mem_is_valid(vab_buf)) {
|
if (nvgpu_mem_is_valid(vab_buf)) {
|
||||||
@@ -283,6 +295,16 @@ static int ga10b_fb_vab_request_dump(struct gk20a *g)
|
|||||||
u32 vab_dump_reg;
|
u32 vab_dump_reg;
|
||||||
u32 trigger_set;
|
u32 trigger_set;
|
||||||
u32 trigger_reset;
|
u32 trigger_reset;
|
||||||
|
struct nvgpu_vab *vab = &g->vab;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reset VAB_ERROR MMU flag to 0 before attempting to request dump.
|
||||||
|
* Later, if a VAB_ERROR MMU fault is triggered, the handler will set the flag.
|
||||||
|
* This enables the dumping code to exit early from polling.
|
||||||
|
* Doing this is safe, because a VAB_ERROR MMU fault can only be raised after
|
||||||
|
* requesting a dump.
|
||||||
|
*/
|
||||||
|
nvgpu_atomic_set(&vab->mmu_vab_error_flag, 0U);
|
||||||
|
|
||||||
/* Set trigger to start vab dump */
|
/* Set trigger to start vab dump */
|
||||||
trigger_set = fb_mmu_vidmem_access_bit_dump_trigger_f(
|
trigger_set = fb_mmu_vidmem_access_bit_dump_trigger_f(
|
||||||
@@ -302,7 +324,8 @@ static int ga10b_fb_vab_request_dump(struct gk20a *g)
|
|||||||
}
|
}
|
||||||
nvgpu_usleep_range(delay, delay * 2U);
|
nvgpu_usleep_range(delay, delay * 2U);
|
||||||
delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
|
delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
|
||||||
} while (nvgpu_timeout_expired(&timeout) == 0);
|
} while (nvgpu_timeout_expired(&timeout) == 0
|
||||||
|
&& nvgpu_atomic_read(&vab->mmu_vab_error_flag) == 0);
|
||||||
return -ETIMEDOUT;
|
return -ETIMEDOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -322,7 +345,8 @@ static int ga10b_fb_vab_query_valid_bit(struct gk20a *g,
|
|||||||
}
|
}
|
||||||
nvgpu_usleep_range(delay, delay * 2U);
|
nvgpu_usleep_range(delay, delay * 2U);
|
||||||
delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
|
delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
|
||||||
} while (nvgpu_timeout_expired(&timeout) == 0);
|
} while (nvgpu_timeout_expired(&timeout) == 0
|
||||||
|
&& nvgpu_atomic_read(&g->vab.mmu_vab_error_flag) == 0);
|
||||||
nvgpu_err(g, "VAB write bit not valid");
|
nvgpu_err(g, "VAB write bit not valid");
|
||||||
return -ETIMEDOUT;
|
return -ETIMEDOUT;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -60,6 +60,12 @@ struct nvgpu_vab {
|
|||||||
u32 num_entries;
|
u32 num_entries;
|
||||||
unsigned long entry_size;
|
unsigned long entry_size;
|
||||||
struct nvgpu_mem buffer;
|
struct nvgpu_mem buffer;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Evaluates to true if a VAB_ERROR mmu fault has happened since
|
||||||
|
 * the dump started
|
||||||
|
*/
|
||||||
|
nvgpu_atomic_t mmu_vab_error_flag;
|
||||||
};
|
};
|
||||||
|
|
||||||
int nvgpu_fb_vab_init_hal(struct gk20a *g);
|
int nvgpu_fb_vab_init_hal(struct gk20a *g);
|
||||||
|
|||||||
Reference in New Issue
Block a user