From 3e4fb4927052d859bb9181921da5d2253a51e992 Mon Sep 17 00:00:00 2001
From: Martin Radev <mradev@nvidia.com>
Date: Fri, 4 Feb 2022 12:57:33 +0200
Subject: [PATCH] gpu: nvgpu: Exit early on VAB_ERROR MMU fault

This patch updates the interaction between the VAB
packet polling code and the VAB_ERROR MMU fault handling
code. A shared atomic flag is used to determine if a
VAB_ERROR MMU fault has happened while polling, which will
result in polling being terminated immediately instead of
waiting for a timeout to happen. This allows testing VAB_ERROR
MMU fault handling in environments where a timeout may never
happen or may happen very slowly.

The sequence for this to work is the following:
1) before requesting a VAB dump, which may trigger a fault,
   the atomic flag is atomically reset to 0.
2) polling eventually starts, and atomically checks the flag
   in the loop. If the flag is set, polling exits because the VAB
   result will never be available.
3) If a VAB_ERROR MMU fault is raised, this sets the flag to 1
   atomically.

Note that while there could be a race in this sequence if the
VAB_ERROR MMU fault handling is somehow delayed, the chance is
extremely slim because:
1) the race could only happen if the VAB dump code is re-entered
   while the earlier VAB_ERROR MMU fault is still pending.
2) the polling code has a large timeout
3) re-entering means a new ioctl/devctl

Bug 3425981

Change-Id: I422b15b581b0c3417abd4c66fbcdde9a0ff8cd9b
Signed-off-by: Martin Radev <mradev@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2664103
Reviewed-by: svcacv <svcacv@nvidia.com>
Reviewed-by: Vedashree Vidwans <vvidwans@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/hal/fb/vab/vab_ga10b.c | 28 ++++++++++++++++++++++--
 drivers/gpu/nvgpu/include/nvgpu/fb.h     |  6 +++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/nvgpu/hal/fb/vab/vab_ga10b.c b/drivers/gpu/nvgpu/hal/fb/vab/vab_ga10b.c
index 6ad3050b8..88ae32ce5 100644
--- a/drivers/gpu/nvgpu/hal/fb/vab/vab_ga10b.c
+++ b/drivers/gpu/nvgpu/hal/fb/vab/vab_ga10b.c
@@ -153,6 +153,18 @@ void ga10b_fb_vab_recover(struct gk20a *g)
 	 */
 	struct nvgpu_mem *vab_buf = &g->vab.buffer;
 
+	/*
+	 * Share with polling thread that a VAB_ERROR MMU fault has happened.
+	 * When this flag is set, either the other thread is still polling or
+	 * polling has already timed out. This should be safe because when a
+	 * new VAB dump request would be triggered, the flag would be reset.
+	 * The chance of the problematic sequence (enter trigger (vab mmu fault
+	 * raised) -> timeout -> enter new trigger -> just then set flag) is
+	 * incredibly slim due to timing. Each trigger is a new ioctl with polling
+	 * having a large timeout.
+	 */
+	nvgpu_atomic_set(&g->vab.mmu_vab_error_flag, 1U);
+
 	ga10b_fb_vab_enable(g, false);
 
 	if (nvgpu_mem_is_valid(vab_buf)) {
@@ -283,6 +295,16 @@ static int ga10b_fb_vab_request_dump(struct gk20a *g)
 	u32 vab_dump_reg;
 	u32 trigger_set;
 	u32 trigger_reset;
+	struct nvgpu_vab *vab = &g->vab;
+
+	/*
+	 * Reset VAB_ERROR MMU flag to 0 before attempting to request dump.
+	 * Later, if a VAB_ERROR MMU fault is triggered, the handler will set the flag.
+	 * This enables the dumping code to exit early from polling.
+	 * Doing this is safe, because a VAB_ERROR MMU fault can only be raised after
+	 * requesting a dump.
+	 */
+	nvgpu_atomic_set(&vab->mmu_vab_error_flag, 0U);
 
 	/* Set trigger to start vab dump */
 	trigger_set = fb_mmu_vidmem_access_bit_dump_trigger_f(
@@ -302,7 +324,8 @@ static int ga10b_fb_vab_request_dump(struct gk20a *g)
 		}
 		nvgpu_usleep_range(delay, delay * 2U);
 		delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
-	} while (nvgpu_timeout_expired(&timeout) == 0);
+	} while (nvgpu_timeout_expired(&timeout) == 0
+				&& nvgpu_atomic_read(&vab->mmu_vab_error_flag) == 0);
 	return -ETIMEDOUT;
 }
 
@@ -322,7 +345,8 @@ static int ga10b_fb_vab_query_valid_bit(struct gk20a *g,
 		}
 		nvgpu_usleep_range(delay, delay * 2U);
 		delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
-	} while (nvgpu_timeout_expired(&timeout) == 0);
+	} while (nvgpu_timeout_expired(&timeout) == 0
+				&& nvgpu_atomic_read(&g->vab.mmu_vab_error_flag) == 0);
 	nvgpu_err(g, "VAB write bit not valid");
 	return -ETIMEDOUT;
 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/fb.h b/drivers/gpu/nvgpu/include/nvgpu/fb.h
index a226f6d46..c8d61407e 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/fb.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/fb.h
@@ -60,6 +60,12 @@ struct nvgpu_vab {
 	u32 num_entries;
 	unsigned long entry_size;
 	struct nvgpu_mem buffer;
+
+	/*
+	 * Evaluates to true if a VAB_ERROR mmu fault has happened since
+	 * dump has started
+	 */
+	nvgpu_atomic_t mmu_vab_error_flag;
 };
 
 int nvgpu_fb_vab_init_hal(struct gk20a *g);