gpu: nvgpu: Exit early on VAB_ERROR MMU fault

This patch updates the interaction between the VAB
packet polling code and the VAB_ERROR MMU fault handling
code. A shared atomic flag is used to determine whether a
VAB_ERROR MMU fault has happened while polling; if it has,
polling is terminated immediately instead of waiting for a
timeout. This allows testing VAB_ERROR MMU fault handling
in environments where a timeout may never happen or may
take very long to happen.

The sequence for this to work is the following (a minimal
sketch in standard C11 atomics follows the list):
1) Before requesting a VAB dump, which may trigger a fault,
   the atomic flag is atomically reset to 0.
2) Polling eventually starts and atomically checks the flag
   in each loop iteration. If the flag is set, polling exits
   because the VAB result will never become available.
3) If a VAB_ERROR MMU fault is raised, the fault handler
   atomically sets the flag to 1.
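
As a distilled illustration of this protocol, here is a minimal,
self-contained sketch using standard C11 atomics in place of the
nvgpu_atomic_* wrappers. All names (vab_error_flag, request_vab_dump(),
poll_vab_result(), on_vab_error_fault()) and the dump_complete() helper
are hypothetical stand-ins, not the driver's API; the real
implementation is in the diff below.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Shared flag: set by the fault handler, checked by the polling loop. */
static atomic_int vab_error_flag;

/* Hypothetical stand-in for reading the dump-complete status register. */
static bool dump_complete(void)
{
        return false;
}

/* Step 1: reset the flag before triggering a new dump. */
static void request_vab_dump(void)
{
        atomic_store(&vab_error_flag, 0);
        /* ... write the dump trigger register here ... */
}

/* Step 2: poll for the result, exiting early when the fault handler
 * has set the flag (the result will never become available). */
static int poll_vab_result(unsigned int max_iters)
{
        for (unsigned int i = 0U; i < max_iters; i++) {
                if (dump_complete()) {
                        return 0;
                }
                if (atomic_load(&vab_error_flag) != 0) {
                        break; /* fault reported: don't wait for timeout */
                }
                /* sleep with exponential backoff here */
        }
        return -1; /* maps to -ETIMEDOUT in the real code */
}

/* Step 3: called from the VAB_ERROR MMU fault handler. */
static void on_vab_error_fault(void)
{
        atomic_store(&vab_error_flag, 1);
}

int main(void)
{
        request_vab_dump();
        on_vab_error_fault();  /* simulate a fault during polling */
        /* prints -1: the loop breaks on its first iteration */
        printf("%d\n", poll_vab_result(1000U));
        return 0;
}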

Note that while there could be a race in this sequence if the
VAB_ERROR MMU fault handling is somehow delayed, the chance is
extremely slim because:
1) the race could only happen if the VAB dump code is re-entered
   while the earlier VAB_ERROR MMU fault is still pending,
2) the polling code has a large timeout, and
3) re-entering requires a new ioctl/devctl.

Bug 3425981

Change-Id: I422b15b581b0c3417abd4c66fbcdde9a0ff8cd9b
Signed-off-by: Martin Radev <mradev@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2664103
Reviewed-by: svcacv <svcacv@nvidia.com>
Reviewed-by: Vedashree Vidwans <vvidwans@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
GVS: Gerrit_Virtual_Submit
Author: Martin Radev
Date: 2022-02-04 12:57:33 +02:00
Committed-by: mobile promotions
Parent: 852717ccc1
Commit: 3e4fb49270
2 changed files with 32 additions and 2 deletions


@@ -153,6 +153,18 @@ void ga10b_fb_vab_recover(struct gk20a *g)
 	 */
 	struct nvgpu_mem *vab_buf = &g->vab.buffer;
 
+	/*
+	 * Share with polling thread that a VAB_ERROR MMU fault has happened.
+	 * When this flag is set, either the other thread is still polling or
+	 * polling has already timed out. This should be safe because when a
+	 * new VAB dump request would be triggered, the flag would be reset.
+	 * The chance of the problematic sequence (enter trigger (vab mmu fault
+	 * raised) -> timeout -> enter new trigger -> just then set flag) is
+	 * incredibly slim due to timing. Each trigger is a new ioctl with polling
+	 * having a large timeout.
+	 */
+	nvgpu_atomic_set(&g->vab.mmu_vab_error_flag, 1U);
+
 	ga10b_fb_vab_enable(g, false);
 
 	if (nvgpu_mem_is_valid(vab_buf)) {
@@ -283,6 +295,16 @@ static int ga10b_fb_vab_request_dump(struct gk20a *g)
 	u32 vab_dump_reg;
 	u32 trigger_set;
 	u32 trigger_reset;
+	struct nvgpu_vab *vab = &g->vab;
+
+	/*
+	 * Reset VAB_ERROR MMU flag to 0 before attempting to request dump.
+	 * Later, if a VAB_ERROR MMU fault is triggered, the handler will set the flag.
+	 * This enables the dumping code to exit early from polling.
+	 * Doing this is safe, because a VAB_ERROR MMU fault can only be raised after
+	 * requesting a dump.
+	 */
+	nvgpu_atomic_set(&vab->mmu_vab_error_flag, 0U);
 
 	/* Set trigger to start vab dump */
 	trigger_set = fb_mmu_vidmem_access_bit_dump_trigger_f(
@@ -302,7 +324,8 @@ static int ga10b_fb_vab_request_dump(struct gk20a *g)
 		}
 		nvgpu_usleep_range(delay, delay * 2U);
 		delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
-	} while (nvgpu_timeout_expired(&timeout) == 0);
+	} while (nvgpu_timeout_expired(&timeout) == 0
+		&& nvgpu_atomic_read(&vab->mmu_vab_error_flag) == 0);
 
 	return -ETIMEDOUT;
 }
@@ -322,7 +345,8 @@ static int ga10b_fb_vab_query_valid_bit(struct gk20a *g,
 		}
 		nvgpu_usleep_range(delay, delay * 2U);
 		delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
-	} while (nvgpu_timeout_expired(&timeout) == 0);
+	} while (nvgpu_timeout_expired(&timeout) == 0
+		&& nvgpu_atomic_read(&g->vab.mmu_vab_error_flag) == 0);
 	nvgpu_err(g, "VAB write bit not valid");
 	return -ETIMEDOUT;
 }


@@ -60,6 +60,12 @@ struct nvgpu_vab {
 	u32 num_entries;
 	unsigned long entry_size;
 	struct nvgpu_mem buffer;
+
+	/*
+	 * Evaluates to true if a VAB_ERROR mmu fault has happened since
+	 * dump has started
+	 */
+	nvgpu_atomic_t mmu_vab_error_flag;
 };
 
 int nvgpu_fb_vab_init_hal(struct gk20a *g);