gpu: nvgpu: handle SM reported MMU_NACK exception

Upon receiving an MMU_FAULT error, the MMU forwards an MMU_NACK to the SM.
If MMU_NACK is masked out, the SM simply releases the semaphores.

If the semaphores are released before the MMU fault is handled, user space
could incorrectly see the operation as successful.

Fix this by handling the SM-reported MMU_NACK exception.

Enable MMU_NACK reporting in gv11b_gr_set_hww_esr_report_mask.

In the MMU_NACK handling path, we just set the error notifier and clear
the interrupt so that user space sees the error as soon as the
semaphores are released by the SM.
The MMU_FAULT handling path will take care of triggering RC recovery
anyway.
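
To make that flow concrete, here is a minimal host-side sketch of the decision the SM exception pre-processing now makes. It only mirrors the shape of the gr_gv11b.c change below; the helper functions, names, and the example warp ESR value are illustrative stand-ins, while 0x20U is the MMU_NACK value added to the hw headers in this patch.

#include <stdbool.h>
#include <stdio.h>

/* Warp ESR error value for MMU_NACK, as added to the hw headers by this patch. */
#define HWW_WARP_ESR_ERROR_MMU_NACK_F 0x20U

/* Illustrative stand-ins for the driver's notifier-set and interrupt-clear steps. */
static void set_mmu_error_notifier(void) { printf("error notifier set for TSG\n"); }
static void clear_warp_esr(void) { printf("SM warp ESR cleared\n"); }

/*
 * Mirrors the early-out added to gr_gv11b_pre_process_sm_exception(): on
 * MMU_NACK, notify user space and clear the interrupt, then return without
 * triggering CILP; RC recovery is left to the MMU_FAULT handling path.
 */
static bool handle_sm_exception(unsigned int warp_esr)
{
	if (warp_esr & HWW_WARP_ESR_ERROR_MMU_NACK_F) {
		set_mmu_error_notifier();
		clear_warp_esr();
		return true;	/* handled; skip CILP processing */
	}
	return false;	/* fall through to the normal CILP/SM handling */
}

int main(void)
{
	handle_sm_exception(0x20U);	/* example warp ESR reporting MMU_NACK */
	return 0;
}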

Also add the necessary h/w accessors for mmu_nack.

Bug 2040594
Jira NVGPU-473

Change-Id: Ic925c2d3f3069016c57d177713066c29ab39dc3d
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1631708
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Deepak Nibade authored 2018-01-04 05:02:46 -08:00; committed by mobile promotions
commit 5b10690479, parent 6170f1eed5
3 changed files with 61 additions and 3 deletions


@@ -1994,6 +1994,39 @@ void gr_gv11b_get_access_map(struct gk20a *g,
 	*num_entries = ARRAY_SIZE(wl_addr_gv11b);
 }
 
+static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
+			u32 gpc, u32 tpc, u32 sm,
+			u32 warp_esr,
+			struct channel_gk20a *fault_ch)
+{
+	struct tsg_gk20a *tsg;
+	u32 offset;
+
+	if (fault_ch) {
+		tsg = &g->fifo.tsg[fault_ch->tsgid];
+
+		/*
+		 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK
+		 * to SM. So MMU_FAULT handling path will take care of
+		 * triggering RC recovery
+		 *
+		 * In MMU_NACK handling path, we just set the error notifier
+		 * and clear the interrupt so that the User Space sees the error
+		 * as soon as semaphores are released by SM
+		 */
+		gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+	}
+
+	/* clear interrupt */
+	offset = gk20a_gr_gpc_offset(g, gpc) +
+			gk20a_gr_tpc_offset(g, tpc) +
+			gv11b_gr_sm_offset(g, sm);
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0);
+
+	return 0;
+}
+
 /* @brief pre-process work on the SM exceptions to determine if we clear them or not.
  *
  * On Pascal, if we are in CILP preemtion mode, preempt the channel and handle errors with special processing
@@ -2013,6 +2046,14 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
 	*early_exit = false;
 	*ignore_debugger = false;
 
+	/*
+	 * We don't need to trigger CILP in case of MMU_NACK
+	 * So just handle MMU_NACK and return
+	 */
+	if (warp_esr & gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f())
+		return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm,
+				warp_esr, fault_ch);
+
 	if (fault_ch)
 		cilp_enabled = (fault_ch->ch_ctx.gr_ctx->compute_preempt_mode ==
 				NVGPU_PREEMPTION_MODE_COMPUTE_CILP);
@@ -2992,7 +3033,8 @@ void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g)
 		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
 		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
 		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f() |
-		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_fault_report_f());
+		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_fault_report_f() |
+		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f());
 
 	/* setup sm global esr report mask. vat_alarm_report is not enabled */
 	gk20a_writel(g, gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r(),
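
The interrupt-clear write in the new handler composes a per-SM register address from the broadcast gr_gpc0_tpc0_sm0_hww_warp_esr_r() register plus GPC, TPC and SM offsets. The sketch below only illustrates that composition; the base address and stride values are made-up placeholders, not gv11b values (the driver obtains the real offsets from gk20a_gr_gpc_offset(), gk20a_gr_tpc_offset() and gv11b_gr_sm_offset()).

#include <stdio.h>

/* Placeholder strides and base address for illustration only; the real
 * per-chip values come from the driver's offset helpers, not this sketch. */
#define EXAMPLE_GPC_STRIDE	0x8000U
#define EXAMPLE_TPC_STRIDE	0x0800U
#define EXAMPLE_SM_STRIDE	0x0080U
#define EXAMPLE_WARP_ESR_BASE	0x00504730U

int main(void)
{
	unsigned int gpc = 1, tpc = 2, sm = 0;

	/* Same shape as the handler: base + per-GPC + per-TPC + per-SM offset. */
	unsigned int offset = gpc * EXAMPLE_GPC_STRIDE +
			      tpc * EXAMPLE_TPC_STRIDE +
			      sm * EXAMPLE_SM_STRIDE;

	printf("would clear warp ESR at 0x%08x\n", EXAMPLE_WARP_ESR_BASE + offset);
	return 0;
}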


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -3344,6 +3344,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f(void)
 {
 	return 0x400000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f(void)
+{
+	return 0x4000000U;
+}
 static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_r(void)
 {
 	return 0x00419d0cU;
@@ -3552,6 +3556,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_v(void)
 {
 	return 0x00000000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
+{
+	return 0x20U;
+}
 static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void)
 {
 	return 0x0U;


@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -3940,6 +3940,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_stack_overflow_report_f(void)
 {
 	return 0x400000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_mmu_nack_report_f(void)
+{
+	return 0x4000000U;
+}
 static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_r(void)
 {
 	return 0x00419d0cU;
@@ -4240,6 +4244,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void)
 {
 	return 0x0U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
+{
+	return 0x20U;
+}
 static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_wrap_id_m(void)
 {
 	return 0xffU << 16U;
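
For reference, the two constants added to both hw headers correspond to single bits when used with a bitwise AND/OR: the report-mask value 0x4000000U is bit 26, and the warp ESR value 0x20U is bit 5. The standalone self-check below (not driver code) also shows how gv11b_gr_set_hww_esr_report_mask() simply ORs the new report bit into the existing mask; the 0x400000U starting value is the stack_overflow report bit visible in the context lines above.

#include <assert.h>
#include <stdio.h>

/* Values added to the hw headers by this patch. */
#define WARP_ESR_REPORT_MASK_MMU_NACK_REPORT_F	0x4000000U
#define WARP_ESR_ERROR_MMU_NACK_F		0x20U

int main(void)
{
	/* Bit positions implied by the new values. */
	assert(WARP_ESR_REPORT_MASK_MMU_NACK_REPORT_F == (1U << 26));
	assert(WARP_ESR_ERROR_MMU_NACK_F == (1U << 5));

	/* The report-mask change ORs the new bit on top of the bits that were
	 * already enabled (0x400000U is the stack_overflow report bit). */
	unsigned int report_mask = 0x400000U | WARP_ESR_REPORT_MASK_MMU_NACK_REPORT_F;

	printf("report mask with mmu_nack enabled: 0x%x\n", report_mask);
	return 0;
}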