gpu: nvgpu: fix crash due to accessing incorrect TSG pointer

In gk20a_gr_isr(), we handle various errors including GPC/TPC errors.
Then, if BPT errors are pending, we call gk20a_gr_post_bpt_events() at the
end and pass the channel pointer to it.

gk20a_gr_post_bpt_events() extracts TSG pointer based on ch->tsgid

But in some race conditions it is possible that we clear the error and trigger
recovery, and as a result the channel is unbound from the TSG and closed by
user space before gk20a_gr_post_bpt_events() is called.

In that case the code above yields an incorrect TSG pointer and hence
crashes as shown below:

Unable to handle kernel paging request at virtual address ffffff8012000c08
...
[<ffffff8008081f84>] el1_da+0x24/0xb4
[<ffffff80086e72e0>] gk20a_tsg_get_event_data_from_id+0x30/0xb0
[<ffffff80086e7560>] gk20a_tsg_event_id_post_event+0x50/0xc8
[<ffffff800872922c>] gk20a_gr_isr+0x27c/0x12e0

To fix this, extract the TSG pointer before handling all the errors and pass
this pointer to gk20a_gr_post_bpt_events(), which will post the events only if
they are enabled and the TSG is still open.

Bug 200404720

Change-Id: I4861c72e338a2cec96f31cb9488af665c5f2be39
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1735415
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vinod Gopalakrishnakurup <vinodg@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Deepak Nibade
2018-05-30 18:47:44 -07:00
committed by Tejal Kudav
parent 7aa928fa07
commit 4252e00aa6

View File

@@ -5837,19 +5837,14 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
return ret; return ret;
} }
static int gk20a_gr_post_bpt_events(struct gk20a *g, struct channel_gk20a *ch, static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg,
u32 global_esr) u32 global_esr)
{ {
if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) { if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f())
struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT); g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT);
}
if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) {
struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f())
g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE); g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE);
}
return 0; return 0;
} }
@@ -5864,6 +5859,7 @@ int gk20a_gr_isr(struct gk20a *g)
struct channel_gk20a *ch = NULL; struct channel_gk20a *ch = NULL;
struct channel_gk20a *fault_ch = NULL; struct channel_gk20a *fault_ch = NULL;
int tsgid = NVGPU_INVALID_TSG_ID; int tsgid = NVGPU_INVALID_TSG_ID;
struct tsg_gk20a *tsg = NULL;
u32 gr_engine_id; u32 gr_engine_id;
u32 global_esr = 0; u32 global_esr = 0;
@@ -5903,6 +5899,9 @@ int gk20a_gr_isr(struct gk20a *g)
nvgpu_err(g, "ch id is INVALID 0xffffffff"); nvgpu_err(g, "ch id is INVALID 0xffffffff");
} }
if (ch && gk20a_is_channel_marked_as_tsg(ch))
tsg = &g->fifo.tsg[ch->tsgid];
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"channel %d: addr 0x%08x, " "channel %d: addr 0x%08x, "
"data 0x%08x 0x%08x," "data 0x%08x 0x%08x,"
@@ -6126,8 +6125,8 @@ int gk20a_gr_isr(struct gk20a *g)
"unhandled gr interrupt 0x%08x", gr_intr); "unhandled gr interrupt 0x%08x", gr_intr);
/* Posting of BPT events should be the last thing in this function */ /* Posting of BPT events should be the last thing in this function */
if (global_esr && fault_ch) if (global_esr && tsg)
gk20a_gr_post_bpt_events(g, fault_ch, global_esr); gk20a_gr_post_bpt_events(g, tsg, global_esr);
if (ch) if (ch)
gk20a_channel_put(ch); gk20a_channel_put(ch);