Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: add new tsg functions for ctxsw timeout re-org
Add nvgpu_tsg_set_error_notifier function for setting error_notifier
for all channels of a tsg.

Add nvgpu_tsg_timeout_debug_dump_state function for finding if
timeout_debug_dump is set for any of the channels of a tsg.

Add nvgpu_tsg_set_timeout_accumulated_ms to set timeout_accumulated_ms
for all the channels of a tsg.

JIRA NVGPU-1312

Change-Id: Ib2daf2d462c2cf767f5a6e6fd3436abf6860091d
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2077626
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
commit 27e3546175 (parent 540241e47c), committed by mobile promotions
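For context, the effect of this re-org is that callers stop open-coding the per-channel walk under tsg->ch_list_lock and instead use the TSG-level helpers added below. The following is a minimal illustrative sketch, not part of the patch: the wrapper function name and its parameters are hypothetical, and only the three nvgpu_tsg_* helpers (plus g->fifo_eng_timeout_us, which appears in the diff) are taken from this change.

/* Illustrative sketch only: condenses how the nvgpu_tsg_check_ctxsw_timeout()
 * path uses the new TSG-level helpers after this change. The function name
 * and parameters here are hypothetical.
 */
static bool handle_tsg_ctxsw_timeout(struct gk20a *g, struct tsg_gk20a *tsg,
                bool recover, bool progress, u32 *ms)
{
        bool verbose = false;

        if (recover) {
                /* Set the idle-timeout notifier on every channel of the TSG. */
                nvgpu_tsg_set_error_notifier(g, tsg,
                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
                /* Dump debug state if any channel in the TSG requested it. */
                verbose = nvgpu_tsg_timeout_debug_dump_state(tsg);
        } else if (progress) {
                /* Some channel made progress: restart the accumulated
                 * timeout on all channels of the TSG.
                 */
                *ms = g->fifo_eng_timeout_us / 1000U;
                nvgpu_tsg_set_timeout_accumulated_ms(tsg, *ms);
        }

        return verbose;
}

The same helpers replace the hand-rolled channel-list loops in gk20a_gr_set_error_notifier() and nvgpu_tsg_set_ctx_mmu_error(), so the ch_list_lock handling for each operation lives in exactly one place.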
@@ -1397,12 +1397,18 @@ bool nvgpu_channel_mark_error(struct gk20a *g, struct channel_gk20a *ch)
         return verbose;
 }
 
+void nvgpu_channel_set_error_notifier(struct gk20a *g, struct channel_gk20a *ch,
+                u32 error_notifier)
+{
+        g->ops.fifo.set_error_notifier(ch, error_notifier);
+}
+
 void nvgpu_channel_set_ctx_mmu_error(struct gk20a *g,
                 struct channel_gk20a *ch)
 {
         nvgpu_err(g,
                 "channel %d generated a mmu fault", ch->chid);
-        g->ops.fifo.set_error_notifier(ch,
+        nvgpu_channel_set_error_notifier(g, ch,
                 NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
 }
 
@@ -1441,7 +1447,7 @@ bool nvgpu_channel_check_ctxsw_timeout(struct channel_gk20a *ch,
         *verbose = ch->timeout_debug_dump;
         *ms = ch->timeout_accumulated_ms;
         if (recover) {
-                g->ops.fifo.set_error_notifier(ch,
+                nvgpu_channel_set_error_notifier(g, ch,
                         NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
         }
 
@@ -354,22 +354,60 @@ bool nvgpu_tsg_mark_error(struct gk20a *g,
 
 }
 
-void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g,
-                struct tsg_gk20a *tsg)
+void nvgpu_tsg_set_timeout_accumulated_ms(struct tsg_gk20a *tsg, u32 ms)
 {
         struct channel_gk20a *ch = NULL;
 
-        nvgpu_err(g, "TSG %d generated a mmu fault", tsg->tsgid);
-
         nvgpu_rwsem_down_read(&tsg->ch_list_lock);
         nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
                 if (gk20a_channel_get(ch) != NULL) {
-                        nvgpu_channel_set_ctx_mmu_error(g, ch);
+                        ch->timeout_accumulated_ms = ms;
                         gk20a_channel_put(ch);
                 }
         }
         nvgpu_rwsem_up_read(&tsg->ch_list_lock);
 }
 
+bool nvgpu_tsg_timeout_debug_dump_state(struct tsg_gk20a *tsg)
+{
+        struct channel_gk20a *ch = NULL;
+        bool verbose = false;
+
+        nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+        nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
+                if (gk20a_channel_get(ch) != NULL) {
+                        if (ch->timeout_debug_dump) {
+                                verbose = true;
+                        }
+                        gk20a_channel_put(ch);
+                }
+        }
+        nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+
+        return verbose;
+}
+
+void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct tsg_gk20a *tsg,
+                u32 error_notifier)
+{
+        struct channel_gk20a *ch = NULL;
+
+        nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+        nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
+                if (gk20a_channel_get(ch) != NULL) {
+                        nvgpu_channel_set_error_notifier(g, ch, error_notifier);
+                        gk20a_channel_put(ch);
+                }
+        }
+        nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+}
+
+void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g, struct tsg_gk20a *tsg)
+{
+        nvgpu_err(g, "TSG %d generated a mmu fault", tsg->tsgid);
+
+        nvgpu_tsg_set_error_notifier(g, tsg,
+                NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
+}
+
 bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg,
@@ -411,17 +449,9 @@ bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg,
                         tsg->tsgid, ch->chid);
                 *ms = ch->timeout_accumulated_ms;
                 gk20a_channel_put(ch);
-                nvgpu_list_for_each_entry(ch, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch) != NULL) {
-                                ch->g->ops.fifo.set_error_notifier(ch,
-                                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
-                                if (ch->timeout_debug_dump) {
-                                        *verbose = true;
-                                }
-                                gk20a_channel_put(ch);
-                        }
-                }
+                nvgpu_tsg_set_error_notifier(g, tsg,
+                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
+                *verbose = nvgpu_tsg_timeout_debug_dump_state(tsg);
         } else if (progress) {
                 /*
                  * if at least one channel in the TSG made some progress, reset
@@ -433,13 +463,7 @@ bool nvgpu_tsg_check_ctxsw_timeout(struct tsg_gk20a *tsg,
                         tsg->tsgid, ch->chid);
                 gk20a_channel_put(ch);
                 *ms = g->fifo_eng_timeout_us / 1000U;
-                nvgpu_list_for_each_entry(ch, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch) != NULL) {
-                                ch->timeout_accumulated_ms = *ms;
-                                gk20a_channel_put(ch);
-                        }
-                }
+                nvgpu_tsg_set_timeout_accumulated_ms(tsg, *ms);
         }
 
         /* if we could not detect progress on any of the channel, but none
@@ -2938,7 +2938,6 @@ static void gk20a_gr_set_error_notifier(struct gk20a *g,
 {
         struct channel_gk20a *ch;
         struct tsg_gk20a *tsg;
-        struct channel_gk20a *ch_tsg;
 
         ch = isr_data->ch;
 
@@ -2948,16 +2947,7 @@ static void gk20a_gr_set_error_notifier(struct gk20a *g,
 
         tsg = tsg_gk20a_from_ch(ch);
         if (tsg != NULL) {
-                nvgpu_rwsem_down_read(&tsg->ch_list_lock);
-                nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
-                                channel_gk20a, ch_entry) {
-                        if (gk20a_channel_get(ch_tsg) != NULL) {
-                                g->ops.fifo.set_error_notifier(ch_tsg,
-                                                error_notifier);
-                                gk20a_channel_put(ch_tsg);
-                        }
-                }
-                nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+                nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
         } else {
                 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
         }
 
@@ -496,4 +496,7 @@ static inline u64 gk20a_channel_userd_gpu_va(struct channel_gk20a *c)
         struct nvgpu_mem *mem = c->userd_mem;
         return (mem->gpu_va != 0ULL) ? mem->gpu_va + c->userd_offset : 0ULL;
 }
+
+void nvgpu_channel_set_error_notifier(struct gk20a *g, struct channel_gk20a *ch,
+                u32 error_notifier);
 #endif
@@ -145,4 +145,8 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
                 ((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
 };
 
+void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct tsg_gk20a *tsg,
+                u32 error_notifier);
+bool nvgpu_tsg_timeout_debug_dump_state(struct tsg_gk20a *tsg);
+void nvgpu_tsg_set_timeout_accumulated_ms(struct tsg_gk20a *tsg, u32 ms);
 #endif /* TSG_GK20A_H */