gpu: nvgpu: non-abortable TSG for vidmem-clear

When an engine faults due to an unbound instance block, all
active TSGs are currently aborted. This includes the TSG
used by the vidmem-clear task to clear vidmem buffers. From
this point on, nvgpu_vidmem_clear can no longer submit jobs.

Mark the TSG in the MM CE context as non-abortable, and skip
it when aborting active TSGs.

Bug 2486146

Change-Id: I221259aec468e8ee3a24e80fab8d8fb7ee8607b0
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2008954
(cherry picked from commit 6f2444dc5e128aa2b870796bd1e9dee7853f90af)
Reviewed-on: https://git-master.nvidia.com/r/2008942
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Thomas Fleury
2019-01-31 10:36:05 -08:00
committed by mobile promotions
parent bccb49d8fb
commit 13afcc24c3
5 changed files with 17 additions and 1 deletions

View File

@@ -228,6 +228,7 @@ int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid)
tsg->in_use = false; tsg->in_use = false;
tsg->tsgid = tsgid; tsg->tsgid = tsgid;
tsg->abortable = true;
nvgpu_init_list_node(&tsg->ch_list); nvgpu_init_list_node(&tsg->ch_list);
nvgpu_rwsem_init(&tsg->ch_list_lock); nvgpu_rwsem_init(&tsg->ch_list_lock);

View File

@@ -128,6 +128,7 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
struct nvgpu_list_node *list = &ce_ctx->list; struct nvgpu_list_node *list = &ce_ctx->list;
ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED; ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
ce_ctx->tsg->abortable = true;
nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
@@ -478,6 +479,9 @@ u32 gk20a_ce_create_context(struct gk20a *g,
goto end; goto end;
} }
/* this TSG should never be aborted */
ce_ctx->tsg->abortable = false;
/* always kernel client needs privileged channel */ /* always kernel client needs privileged channel */
ce_ctx->ch = gk20a_open_new_channel(g, runlist_id, true, ce_ctx->ch = gk20a_open_new_channel(g, runlist_id, true,
nvgpu_current_pid(g), nvgpu_current_tid(g)); nvgpu_current_pid(g), nvgpu_current_tid(g));

View File

@@ -1232,6 +1232,8 @@ void gk20a_fifo_abort_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool preempt)
nvgpu_log_fn(g, " "); nvgpu_log_fn(g, " ");
WARN_ON(tsg->abortable == false);
g->ops.fifo.disable_tsg(tsg); g->ops.fifo.disable_tsg(tsg);
if (preempt) { if (preempt) {

View File

@@ -944,8 +944,16 @@ static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
for_each_set_bit(tsgid, runlist->active_tsgs, for_each_set_bit(tsgid, runlist->active_tsgs,
g->fifo.num_channels) { g->fifo.num_channels) {
nvgpu_log(g, gpu_dbg_info, "abort tsg id %lu", tsgid);
tsg = &g->fifo.tsg[tsgid]; tsg = &g->fifo.tsg[tsgid];
if (!tsg->abortable) {
nvgpu_log(g, gpu_dbg_info,
"tsg %lu is not abortable, skipping",
tsgid);
continue;
}
nvgpu_log(g, gpu_dbg_info, "abort tsg id %lu", tsgid);
gk20a_disable_tsg(tsg); gk20a_disable_tsg(tsg);
/* assume all pbdma and eng faulted are set */ /* assume all pbdma and eng faulted are set */

View File

@@ -71,6 +71,7 @@ struct tsg_gk20a {
u8 tpc_pg_enabled; u8 tpc_pg_enabled;
bool tpc_num_initialized; bool tpc_num_initialized;
bool in_use; bool in_use;
bool abortable;
struct nvgpu_tsg_sm_error_state *sm_error_states; struct nvgpu_tsg_sm_error_state *sm_error_states;