gpu: nvgpu: add refcounting for ctxsw disable/enable

ctxsw disable could be called recursively for the RM server. Suspend
contexts disables ctxsw at the beginning, then calls tsg disable and
preempt. If the preempt times out, it goes to the recovery path, which
will try to disable ctxsw again. More details in Bug 200331110.

Jira VQRM-2982

Change-Id: I4659c842ae73ed59be51ae65b25366f24abcaf22
Signed-off-by: Shashank Singh <shashsingh@nvidia.com>
Signed-off-by: Richard Zhao <rizhao@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1671716
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Sourab Gupta <sourabg@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Shashank Singh
2017-12-28 11:52:14 +05:30
committed by mobile promotions
parent 02956adcd3
commit db089a73a5
4 changed files with 29 additions and 7 deletions

View File

@@ -62,6 +62,7 @@ static void nvgpu_init_vars(struct gk20a *g)
nvgpu_mutex_init(&g->client_lock);
nvgpu_mutex_init(&g->poweron_lock);
nvgpu_mutex_init(&g->poweroff_lock);
nvgpu_mutex_init(&g->ctxsw_disable_lock);
l->regs_saved = l->regs;
l->bar1_saved = l->bar1;

View File

@@ -76,6 +76,7 @@ static void vgpu_init_vars(struct gk20a *g, struct gk20a_platform *platform)
nvgpu_mutex_init(&g->poweron_lock);
nvgpu_mutex_init(&g->poweroff_lock);
nvgpu_mutex_init(&g->ctxsw_disable_lock);
l->regs_saved = l->regs;
l->bar1_saved = l->bar1;

View File

@@ -1191,6 +1191,9 @@ struct gk20a {
nvgpu_atomic_t usage_count;
struct nvgpu_mutex ctxsw_disable_lock;
int ctxsw_disable_count;
struct nvgpu_ref refcount;
const char *name;

View File

@@ -616,22 +616,39 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
.cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
}
/* Stop processing (stall) context switches at FECS.
* The caller must hold the dbg_sessions_lock, else if mutliple stop methods
* are sent to the ucode in sequence, it can get into an undefined state. */
/* Stop processing (stall) context switches at FECS. */
int gr_gk20a_disable_ctxsw(struct gk20a *g)
{
int err = 0;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
return gr_gk20a_ctrl_ctxsw(g,
nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
g->ctxsw_disable_count++;
if (g->ctxsw_disable_count == 1)
err = gr_gk20a_ctrl_ctxsw(g,
gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
nvgpu_mutex_release(&g->ctxsw_disable_lock);
return err;
}
/* Start processing (continue) context switches at FECS */
int gr_gk20a_enable_ctxsw(struct gk20a *g)
{
int err = 0;
gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
return gr_gk20a_ctrl_ctxsw(g,
nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
g->ctxsw_disable_count--;
WARN_ON(g->ctxsw_disable_count < 0);
if (g->ctxsw_disable_count == 0)
err = gr_gk20a_ctrl_ctxsw(g,
gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
nvgpu_mutex_release(&g->ctxsw_disable_lock);
return err;
}
int gr_gk20a_halt_pipe(struct gk20a *g)