gpu: nvgpu: vgpu: fix tsg_unbind in recovery case

When unbinding a channel from a TSG in the virtualized (vgpu) case,
vgpu_tsg_unbind_channel would return an error if unbinding the channel
on the guest side failed, and it did so before notifying the RM server
of the unbind.

Later in the recovery process, the guest OS would remove the channel
from the TSG's channel list, leaving the RM server with an out-of-date
channel list.

Fix this by making the tsg_unbind_channel HAL optional and implementing
it only for vgpu: the vgpu version now just notifies the RM server so
that it can clean up its version of the TSG. On vgpu, the HAL is now
always called, whether or not the local unbind succeeded.
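
After the change, the common gk20a_tsg_unbind_channel() flow looks
roughly as sketched below (reconstructed from the tsg_gk20a.c hunks in
this commit; the recovery details elided by the diff are only
summarized in comments):

	/* Local (guest-side) unbind always runs first. */
	err = gk20a_fifo_tsg_unbind_channel(ch);
	if (err) {
		nvgpu_err(g, "Channel %d unbind failed, tearing down TSG %d",
			ch->chid, tsg->tsgid);
		/* ...recovery tears down the TSG and removes the channel
		 * from the TSG's channel list... */
	}

	/* The HAL is NULL on native chips and left pointing at the vgpu
	 * implementation on vgpu, so the RM server is notified
	 * unconditionally, even when the local unbind failed and
	 * recovery already ran. */
	if (g->ops.fifo.tsg_unbind_channel != NULL) {
		err = g->ops.fifo.tsg_unbind_channel(ch);
	}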

Minimal port from dev-main of https://git-master.nvidia.com/r/c/linux-nvgpu/+/2084029

Bug 2766920
Bug 200587845

Change-Id: I75bddf3a28ac20bf4fb7510ff64097a32c7eec3f
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2287774
(cherry picked from commit 471c72c1efcc4fe6d547f556edf7773827fd2674)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2289928
Reviewed-by: Thomas Steinle <tsteinle@nvidia.com>
Reviewed-by: Satish Arora <satisha@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Author: Peter Daifuku <pdaifuku@nvidia.com>
Date: 2020-01-30 10:58:54 -08:00
Committed by: mobile promotions
parent e1683ce076
commit ea14973b14
7 changed files with 14 additions and 13 deletions

drivers/gpu/nvgpu/gk20a/tsg_gk20a.c

@@ -158,7 +158,7 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch)
 		return -EINVAL;
 	}
-	err = g->ops.fifo.tsg_unbind_channel(ch);
+	err = gk20a_fifo_tsg_unbind_channel(ch);
 	if (err) {
 		nvgpu_err(g, "Channel %d unbind failed, tearing down TSG %d",
 			ch->chid, tsg->tsgid);
@@ -172,6 +172,11 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch)
 		ch->tsgid = NVGPU_INVALID_TSG_ID;
 		nvgpu_rwsem_up_write(&tsg->ch_list_lock);
 	}
 
+	if (g->ops.fifo.tsg_unbind_channel != NULL) {
+		err = g->ops.fifo.tsg_unbind_channel(ch);
+	}
+
 	nvgpu_log(g, gpu_dbg_fn, "UNBIND tsg:%d channel:%d",
 		tsg->tsgid, ch->chid);

drivers/gpu/nvgpu/gm20b/hal_gm20b.c

@@ -459,7 +459,7 @@ static const struct gpu_ops gm20b_ops = {
 		.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,
 		.tsg_bind_channel = gk20a_tsg_bind_channel,
-		.tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel,
+		.tsg_unbind_channel = NULL,
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,

drivers/gpu/nvgpu/gp106/hal_gp106.c

@@ -1,7 +1,7 @@
 /*
  * GP106 HAL interface
  *
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -527,7 +527,7 @@ static const struct gpu_ops gp106_ops = {
 		.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,
 		.tsg_bind_channel = gk20a_tsg_bind_channel,
-		.tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel,
+		.tsg_unbind_channel = NULL,
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,

drivers/gpu/nvgpu/gp10b/hal_gp10b.c

@@ -496,7 +496,7 @@ static const struct gpu_ops gp10b_ops = {
 		.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
 		.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,
 		.tsg_bind_channel = gk20a_tsg_bind_channel,
-		.tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel,
+		.tsg_unbind_channel = NULL,
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,

drivers/gpu/nvgpu/gv100/hal_gv100.c

@@ -651,7 +651,7 @@ static const struct gpu_ops gv100_ops = {
 		.deinit_eng_method_buffers =
 			gv11b_fifo_deinit_eng_method_buffers,
 		.tsg_bind_channel = gk20a_tsg_bind_channel,
-		.tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel,
+		.tsg_unbind_channel = NULL,
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,

drivers/gpu/nvgpu/gv11b/hal_gv11b.c

@@ -618,7 +618,7 @@ static const struct gpu_ops gv11b_ops = {
 		.deinit_eng_method_buffers =
 			gv11b_fifo_deinit_eng_method_buffers,
 		.tsg_bind_channel = gk20a_tsg_bind_channel,
-		.tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel,
+		.tsg_unbind_channel = NULL,
 		.post_event_id = gk20a_tsg_event_id_post_event,
 		.ch_abort_clean_up = gk20a_channel_abort_clean_up,
 		.check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout,

drivers/gpu/nvgpu/vgpu/tsg_vgpu.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -128,12 +128,8 @@ int vgpu_tsg_unbind_channel(struct channel_gk20a *ch)
 	nvgpu_log_fn(g, " ");
 
-	err = gk20a_fifo_tsg_unbind_channel(ch);
-	if (err)
-		return err;
-
 	msg.cmd = TEGRA_VGPU_CMD_TSG_UNBIND_CHANNEL;
-	msg.handle = vgpu_get_handle(ch->g);
+	msg.handle = vgpu_get_handle(g);
 	p->ch_handle = ch->virt_ctx;
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
 	err = err ? err : msg.ret;
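
For reference, a sketch of vgpu_tsg_unbind_channel() after this change,
assembled from the hunk above; the local variable declarations are
assumed from the surrounding vgpu code rather than shown in this diff:

	int vgpu_tsg_unbind_channel(struct channel_gk20a *ch)
	{
		struct gk20a *g = ch->g;
		struct tegra_vgpu_cmd_msg msg = {};
		struct tegra_vgpu_tsg_bind_unbind_channel_params *p =
					&msg.params.tsg_bind_unbind_channel;
		int err;

		nvgpu_log_fn(g, " ");

		/* Only notify the RM server so it can clean up its TSG;
		 * the guest-side unbind is now handled entirely by the
		 * common gk20a_tsg_unbind_channel(). */
		msg.cmd = TEGRA_VGPU_CMD_TSG_UNBIND_CHANNEL;
		msg.handle = vgpu_get_handle(g);
		p->ch_handle = ch->virt_ctx;
		err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
		err = err ? err : msg.ret;

		return err;
	}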