From ea14973b149930669a8fe2aa37cdfad41759001d Mon Sep 17 00:00:00 2001 From: Peter Daifuku Date: Thu, 30 Jan 2020 10:58:54 -0800 Subject: [PATCH] gpu: nvgpu: vgpu: fix tsg_unbind in recovery case When unbinding a channel from a tsg when virtual, vgpu_tsg_unbind_channel would return an error if unbinding the channel on the guest side failed, and did so before notifying the RM server of the unbind. Later on in the recovery process, the guest OS would remove the channel from the TSG's list, but this would leave the RM server with an out-of-date channel list. Fix this by making the tsg_unbind_channel HAL optional and implemented only for vgpu: the vgpu version now just notifies the RM server so that it can clean up its version of the TSG; if vgpu, always call the tsg_unbind_channel HAL whether or not the local unbind succeeded. Minimal port from dev-main of https://git-master.nvidia.com/r/c/linux-nvgpu/+/2084029 Bug 2766920 Bug 200587845 Change-Id: I75bddf3a28ac20bf4fb7510ff64097a32c7eec3f Signed-off-by: Peter Daifuku Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2287774 (cherry picked from commit 471c72c1efcc4fe6d547f556edf7773827fd2674) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2289928 Reviewed-by: Thomas Steinle Reviewed-by: Satish Arora Reviewed-by: svc-mobile-coverity Reviewed-by: mobile promotions Tested-by: mobile promotions GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/common/fifo/tsg.c | 7 ++++++- drivers/gpu/nvgpu/gm20b/hal_gm20b.c | 2 +- drivers/gpu/nvgpu/gp106/hal_gp106.c | 4 ++-- drivers/gpu/nvgpu/gp10b/hal_gp10b.c | 2 +- drivers/gpu/nvgpu/gv100/hal_gv100.c | 2 +- drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 2 +- drivers/gpu/nvgpu/vgpu/tsg_vgpu.c | 8 ++------ 7 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c index 65cee225b..f6c718f04 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg.c +++ b/drivers/gpu/nvgpu/common/fifo/tsg.c @@ -158,7 +158,7 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch) return -EINVAL; } - err = g->ops.fifo.tsg_unbind_channel(ch); + err = gk20a_fifo_tsg_unbind_channel(ch); if (err) { nvgpu_err(g, "Channel %d unbind failed, tearing down TSG %d", ch->chid, tsg->tsgid); @@ -172,6 +172,11 @@ int gk20a_tsg_unbind_channel(struct channel_gk20a *ch) ch->tsgid = NVGPU_INVALID_TSG_ID; nvgpu_rwsem_up_write(&tsg->ch_list_lock); } + + if (g->ops.fifo.tsg_unbind_channel != NULL) { + err = g->ops.fifo.tsg_unbind_channel(ch); + } + nvgpu_log(g, gpu_dbg_fn, "UNBIND tsg:%d channel:%d", tsg->tsgid, ch->chid); diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c index 9898c6836..c470f5203 100644 --- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c @@ -459,7 +459,7 @@ static const struct gpu_ops gm20b_ops = { .handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0, .handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1, .tsg_bind_channel = gk20a_tsg_bind_channel, - .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, + .tsg_unbind_channel = NULL, .post_event_id = gk20a_tsg_event_id_post_event, .ch_abort_clean_up = gk20a_channel_abort_clean_up, .check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout, diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c index d033a5167..17f3ccf57 100644 --- a/drivers/gpu/nvgpu/gp106/hal_gp106.c +++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c @@ -1,7 +1,7 @@ /* * GP106 HAL interface * - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -527,7 +527,7 @@ static const struct gpu_ops gp106_ops = { .handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0, .handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1, .tsg_bind_channel = gk20a_tsg_bind_channel, - .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, + .tsg_unbind_channel = NULL, .post_event_id = gk20a_tsg_event_id_post_event, .ch_abort_clean_up = gk20a_channel_abort_clean_up, .check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout, diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index 3cdba8acc..d3409b092 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -496,7 +496,7 @@ static const struct gpu_ops gp10b_ops = { .handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0, .handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1, .tsg_bind_channel = gk20a_tsg_bind_channel, - .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, + .tsg_unbind_channel = NULL, .post_event_id = gk20a_tsg_event_id_post_event, .ch_abort_clean_up = gk20a_channel_abort_clean_up, .check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout, diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index 0e0417a06..696316f5c 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -651,7 +651,7 @@ static const struct gpu_ops gv100_ops = { .deinit_eng_method_buffers = gv11b_fifo_deinit_eng_method_buffers, .tsg_bind_channel = gk20a_tsg_bind_channel, - .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, + .tsg_unbind_channel = NULL, .post_event_id = gk20a_tsg_event_id_post_event, .ch_abort_clean_up = gk20a_channel_abort_clean_up, .check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout, diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index f7fabf682..2f7b4abca 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -618,7 +618,7 @@ static const struct gpu_ops gv11b_ops = { .deinit_eng_method_buffers = gv11b_fifo_deinit_eng_method_buffers, .tsg_bind_channel = gk20a_tsg_bind_channel, - .tsg_unbind_channel = gk20a_fifo_tsg_unbind_channel, + .tsg_unbind_channel = NULL, .post_event_id = gk20a_tsg_event_id_post_event, .ch_abort_clean_up = gk20a_channel_abort_clean_up, .check_tsg_ctxsw_timeout = gk20a_fifo_check_tsg_ctxsw_timeout, diff --git a/drivers/gpu/nvgpu/vgpu/tsg_vgpu.c b/drivers/gpu/nvgpu/vgpu/tsg_vgpu.c index 3553bf51d..d6060e06f 100644 --- a/drivers/gpu/nvgpu/vgpu/tsg_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/tsg_vgpu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -128,12 +128,8 @@ int vgpu_tsg_unbind_channel(struct channel_gk20a *ch) nvgpu_log_fn(g, " "); - err = gk20a_fifo_tsg_unbind_channel(ch); - if (err) - return err; - msg.cmd = TEGRA_VGPU_CMD_TSG_UNBIND_CHANNEL; - msg.handle = vgpu_get_handle(ch->g); + msg.handle = vgpu_get_handle(g); p->ch_handle = ch->virt_ctx; err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); err = err ? err : msg.ret;