From 9328f057a726a0af7725e34f94ed9cc28f7696b3 Mon Sep 17 00:00:00 2001 From: Debarshi Dutta Date: Thu, 9 Sep 2021 12:38:30 +0530 Subject: [PATCH] gpu: nvgpu: fix use-after-free use case of CE APP. The following issue is reported when running sudo modprobe -r nvgpu [ 134.066392] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 [ 134.066428] Mem abort info: [ 134.066431] ESR = 0x96000004 [ 134.066434] EC = 0x25: DABT (current EL), IL = 32 bit [ 134.066450] [0000000000000058] pgd=0000000000000000, p4d=0000000000000000 [ 134.066459] Internal error: Oops: 96000004 [#1] PREEMPT_RT SMP [ 134.066639] pc : nvgpu_cic_rm_wait_for_stall_interrupts+0x78/0xd0 [nvgpu] [ 134.066847] lr : nvgpu_cic_rm_wait_for_stall_interrupts+0x74/0xd0 [nvgpu] [ 134.067043] sp : ffff80001971ba80 [ 134.067046] x29: ffff80001971ba80 x28: ffff000093b0da00 [ 134.067054] x27: 0000000000000000 x26: ffff80001c28b990 [ 134.067061] x25: ffff00008cd01000 x24: 0000000000000bb8 [ 134.067067] x23: 0000000000000000 x22: ffff0000915b0000 [ 134.067073] x21: ffff000093b0da00 x20: ffff0000915b0000 [ 134.067079] x19: ffff0000915b0000 x18: 0000000000000036 [ 134.067085] x17: 0000000000000000 x16: 0000000000000000 [ 134.067091] x15: ffff8000126b5fd8 x14: 7373616c633d4d45 [ 134.067097] x13: ffff8000098abef0 x12: 0000000000000000 [ 134.067102] x11: ffff8000098ab5a0 x10: ffff8000098abef8 [ 134.067108] x9 : ffff80001010e844 x8 : ffff80001971ba48 [ 134.067115] x7 : 2222222222222222 x6 : ffff000093b0da00 [ 134.067122] x5 : ffff8000098b1fd8 x4 : 0000000000000000 [ 134.067127] x3 : 0000000000000000 x2 : 0000000000000000 [ 134.067133] x1 : 0000000000000000 x0 : 0000000000000000 [ 134.067138] Call trace: [ 134.067140] nvgpu_cic_rm_wait_for_stall_interrupts+0x78/0xd0 [nvgpu] [ 134.067328] nvgpu_cic_rm_wait_for_deferred_interrupts+0x20/0xb0 [nvgpu] [ 134.067517] nvgpu_channel_deferred_reset_engines+0x29c/0x920 [nvgpu] [ 134.067714] nvgpu_channel_close+0x18/0x20 [nvgpu] [ 134.067904] 
nvgpu_init_pramin+0x2ac/0x350 [nvgpu] [ 134.068092] nvgpu_ce_app_destroy+0x94/0xe0 [nvgpu] [ 134.068279] nvgpu_put+0x90/0x120 [nvgpu] [ 134.068465] nvgpu_pci_shutdown+0x29c/0x18a0 [nvgpu] [ 134.068655] pci_device_remove+0x44/0xe0 [ 134.068665] device_release_driver_internal+0x114/0x1f0 [ 134.068701] driver_detach+0x54/0xe0 [ 134.068709] bus_remove_driver+0x70/0x120 [ 134.068733] driver_unregister+0x34/0x60 The above issue occurs because the CIC resources are freed before the users that depend on interrupts (e.g. CDE, CE) have been torn down. As a solution, move the CIC deinit sequence to the end of nvgpu_put (in gk20a_free_cb). This handles deinit properly for VGPU/IGPU/DGPU. Bug 200763510 Change-Id: I696e31d5e03a9468cccfe710048000dbf7cf0269 Signed-off-by: Debarshi Dutta Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2592063 Tested-by: mobile promotions Reviewed-by: mobile promotions --- drivers/gpu/nvgpu/common/init/nvgpu_init.c | 4 ++++ drivers/gpu/nvgpu/os/linux/module.c | 18 ------------------ drivers/gpu/nvgpu/os/linux/pci.c | 2 -- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c index af07361d7..e8f1c5202 100644 --- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c +++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c @@ -1102,6 +1102,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount) g->ops.ltc.ltc_remove_support(g); } + (void)nvgpu_cic_rm_deinit_vars(g); + (void)nvgpu_cic_mon_remove(g); + (void)nvgpu_cic_rm_remove(g); + /* * Free the device list once the gk20a struct is removed. 
We don't want * to do this during the railgate poweroff sequence since that means diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index 1cc01f85e..9b5547707 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -1919,12 +1919,6 @@ int nvgpu_remove(struct device *dev) nvgpu_mutex_destroy(&g->clk_arb_enable_lock); - err = nvgpu_cic_rm_deinit_vars(g); - if (err != 0) { - nvgpu_err(g, "CIC-RM deinit vars failed."); - return err; - } - nvgpu_log_fn(g, "removed"); return err; @@ -1942,18 +1936,6 @@ static int __exit gk20a_remove(struct platform_device *pdev) err = nvgpu_remove(dev); - err = nvgpu_cic_mon_remove(g); - if (err != 0) { - nvgpu_err(g, "CIC-MON remove failed"); - return err; - } - - err = nvgpu_cic_rm_remove(g); - if (err != 0) { - nvgpu_err(g, "CIC-RM remove failed."); - return err; - } - gk20a_dma_buf_priv_list_clear(l); nvgpu_mutex_destroy(&l->dmabuf_priv_list_lock); diff --git a/drivers/gpu/nvgpu/os/linux/pci.c b/drivers/gpu/nvgpu/os/linux/pci.c index 18f4d1191..d7c988ec6 100644 --- a/drivers/gpu/nvgpu/os/linux/pci.c +++ b/drivers/gpu/nvgpu/os/linux/pci.c @@ -780,8 +780,6 @@ static void nvgpu_pci_remove(struct pci_dev *pdev) nvgpu_enable_irqs(g); } #endif - (void)nvgpu_cic_mon_remove(g); - (void)nvgpu_cic_rm_remove(g); nvgpu_pci_pm_deinit(&pdev->dev);