gpu: nvgpu: fix use-after-free in the CE APP destroy path

The following issue is reported when running `sudo modprobe -r nvgpu`:

[  134.066392] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058
[  134.066428] Mem abort info:
[  134.066431]   ESR = 0x96000004
[  134.066434]   EC = 0x25: DABT (current EL), IL = 32 bit
[  134.066450] [0000000000000058] pgd=0000000000000000, p4d=0000000000000000
[  134.066459] Internal error: Oops: 96000004 [#1] PREEMPT_RT SMP

[  134.066639] pc : nvgpu_cic_rm_wait_for_stall_interrupts+0x78/0xd0 [nvgpu]
[  134.066847] lr : nvgpu_cic_rm_wait_for_stall_interrupts+0x74/0xd0 [nvgpu]
[  134.067043] sp : ffff80001971ba80
[  134.067046] x29: ffff80001971ba80 x28: ffff000093b0da00
[  134.067054] x27: 0000000000000000 x26: ffff80001c28b990
[  134.067061] x25: ffff00008cd01000 x24: 0000000000000bb8
[  134.067067] x23: 0000000000000000 x22: ffff0000915b0000
[  134.067073] x21: ffff000093b0da00 x20: ffff0000915b0000
[  134.067079] x19: ffff0000915b0000 x18: 0000000000000036
[  134.067085] x17: 0000000000000000 x16: 0000000000000000
[  134.067091] x15: ffff8000126b5fd8 x14: 7373616c633d4d45
[  134.067097] x13: ffff8000098abef0 x12: 0000000000000000
[  134.067102] x11: ffff8000098ab5a0 x10: ffff8000098abef8
[  134.067108] x9 : ffff80001010e844 x8 : ffff80001971ba48
[  134.067115] x7 : 2222222222222222 x6 : ffff000093b0da00
[  134.067122] x5 : ffff8000098b1fd8 x4 : 0000000000000000
[  134.067127] x3 : 0000000000000000 x2 : 0000000000000000
[  134.067133] x1 : 0000000000000000 x0 : 0000000000000000
[  134.067138] Call trace:
[  134.067140]  nvgpu_cic_rm_wait_for_stall_interrupts+0x78/0xd0 [nvgpu]
[  134.067328]  nvgpu_cic_rm_wait_for_deferred_interrupts+0x20/0xb0 [nvgpu]
[  134.067517]  nvgpu_channel_deferred_reset_engines+0x29c/0x920 [nvgpu]
[  134.067714]  nvgpu_channel_close+0x18/0x20 [nvgpu]
[  134.067904]  nvgpu_init_pramin+0x2ac/0x350 [nvgpu]
[  134.068092]  nvgpu_ce_app_destroy+0x94/0xe0 [nvgpu]
[  134.068279]  nvgpu_put+0x90/0x120 [nvgpu]
[  134.068465]  nvgpu_pci_shutdown+0x29c/0x18a0 [nvgpu]
[  134.068655]  pci_device_remove+0x44/0xe0
[  134.068665]  device_release_driver_internal+0x114/0x1f0
[  134.068701]  driver_detach+0x54/0xe0
[  134.068709]  bus_remove_driver+0x70/0x120
[  134.068733]  driver_unregister+0x34/0x60

The above issue occurs because the CIC resources are freed before the
users that depend on interrupts (e.g. CDE, CE) have been torn down.

As a solution, move the CIC deinit sequence to the end of nvgpu_put.
This handles deinit properly for the VGPU, IGPU and DGPU cases.

Bug 200763510

Change-Id: I696e31d5e03a9468cccfe710048000dbf7cf0269
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2592063
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Debarshi Dutta
2021-09-09 12:38:30 +05:30
committed by mobile promotions
parent aa08389240
commit 9328f057a7
3 changed files with 4 additions and 20 deletions

View File

@@ -1102,6 +1102,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount)
g->ops.ltc.ltc_remove_support(g);
}
(void)nvgpu_cic_rm_deinit_vars(g);
(void)nvgpu_cic_mon_remove(g);
(void)nvgpu_cic_rm_remove(g);
/*
* Free the device list once the gk20a struct is removed. We don't want
* to do this during the railgate poweroff sequence since that means

View File

@@ -1919,12 +1919,6 @@ int nvgpu_remove(struct device *dev)
nvgpu_mutex_destroy(&g->clk_arb_enable_lock);
err = nvgpu_cic_rm_deinit_vars(g);
if (err != 0) {
nvgpu_err(g, "CIC-RM deinit vars failed.");
return err;
}
nvgpu_log_fn(g, "removed");
return err;
@@ -1942,18 +1936,6 @@ static int __exit gk20a_remove(struct platform_device *pdev)
err = nvgpu_remove(dev);
err = nvgpu_cic_mon_remove(g);
if (err != 0) {
nvgpu_err(g, "CIC-MON remove failed");
return err;
}
err = nvgpu_cic_rm_remove(g);
if (err != 0) {
nvgpu_err(g, "CIC-RM remove failed.");
return err;
}
gk20a_dma_buf_priv_list_clear(l);
nvgpu_mutex_destroy(&l->dmabuf_priv_list_lock);

View File

@@ -780,8 +780,6 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)
nvgpu_enable_irqs(g);
}
#endif
(void)nvgpu_cic_mon_remove(g);
(void)nvgpu_cic_rm_remove(g);
nvgpu_pci_pm_deinit(&pdev->dev);