gpu: nvgpu: Fix the race between runtime PM and L2 flush

gk20a_mm_l2_flush flushes the L2 cache when "struct gk20a->power_on" is true. But it doesn't acquire the power lock while doing so, which creates a race in which runtime PM might suspend the GPU in the middle of an L2 flush. The FB flush appears to have the same issue as the L2 flush. This patch fixes that by calling pm_runtime_get_if_in_use at the beginning of the ioctl. This PM API does a compare-and-add on the usage count. If the device is not in use, it simply returns without incrementing the usage count, since it's unnecessary to wake up the GPU (e.g. with a call to gk20a_busy()) because the caches are flushed when the device is resumed anyway. Bug 2643951 Change-Id: I2417f7ca3223c722dcb4d9057d32a7e065b9e574 Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2151532 GVS: Gerrit_Virtual_Submit Reviewed-by: Mark Zhang <markz@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 09:12:24 +03:00 · 2019-07-17 17:23:42 +05:30
parent 9fdb446b47
commit 47f6bc0c2e
3 changed files with 17 additions and 2 deletions
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -1747,6 +1747,7 @@ enum {
 #define GK20A_SIM_IORESOURCE_MEM 2

 void gk20a_busy_noresume(struct gk20a *g);
+int gk20a_busy_try_noresume(struct gk20a *g);
 void gk20a_idle_nosuspend(struct gk20a *g);
 int __must_check gk20a_busy(struct gk20a *g);
 void gk20a_idle(struct gk20a *g);
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -568,19 +568,28 @@ static int gk20a_ctrl_get_fbp_l2_masks(
 static int nvgpu_gpu_ioctl_l2_fb_ops(struct gk20a *g,
 		struct nvgpu_gpu_l2_fb_args *args)
 {
-	int err = 0;
+	int ret = 0;

 	if ((!args->l2_flush && !args->fb_flush) ||
 	    (!args->l2_flush && args->l2_invalidate))
 		return -EINVAL;

+	ret = gk20a_busy_try_noresume(g);
+
+	/* GPU not in use: skip, caches are flushed on resume */
+	if (ret == 0)
+		return 0;
+
 	if (args->l2_flush)
 		g->ops.mm.l2_flush(g, args->l2_invalidate ? true : false);

 	if (args->fb_flush)
 		g->ops.mm.fb_flush(g);

-	return err;
+	if (ret > 0)
+		gk20a_idle_nosuspend(g); /* balance the get above */
+
+	return 0;
 }

 static int nvgpu_gpu_ioctl_set_mmu_debug_mode(
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -109,6 +109,11 @@ void gk20a_busy_noresume(struct gk20a *g)
 	pm_runtime_get_noresume(dev_from_gk20a(g));
 }

+int gk20a_busy_try_noresume(struct gk20a *g)
+{
+	return pm_runtime_get_if_in_use(dev_from_gk20a(g)); /* >0: ref held */
+}
+
 /*
 * Check if the device can go busy.
 */