gpu: nvgpu: add hal to mask/unmask intr during teardown

ctxsw timeout error prevents recovery, as it can get retriggered
periodically while recovery is in progress. Disable the ctxsw timeout
interrupt during teardown to allow recovery, and re-enable it once
teardown is done.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I47470e13968d8b26cdaf519b62fd510bc7ea05d9
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2019645
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry picked from commit 68c13e2f04 in dev-kernel)
Reviewed-on: https://git-master.nvidia.com/r/2024899
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Seema Khowala authored on 2019-02-14 13:36:19 -08:00; committed by mobile promotions
parent 9bde6f8950
commit ef69df6dae
11 changed files with 90 additions and 18 deletions
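The change itself is mechanical: the interrupt masking that gk20a_fifo_teardown_ch_tsg used to do inline is moved behind two new per-chip HAL hooks, teardown_mask_intr and teardown_unmask_intr, and the teardown paths call these hooks at the start and end of recovery. The sketch below is a minimal, self-contained illustration of that mask/recover/unmask flow; apart from the hook names, everything in it (struct fifo_ops, the chip_* functions, the printf placeholders) is a stand-in for illustration, not actual nvgpu code.

#include <stdio.h>

/*
 * Stand-in for the per-chip fifo HAL; only the two hooks added by this
 * change are modeled here.
 */
struct fifo_ops {
	void (*teardown_mask_intr)(void *g);
	void (*teardown_unmask_intr)(void *g);
};

/*
 * Hypothetical chip-specific implementations. The real ones do a
 * read-modify-write of the interrupt enable register and ack any
 * pending interrupt, as the hunks below show.
 */
static void chip_teardown_mask_intr(void *g)
{
	(void)g;
	printf("mask: disable interrupts that would retrigger during recovery\n");
}

static void chip_teardown_unmask_intr(void *g)
{
	(void)g;
	printf("unmask: re-enable those interrupts once recovery is done\n");
}

/*
 * Simplified teardown flow mirroring the teardown_ch_tsg paths:
 * mask first, run the recovery steps, then unmask before releasing locks.
 */
static void teardown_ch_tsg(const struct fifo_ops *ops, void *g)
{
	ops->teardown_mask_intr(g);
	/* ... preempt, abort the TSG, reset engines, handle the fault ... */
	ops->teardown_unmask_intr(g);
}

int main(void)
{
	const struct fifo_ops ops = {
		.teardown_mask_intr = chip_teardown_mask_intr,
		.teardown_unmask_intr = chip_teardown_unmask_intr,
	};

	teardown_ch_tsg(&ops, NULL);
	return 0;
}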


@@ -1980,6 +1980,27 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
nvgpu_mutex_release(&g->dbg_sessions_lock);
}
void gk20a_fifo_teardown_mask_intr(struct gk20a *g)
{
u32 val;
val = gk20a_readl(g, fifo_intr_en_0_r());
val &= ~(fifo_intr_en_0_sched_error_m() |
fifo_intr_en_0_mmu_fault_m());
gk20a_writel(g, fifo_intr_en_0_r(), val);
gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
}
void gk20a_fifo_teardown_unmask_intr(struct gk20a *g)
{
u32 val;
val = gk20a_readl(g, fifo_intr_en_0_r());
val |= fifo_intr_en_0_mmu_fault_f(1) | fifo_intr_en_0_sched_error_f(1);
gk20a_writel(g, fifo_intr_en_0_r(), val);
}
void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
u32 hw_id, unsigned int id_type, unsigned int rc_type,
struct mmu_fault_info *mmfault)
@@ -1987,7 +2008,6 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
unsigned long engine_id, i;
unsigned long _engine_ids = __engine_ids;
unsigned long engine_ids = 0;
u32 val;
u32 mmu_fault_engines = 0;
u32 ref_type;
u32 ref_id;
@@ -2048,25 +2068,12 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
}
if (mmu_fault_engines) {
/*
* sched error prevents recovery, and ctxsw error will retrigger
* every 100ms. Disable the sched error to allow recovery.
*/
val = gk20a_readl(g, fifo_intr_en_0_r());
val &= ~(fifo_intr_en_0_sched_error_m() |
fifo_intr_en_0_mmu_fault_m());
gk20a_writel(g, fifo_intr_en_0_r(), val);
gk20a_writel(g, fifo_intr_0_r(),
fifo_intr_0_sched_error_reset_f());
g->ops.fifo.teardown_mask_intr(g);
g->ops.fifo.trigger_mmu_fault(g, engine_ids);
gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines, ref_id,
ref_id_is_tsg);
val = gk20a_readl(g, fifo_intr_en_0_r());
val |= fifo_intr_en_0_mmu_fault_f(1)
| fifo_intr_en_0_sched_error_f(1);
gk20a_writel(g, fifo_intr_en_0_r(), val);
g->ops.fifo.teardown_unmask_intr(g);
}
nvgpu_log_info(g, "release runlist_lock for all runlists");


@@ -426,6 +426,8 @@ bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
bool *verbose, u32 *ms);
bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
bool *verbose, u32 *ms);
void gk20a_fifo_teardown_mask_intr(struct gk20a *g);
void gk20a_fifo_teardown_unmask_intr(struct gk20a *g);
bool gk20a_fifo_handle_sched_error(struct gk20a *g);
void gk20a_fifo_reset_pbdma_method(struct gk20a *g, int pbdma_id,


@@ -448,6 +448,8 @@ static const struct gpu_ops gm20b_ops = {
.init_pbdma_intr_descs = gm20b_fifo_init_pbdma_intr_descs,
.reset_enable_hw = gk20a_init_fifo_reset_enable_hw,
.teardown_ch_tsg = gk20a_fifo_teardown_ch_tsg,
.teardown_mask_intr = gk20a_fifo_teardown_mask_intr,
.teardown_unmask_intr = gk20a_fifo_teardown_unmask_intr,
.handle_sched_error = gk20a_fifo_handle_sched_error,
.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,


@@ -488,6 +488,8 @@ static const struct gpu_ops gp10b_ops = {
.init_pbdma_intr_descs = gp10b_fifo_init_pbdma_intr_descs,
.reset_enable_hw = gk20a_init_fifo_reset_enable_hw,
.teardown_ch_tsg = gk20a_fifo_teardown_ch_tsg,
.teardown_mask_intr = gk20a_fifo_teardown_mask_intr,
.teardown_unmask_intr = gk20a_fifo_teardown_unmask_intr,
.handle_sched_error = gk20a_fifo_handle_sched_error,
.handle_pbdma_intr_0 = gk20a_fifo_handle_pbdma_intr_0,
.handle_pbdma_intr_1 = gk20a_fifo_handle_pbdma_intr_1,


@@ -1,7 +1,7 @@
/*
* GV100 fifo
*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -55,3 +55,21 @@ void gv100_apply_ctxsw_timeout_intr(struct gk20a *g)
gk20a_writel(g, fifo_eng_timeout_r(), timeout);
}
void gv100_fifo_teardown_mask_intr(struct gk20a *g)
{
u32 val;
val = gk20a_readl(g, fifo_intr_en_0_r());
val &= ~(fifo_intr_en_0_sched_error_m());
gk20a_writel(g, fifo_intr_en_0_r(), val);
gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
}
void gv100_fifo_teardown_unmask_intr(struct gk20a *g)
{
u32 val;
val = gk20a_readl(g, fifo_intr_en_0_r());
val |= fifo_intr_en_0_sched_error_f(1);
gk20a_writel(g, fifo_intr_en_0_r(), val);
}


@@ -1,7 +1,7 @@
/*
* GV100 Fifo
*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -31,4 +31,6 @@ struct gk20a;
u32 gv100_fifo_get_num_fifos(struct gk20a *g);
u32 gv100_fifo_get_preempt_timeout(struct gk20a *g);
void gv100_apply_ctxsw_timeout_intr(struct gk20a *g);
void gv100_fifo_teardown_mask_intr(struct gk20a *g);
void gv100_fifo_teardown_unmask_intr(struct gk20a *g);
#endif


@@ -639,6 +639,8 @@ static const struct gpu_ops gv100_ops = {
.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs,
.reset_enable_hw = gk20a_init_fifo_reset_enable_hw,
.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg,
.teardown_mask_intr = gv100_fifo_teardown_mask_intr,
.teardown_unmask_intr = gv100_fifo_teardown_unmask_intr,
.handle_sched_error = gk20a_fifo_handle_sched_error,
.handle_pbdma_intr_0 = gv11b_fifo_handle_pbdma_intr_0,
.handle_pbdma_intr_1 = gv11b_fifo_handle_pbdma_intr_1,


@@ -982,6 +982,33 @@ static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
}
}
void gv11b_fifo_teardown_mask_intr(struct gk20a *g)
{
u32 val;
/*
* ctxsw timeout error prevents recovery, and ctxsw error will retrigger
* every 100ms. Disable ctxsw timeout error to allow recovery.
*/
val = gk20a_readl(g, fifo_intr_en_0_r());
val &= ~ fifo_intr_0_ctxsw_timeout_pending_f();
gk20a_writel(g, fifo_intr_en_0_r(), val);
gk20a_writel(g, fifo_intr_ctxsw_timeout_r(),
gk20a_readl(g, fifo_intr_ctxsw_timeout_r()));
}
void gv11b_fifo_teardown_unmask_intr(struct gk20a *g)
{
u32 val;
/* enable ctxsw timeout interrupt */
val = gk20a_readl(g, fifo_intr_en_0_r());
val |= fifo_intr_0_ctxsw_timeout_pending_f();
gk20a_writel(g, fifo_intr_en_0_r(), val);
}
void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
u32 id, unsigned int id_type, unsigned int rc_type,
struct mmu_fault_info *mmfault)
@@ -1001,6 +1028,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
runlist_lock);
}
g->ops.fifo.teardown_mask_intr(g);
/* get runlist id and tsg */
if (id_type == ID_TYPE_TSG) {
if (id != FIFO_INVAL_TSG_ID) {
@@ -1195,6 +1224,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
nvgpu_pmu_enable_elpg(g);
}
g->ops.fifo.teardown_unmask_intr(g);
/* release runlist_lock */
if (runlist_id != FIFO_INVAL_RUNLIST_ID) {
nvgpu_log_fn(g, "release runlist_lock runlist_id = %d",


@@ -87,6 +87,8 @@ int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg);
void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
u32 id, unsigned int id_type, unsigned int rc_type,
struct mmu_fault_info *mmfault);
void gv11b_fifo_teardown_mask_intr(struct gk20a *g);
void gv11b_fifo_teardown_unmask_intr(struct gk20a *g);
void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f);
int gv11b_init_fifo_reset_enable_hw(struct gk20a *g);
bool gv11b_fifo_handle_sched_error(struct gk20a *g);


@@ -601,6 +601,8 @@ static const struct gpu_ops gv11b_ops = {
.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs,
.reset_enable_hw = gv11b_init_fifo_reset_enable_hw,
.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg,
.teardown_mask_intr = gv11b_fifo_teardown_mask_intr,
.teardown_unmask_intr = gv11b_fifo_teardown_unmask_intr,
.handle_sched_error = gv11b_fifo_handle_sched_error,
.handle_pbdma_intr_0 = gv11b_fifo_handle_pbdma_intr_0,
.handle_pbdma_intr_1 = gv11b_fifo_handle_pbdma_intr_1,


@@ -718,6 +718,8 @@ struct gpu_ops {
void (*teardown_ch_tsg)(struct gk20a *g, u32 act_eng_bitmask,
u32 id, unsigned int id_type, unsigned int rc_type,
struct mmu_fault_info *mmfault);
void (*teardown_mask_intr)(struct gk20a *g);
void (*teardown_unmask_intr)(struct gk20a *g);
bool (*handle_sched_error)(struct gk20a *g);
bool (*handle_ctxsw_timeout)(struct gk20a *g, u32 fifo_intr);
unsigned int (*handle_pbdma_intr_0)(struct gk20a *g,