gpu: nvgpu: wait ACK for FECS watchdog timeout

From gv11b onwards, FECS ucode returns an ACK for set watchdog
timeout method. Failure to wait for this ACK was leading to races,
and in some cases, the ACK could be mistaken for the reply to the
next method.

In particular, this happened for the discover golden image size
method which is sent after set watchdog timeout.

With instrumented FECS ucode, it takes longer for the code to
process the set watchdog timeout method, and the write to ack
that method could happen after nvgpu driver clears the mailbox to
send the discover image size method.

With an invalid golden context image size, FECS ended up causing
an MMU fault while attempting to save past allocated buffer.

Added NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT to be used with
gops_gr_falcon.ctrl_ctxsw, and implemented 2 variants:
- gm20b_gr_falcon_ctrl_ctxsw, without ACK
- gv11b_gr_falcon_ctrl_ctxsw, with ACK

Added NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED flag to allow
executing above method without re-acquiring FECS lock. Longer term,
the 'flags' could be added to gops_gr_falcon.ctrl_ctxsw parameters.

Use gops_gr_falcon.ctrl_ctxsw instead of register writes to invoke
set watchdog timeout method in gm20b_gr_falcon_wait_ctxsw_ready.

Also replaced calls to gm20b_gr_falcon_ctrl_ctxsw with
gops_gr_falcon.ctrl_ctxsw where appropriate, since there are
multiple variants (gm20b, gp10b and gv11b).

Last, fixed clearing of mailbox 0 in gm20b_gr_falcon_bind_instblk.

Bug 200586923

Change-Id: I653b9a216555eec8cd4bb01d6f202bc77b75a939
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2287340
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Thomas Fleury
2020-01-29 14:02:57 -05:00
committed by Alex Waterman
parent e21d70574c
commit 9a16bc3fd4
7 changed files with 94 additions and 28 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -47,7 +47,7 @@ void gm20b_gr_falcon_bind_instblk(struct gk20a *g,
int gm20b_gr_falcon_wait_mem_scrubbing(struct gk20a *g);
int gm20b_gr_falcon_wait_ctxsw_ready(struct gk20a *g);
int gm20b_gr_falcon_submit_fecs_method_op(struct gk20a *g,
struct nvgpu_fecs_method_op op, bool sleepduringwait);
struct nvgpu_fecs_method_op op, u32 flags);
int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
u32 data, u32 *ret_val);
void gm20b_gr_falcon_set_current_ctx_invalid(struct gk20a *g);

View File

@@ -127,7 +127,7 @@ void gm20b_gr_falcon_bind_instblk(struct gk20a *g,
FECS_ARB_CMD_TIMEOUT_DEFAULT_US;
u32 inst_ptr_u32;
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), U32_MAX);
while (((nvgpu_readl(g, gr_fecs_ctxsw_status_1_r()) &
gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
@@ -520,11 +520,12 @@ int gm20b_gr_falcon_wait_ctxsw_ready(struct gk20a *g)
}
#endif
nvgpu_log_info(g, "configuring ctxsw_ucode wdt = 0x%x", wdt_val);
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), U32_MAX);
nvgpu_writel(g, gr_fecs_method_data_r(), wdt_val);
nvgpu_writel(g, gr_fecs_method_push_r(),
gr_fecs_method_push_adr_set_watchdog_timeout_f());
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT, wdt_val, NULL);
if (ret != 0) {
nvgpu_err(g, "fail to set watchdog timeout");
return ret;
}
nvgpu_log_fn(g, "done");
return 0;
@@ -537,7 +538,7 @@ int gm20b_gr_falcon_init_ctx_state(struct gk20a *g,
nvgpu_log_fn(g, " ");
ret = gm20b_gr_falcon_ctrl_ctxsw(g,
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_CTXSW_DISCOVER_IMAGE_SIZE,
0, &sizes->golden_image_size);
if (ret != 0) {
@@ -548,7 +549,7 @@ int gm20b_gr_falcon_init_ctx_state(struct gk20a *g,
#if defined(CONFIG_NVGPU_DEBUGGER) || \
defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
ret = gm20b_gr_falcon_ctrl_ctxsw(g,
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_CTXSW_DISCOVER_PM_IMAGE_SIZE,
#ifndef CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING
0, &sizes->pm_ctxsw_image_size);
@@ -565,7 +566,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
ret = gm20b_gr_falcon_ctrl_ctxsw(g,
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_CTXSW_DISCOVER_ZCULL_IMAGE_SIZE,
0, &sizes->zcull_image_size);
if (ret != 0) {
@@ -619,17 +620,21 @@ void gm20b_gr_falcon_set_current_ctx_invalid(struct gk20a *g)
gr_fecs_current_ctx_valid_false_f());
}
/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
/*
* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
* We should replace most, if not all, fecs method calls to this instead.
*/
int gm20b_gr_falcon_submit_fecs_method_op(struct gk20a *g,
struct nvgpu_fecs_method_op op,
bool sleepduringwait)
struct nvgpu_fecs_method_op op, u32 flags)
{
int ret;
struct nvgpu_gr_falcon *gr_falcon = nvgpu_gr_get_falcon_ptr(g);
bool sleepduringwait =
(flags & NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP) != 0U;
nvgpu_mutex_acquire(&gr_falcon->fecs_mutex);
if ((flags & NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED) == 0U) {
nvgpu_mutex_acquire(&gr_falcon->fecs_mutex);
}
if (op.mailbox.id != 0U) {
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
@@ -659,7 +664,9 @@ int gm20b_gr_falcon_submit_fecs_method_op(struct gk20a *g,
op.method.data, op.method.addr);
}
nvgpu_mutex_release(&gr_falcon->fecs_mutex);
if ((flags & NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED) == 0U) {
nvgpu_mutex_release(&gr_falcon->fecs_mutex);
}
return ret;
}
@@ -673,8 +680,8 @@ int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
.method.data = 0U,
.cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
.cond.fail = GR_IS_UCODE_OP_SKIP,
};
bool sleepduringwait = false;
};
u32 flags = 0U;
nvgpu_log_info(g, "fecs method %d data 0x%x ret_value %p",
fecs_method, data, ret_val);
@@ -689,7 +696,7 @@ int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
op.mailbox.fail = gr_fecs_ctxsw_mailbox_value_fail_v();
op.cond.ok = GR_IS_UCODE_OP_EQUAL;
op.cond.fail = GR_IS_UCODE_OP_EQUAL;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
case NVGPU_GR_FALCON_METHOD_CTXSW_START:
@@ -700,7 +707,7 @@ int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
op.mailbox.fail = gr_fecs_ctxsw_mailbox_value_fail_v();
op.cond.ok = GR_IS_UCODE_OP_EQUAL;
op.cond.fail = GR_IS_UCODE_OP_EQUAL;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
#endif
#ifdef CONFIG_NVGPU_ENGINE_RESET
@@ -733,7 +740,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
op.method.addr =
gr_fecs_method_push_adr_discover_pm_image_size_v();
op.mailbox.ret = ret_val;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
#endif
#ifdef CONFIG_NVGPU_POWER_PG
@@ -770,7 +777,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
op.mailbox.fail = 0x20U;
op.cond.ok = GR_IS_UCODE_OP_AND;
op.cond.fail = GR_IS_UCODE_OP_AND;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
case NVGPU_GR_FALCON_METHOD_GOLDEN_IMAGE_SAVE:
@@ -781,7 +788,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
op.mailbox.fail = 0x2U;
op.cond.ok = GR_IS_UCODE_OP_AND;
op.cond.fail = GR_IS_UCODE_OP_AND;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
#ifdef CONFIG_NVGPU_FECS_TRACE
case NVGPU_GR_FALCON_METHOD_FECS_TRACE_FLUSH:
@@ -789,12 +796,20 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
gr_fecs_method_push_adr_write_timestamp_record_v();
break;
#endif
case NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT:
op.method.addr =
gr_fecs_method_push_adr_set_watchdog_timeout_f();
op.method.data = data;
op.cond.ok = GR_IS_UCODE_OP_SKIP;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED;
break;
default:
nvgpu_err(g, "unsupported fecs mode %d", fecs_method);
break;
}
return gm20b_gr_falcon_submit_fecs_method_op(g, op, sleepduringwait);
return gm20b_gr_falcon_submit_fecs_method_op(g, op, flags);
}
u32 gm20b_gr_falcon_get_current_ctx(struct gk20a *g)

View File

@@ -80,7 +80,7 @@ int gp10b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
op.method.addr =
gr_fecs_method_push_adr_discover_preemption_image_size_v();
op.mailbox.ret = ret_val;
ret = gm20b_gr_falcon_submit_fecs_method_op(g, op, false);
ret = gm20b_gr_falcon_submit_fecs_method_op(g, op, 0U);
break;
case NVGPU_GR_FALCON_METHOD_CONFIGURE_CTXSW_INTR:

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -29,5 +29,7 @@ struct nvgpu_fecs_ecc_status;
void gv11b_gr_falcon_handle_fecs_ecc_error(struct gk20a *g,
struct nvgpu_fecs_ecc_status *fecs_ecc_status);
void gv11b_gr_falcon_fecs_host_int_enable(struct gk20a *g);
int gv11b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
u32 data, u32 *ret_val);
#endif /* NVGPU_GR_FALCON_GV11B_H */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,10 @@
#include <nvgpu/io.h>
#include <nvgpu/gr/gr_falcon.h>
#include "gr_falcon_gp10b.h"
#include "gr_falcon_gm20b.h"
#include "gr_falcon_gv11b.h"
#include "common/gr/gr_falcon_priv.h"
#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>
@@ -117,6 +120,43 @@ void gv11b_gr_falcon_handle_fecs_ecc_error(struct gk20a *g,
}
}
/*
 * gv11b variant of gops_gr_falcon.ctrl_ctxsw.
 *
 * From gv11b onwards the FECS ucode returns an ACK for the "set watchdog
 * timeout" method, so this variant submits that method with a mailbox
 * condition that waits for the ACK (default .cond.ok below). Every other
 * method is delegated unchanged to the gp10b implementation.
 *
 * @param g           GPU device pointer.
 * @param fecs_method NVGPU_GR_FALCON_METHOD_* selector.
 * @param data        Method payload (watchdog timeout value for the
 *                    SET_WATCHDOG_TIMEOUT case).
 * @param ret_val     Optional out-parameter for methods that return a
 *                    value; unused by the SET_WATCHDOG_TIMEOUT path.
 *
 * @return 0 on success, negative error code otherwise (as propagated
 *         from the submit/gp10b helpers).
 */
int gv11b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
	u32 data, u32 *ret_val)
{
	/*
	 * Default op: clear all mailbox 0 bits (.clr = ~0) before pushing the
	 * method, then treat any non-zero mailbox value as the ACK
	 * (.cond.ok = NOT_EQUAL) and never match a failure value
	 * (.cond.fail = SKIP).
	 */
	struct nvgpu_fecs_method_op op = {
		.mailbox = { .id = 0U, .data = 0U, .ret = NULL,
			     .clr = ~U32(0U), .ok = 0U, .fail = 0U},
		.method.data = 0U,
		.cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
		.cond.fail = GR_IS_UCODE_OP_SKIP,
	};
	u32 flags = 0U;
	int ret;

	nvgpu_log_info(g, "fecs method %d data 0x%x ret_val %p",
		fecs_method, data, ret_val);

	switch (fecs_method) {
	case NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT:
		op.method.addr =
			gr_fecs_method_push_adr_set_watchdog_timeout_f();
		op.method.data = data;
		/* Caller (wait_ctxsw_ready path) already holds the FECS
		 * mutex; skip re-acquiring it in the submit helper. */
		flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED;
#ifdef CONFIG_NVGPU_SIM
		/* Simulation FECS does not produce the ACK; skip waiting. */
		op.cond.ok = GR_IS_UCODE_OP_SKIP;
#endif
		ret = gm20b_gr_falcon_submit_fecs_method_op(g, op, flags);
		break;

	default:
		/* All other methods behave as on gp10b. */
		ret = gp10b_gr_falcon_ctrl_ctxsw(g, fecs_method,
				data, ret_val);
		break;
	}

	return ret;
}
void gv11b_gr_falcon_fecs_host_int_enable(struct gk20a *g)
{
nvgpu_writel(g, gr_fecs_host_int_enable_r(),

View File

@@ -766,7 +766,7 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
.wait_mem_scrubbing =
gm20b_gr_falcon_wait_mem_scrubbing,
.wait_ctxsw_ready = gm20b_gr_falcon_wait_ctxsw_ready,
.ctrl_ctxsw = gp10b_gr_falcon_ctrl_ctxsw,
.ctrl_ctxsw = gv11b_gr_falcon_ctrl_ctxsw,
.get_current_ctx = gm20b_gr_falcon_get_current_ctx,
.get_ctx_ptr = gm20b_gr_falcon_get_ctx_ptr,
.get_fecs_current_ctx_data =

View File

@@ -73,6 +73,15 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
#define NVGPU_GR_FALCON_METHOD_CONFIGURE_CTXSW_INTR 13
#endif
/** Falcon method to set watchdog timeout. */
#define NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT 14
/** Sleep while waiting for Falcon ACK */
#define NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP BIT32(0)
/** Falcon lock already held */
#define NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED BIT32(1)
/** Falcon index of mailbox 0. */
#define NVGPU_GR_FALCON_FECS_CTXSW_MAILBOX0 0U
/** Falcon index of mailbox 1. */