gpu: nvgpu: split nvgpu power on sequence into 2 stages

1) nvgpu poweron sequence split into two stages: - nvgpu_early_init() - Initializes the sub units which are required to be initialized before the grmgr init. For creating the dev node, grmgr init and its dependency units need to move to the early stage of GPU power on. After a successful nvgpu_early_init() sequence, NvGpu can identify the number of MIG instances required for each physical GPU. - nvgpu_finalize_poweron() - Initializes the sub units which can be initialized at the later stage of GPU power on sequence. - grmgr init depends on the following HAL sub units, * device - To get the device caps. * priv_ring - To get the gpc count and other MIG config programming. * fb - MIG config programming. * ltc - MIG config programming. * bios, bus, ecc and clk - dependent modules of priv_ring/fb/ltc. 2) g->ops.xve.reset_gpu() should be called before GPU sub unit initialization. Hence, added g->ops.xve.reset_gpu() HAL in the early stage of dGPU power on sequence. 3) Increased xve_reset timeout from 100ms to 200ms. 4) Added nvgpu_assert() for gpc_count, gpc_mask and max_veid_count_per_tsg to identify the GPU boot device probe failure during nvgpu_init_gr_manager(). JIRA NVGPU-6633 Change-Id: I5d43bf711198e6b3f8eebcec3027ba17c15fc692 Signed-off-by: Lakshmanan M <lm@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2521894 Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
2025-12-22 17:36:20 +03:00 · 2021-04-28 21:53:41 +05:30
parent a18be36a15
commit c041ad5b4b
4 changed files with 129 additions and 54 deletions
--- a/drivers/gpu/nvgpu/common/grmgr/grmgr.c
+++ b/drivers/gpu/nvgpu/common/grmgr/grmgr.c
@@ -40,6 +40,7 @@ int nvgpu_init_gr_manager(struct gk20a *g)

 	/* Number of gpu instance is 1 for legacy mode */
 	g->mig.gpc_count = g->ops.priv_ring.get_gpc_count(g);
+	nvgpu_assert(g->mig.gpc_count > 0U);
 	g->mig.num_gpu_instances = 1U;
 	g->mig.current_gpu_instance_config_id = 0U;
 	g->mig.is_nongr_engine_sharable = false;
@@ -57,6 +58,7 @@ int nvgpu_init_gr_manager(struct gk20a *g)
 	g->mig.gpcgrp_gpc_count[0] = gr_syspipe->num_gpc;
 	if (g->ops.gr.config.get_gpc_mask != NULL) {
 		gr_syspipe->gpc_mask = g->ops.gr.config.get_gpc_mask(g);
+		nvgpu_assert(gr_syspipe->gpc_mask != 0U);
 	} else {
 		gr_syspipe->gpc_mask = nvgpu_safe_sub_u32(
 			BIT32(gr_syspipe->num_gpc),
@@ -92,6 +94,7 @@ int nvgpu_init_gr_manager(struct gk20a *g)
 	if (g->ops.gr.init.get_max_subctx_count != NULL) {
 		gr_syspipe->max_veid_count_per_tsg =
 			g->ops.gr.init.get_max_subctx_count();
+		nvgpu_assert(gr_syspipe->max_veid_count_per_tsg > 0U);
 	} else {
 		/*
 		 * For vgpu, NvGpu has to rely on chip constant
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -586,46 +586,16 @@ static bool needs_init(struct gk20a *g, nvgpu_init_func_t func, u32 enable_flag)
 		nvgpu_is_enabled(g, enable_flag)) && (func != NULL);
 }

-int nvgpu_early_poweron(struct gk20a *g)
+static int nvgpu_early_init(struct gk20a *g)
 {
 	int err = 0;
+	size_t i;

-	err = nvgpu_detect_chip(g);
-	if (err != 0) {
-		nvgpu_err(g, "nvgpu_detect_chip failed[%d]", err);
-		goto done;
-	}
-
-	/*
-	 * Initialize the GPU's device list. Needed before NVLINK
-	 * init since the NVLINK IOCTRL block is enumerated in the
-	 * device list.
-	 */
-	err = nvgpu_device_init(g);
-	if (err != 0) {
-		nvgpu_err(g, "nvgpu_device_init failed[%d]", err);
-		goto done;
-	}
-
-	err = g->ops.grmgr.init_gr_manager(g);
-	if (err != 0) {
-		nvgpu_device_cleanup(g);
-		nvgpu_err(g, "g->ops.grmgr.init_gr_manager failed[%d]", err);
-		goto done;
-	}
-
-done:
-	return err;
-}
-
-int nvgpu_finalize_poweron(struct gk20a *g)
-{
-	int err = 0;
 	/*
 	 * This cannot be static because we use the func ptrs as initializers
 	 * and static variables require constant literals for initializers.
 	 */
-	const struct nvgpu_init_table_t nvgpu_init_table[] = {
+	const struct nvgpu_init_table_t nvgpu_early_init_table[] = {
 		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_slcg_acb_load_gating_prod,
 					NO_FLAG),
 		/*
@@ -635,21 +605,7 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 		 * prior to enabling interrupts for corresponding units.
 		 */
 		NVGPU_INIT_TABLE_ENTRY(g->ops.ecc.ecc_init_support, NO_FLAG),
-		/*
-		 * Do this early so any early VMs that get made are capable of
-		 * mapping buffers.
-		 */
-		NVGPU_INIT_TABLE_ENTRY(g->ops.mm.pd_cache_init, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(&nvgpu_falcons_sw_init, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.pmu.pmu_early_init, NO_FLAG),
-
-#ifdef CONFIG_NVGPU_DGPU
-		NVGPU_INIT_TABLE_ENTRY(g->ops.sec2.init_sec2_setup_sw,
-				       NVGPU_SUPPORT_SEC2_RTOS),
-#endif
-		NVGPU_INIT_TABLE_ENTRY(g->ops.acr.acr_init,
-				       NVGPU_SEC_PRIVSECURITY),
-		NVGPU_INIT_TABLE_ENTRY(&nvgpu_sw_quiesce_init_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_device_init, NO_FLAG),
 #ifdef CONFIG_NVGPU_DGPU
 		NVGPU_INIT_TABLE_ENTRY(g->ops.bios.bios_sw_init, NO_FLAG),
 #endif
@@ -666,23 +622,126 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 		 * enabled. For now, do it here.
 		 */
 		NVGPU_INIT_TABLE_ENTRY(g->ops.clk.init_clk_support, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.nvlink.init,
-				       NVGPU_SUPPORT_NVLINK),
 #ifdef CONFIG_NVGPU_DGPU
-		NVGPU_INIT_TABLE_ENTRY(nvgpu_init_fbpa_ecc, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fbpa_ecc, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.fb.init_fbpa, NO_FLAG),
 #endif
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fb_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.ltc.init_ltc_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.grmgr.init_gr_manager, NO_FLAG),
+	};
+
+	for (i = 0; i < ARRAY_SIZE(nvgpu_early_init_table); i++) {
+		if (!needs_init(g, nvgpu_early_init_table[i].func,
+				nvgpu_early_init_table[i].enable_flag)) {
+			nvgpu_log_info(g,
+				"Skipping initializing %s (enable_flag=%u func=%p)",
+				   nvgpu_early_init_table[i].name,
+				   nvgpu_early_init_table[i].enable_flag,
+				   nvgpu_early_init_table[i].func);
+		} else {
+			nvgpu_log_info(g, "Initializing %s",
+					   nvgpu_early_init_table[i].name);
+			err = nvgpu_early_init_table[i].func(g);
+			if (err != 0) {
+				nvgpu_err(g, "Failed initialization for: %s",
+					  nvgpu_early_init_table[i].name);
+				goto done;
+			}
+		}
+	}
+
+done:
+	return err;
+}
+
+int nvgpu_early_poweron(struct gk20a *g)
+{
+	int err = 0;
+
+	err = nvgpu_detect_chip(g);
+	if (err != 0) {
+		nvgpu_err(g, "nvgpu_detect_chip failed[%d]", err);
+		goto done;
+	}
+
+#ifdef CONFIG_NVGPU_DGPU
+	/*
+	 * Before probing the GPU make sure the GPU's state is cleared. This is
+	 * relevant for rebind operations.
+	 */
+	if ((g->ops.xve.reset_gpu != NULL) && !g->gpu_reset_done) {
+		g->ops.xve.reset_gpu(g);
+		g->gpu_reset_done = true;
+	}
+#endif
+
+	/*
+	 * nvgpu poweron sequence split into two stages:
+	 * - nvgpu_early_init() - Initializes the sub units
+	 *   which are required to be initialized before the grmgr init.
+	 *   For creating the dev node, grmgr init and its dependency units
+	 *   need to move to the early stage of GPU power on.
+	 *   After a successful nvgpu_early_init() sequence,
+	 *   NvGpu can identify the number of MIG instances required
+	 *   for each physical GPU.
+	 * - nvgpu_finalize_poweron() - Initializes the sub units which
+	 *   can be initialized at the later stage of GPU power on sequence.
+	 *
+	 * grmgr init depends on the following HAL sub units,
+	 * device - To get the device caps.
+	 * priv_ring - To get the gpc count and other MIG config programming.
+	 * fb - MIG config programming.
+	 * ltc - MIG config programming.
+	 * bios, bus, ecc and clk - dependent module of priv_ring/fb/ltc.
+	 *
+	 */
+	err = nvgpu_early_init(g);
+	if (err != 0) {
+		nvgpu_err(g, "nvgpu_early_init failed[%d]", err);
+		goto done;
+	}
+
+done:
+	return err;
+}
+
+int nvgpu_finalize_poweron(struct gk20a *g)
+{
+	int err = 0;
+	/*
+	 * This cannot be static because we use the func ptrs as initializers
+	 * and static variables require constant literals for initializers.
+	 */
+	const struct nvgpu_init_table_t nvgpu_init_table[] = {
+		/*
+		 * Do this early so any early VMs that get made are capable of
+		 * mapping buffers.
+		 */
+		NVGPU_INIT_TABLE_ENTRY(g->ops.mm.pd_cache_init, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_falcons_sw_init, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.pmu.pmu_early_init, NO_FLAG),
+
+#ifdef CONFIG_NVGPU_DGPU
+		NVGPU_INIT_TABLE_ENTRY(g->ops.sec2.init_sec2_setup_sw,
+				       NVGPU_SUPPORT_SEC2_RTOS),
+#endif
+		NVGPU_INIT_TABLE_ENTRY(g->ops.acr.acr_init,
+				       NVGPU_SEC_PRIVSECURITY),
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_sw_quiesce_init_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.nvlink.init,
+				       NVGPU_SUPPORT_NVLINK),

 #ifdef CONFIG_NVGPU_DEBUGGER
 		NVGPU_INIT_TABLE_ENTRY(g->ops.ptimer.config_gr_tick_freq,
 				       NO_FLAG),
 #endif
+
 #ifdef CONFIG_NVGPU_DGPU
 		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fb_mem_unlock, NO_FLAG),
 #endif
+
 		NVGPU_INIT_TABLE_ENTRY(g->ops.fifo.reset_enable_hw, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fb_support, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.ltc.init_ltc_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.mm.init_mm_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.fifo.fifo_init_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.therm.elcg_init_idle_filters,
@@ -758,17 +817,6 @@ int nvgpu_finalize_poweron(struct gk20a *g)

 	nvgpu_log_fn(g, " ");

-#ifdef CONFIG_NVGPU_DGPU
-	/*
-	 * Before probing the GPU make sure the GPU's state is cleared. This is
-	 * relevant for rebind operations.
-	 */
-	if ((g->ops.xve.reset_gpu != NULL) && !g->gpu_reset_done) {
-		g->ops.xve.reset_gpu(g);
-		g->gpu_reset_done = true;
-	}
-#endif
-
 	for (i = 0; i < ARRAY_SIZE(nvgpu_init_table); i++) {
 		if (!needs_init(g, nvgpu_init_table[i].func,
 				nvgpu_init_table[i].enable_flag)) {
--- a/drivers/gpu/nvgpu/hal/xve/xve_gp106.c
+++ b/drivers/gpu/nvgpu/hal/xve/xve_gp106.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -72,8 +72,14 @@ void xve_reset_gpu_gp106(struct gk20a *g)

 	/*
 	 * Don't access GPU until _after_ it's back out of reset!
+	 *
+	 * TODO: Need to identify the maximum xve_reset SW timeout value.
+	 * 100ms is not sufficient in all worst-case scenarios.
+	 * Need to replace this with Function Level Reset (FLR) to reset as
+	 * much of the chip as possible.
+	 * If FLR is not supported, use the XVE sw reset logic.
 	 */
-	nvgpu_msleep(100);
+	nvgpu_msleep(200);
 	g->ops.xve.xve_writel(g, xve_reset_r(), 0);
 }

--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h
@@ -89,6 +89,24 @@ struct nvgpu_ref;
 * the GPU dev node in the early stage of GPU power on sequence.
 * Each sub-unit is responsible for HW initialization.
 *
+ * nvgpu poweron sequence split into two stages:
+ * - nvgpu_early_init() - Initializes the sub units
+ *   which are required to be initialized before the grmgr init.
+ *   For creating the dev node, grmgr init and its dependency units
+ *   need to move to the early stage of GPU power on.
+ *   After a successful nvgpu_early_init() sequence,
+ *   NvGpu can identify the number of MIG instances required
+ *   for each physical GPU.
+ * - nvgpu_finalize_poweron() - Initializes the sub units which
+ *   can be initialized at the later stage of GPU power on sequence.
+ *
+ * grmgr init depends on the following HAL sub units,
+ * device - To get the device caps.
+ * priv_ring - To get the gpc count and other MIG config programming.
+ * fb - MIG config programming.
+ * ltc - MIG config programming.
+ * bios, bus, ecc and clk - dependent module of priv_ring/fb/ltc.
+ *
 * @return 0 in case of success, < 0 in case of failure.
 */
 int nvgpu_early_poweron(struct gk20a *g);