From c041ad5b4b5be50cb3c1b37c95aedde1a8f84d5e Mon Sep 17 00:00:00 2001
From: Lakshmanan M <lm@nvidia.com>
Date: Wed, 28 Apr 2021 21:53:41 +0530
Subject: [PATCH] gpu: nvgpu: split nvgpu power on sequence into 2 stages

1) nvgpu poweron sequence split into two stages:
    - nvgpu_early_init() - Initializes the sub units
      which are required to be initialized before the grmgr init.
      To create the dev node, grmgr init and the units it depends on
      need to move to the early stage of GPU power on.
      After a successful nvgpu_early_init() sequence,
      NvGpu can identify the number of MIG instances required
      for each physical GPU.
    - nvgpu_finalize_poweron() - Initializes the sub units which
      can be initialized at the later stage of GPU power on sequence.

    - grmgr init depends on the following HAL sub units,
      * device - To get the device caps.
      * priv_ring - To get the gpc count and other
        MIG config programming.
      * fb - MIG config programming.
      * ltc - MIG config programming.
      * bios, bus, ecc and clk - dependent module of
        priv_ring/fb/ltc.

2) g->ops.xve.reset_gpu() should be called before GPU sub unit
   initialization. Hence, added g->ops.xve.reset_gpu() HAL in the
   early stage of dGPU power on sequence.

3) Increased xve_reset timeout from 100ms to 200ms.

4) Added nvgpu_assert() for gpc_count, gpc_mask and
   max_veid_count_per_tsg to identify the GPU boot
   device probe failure during nvgpu_init_gr_manager().

JIRA NVGPU-6633

Change-Id: I5d43bf711198e6b3f8eebcec3027ba17c15fc692
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2521894
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/common/grmgr/grmgr.c       |   3 +
 drivers/gpu/nvgpu/common/init/nvgpu_init.c   | 152 ++++++++++++-------
 drivers/gpu/nvgpu/hal/xve/xve_gp106.c        |  10 +-
 drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h |  18 +++
 4 files changed, 129 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/grmgr/grmgr.c b/drivers/gpu/nvgpu/common/grmgr/grmgr.c
index 43f2b9e88..a69e06020 100644
--- a/drivers/gpu/nvgpu/common/grmgr/grmgr.c
+++ b/drivers/gpu/nvgpu/common/grmgr/grmgr.c
@@ -40,6 +40,7 @@ int nvgpu_init_gr_manager(struct gk20a *g)
 
 	/* Number of gpu instance is 1 for legacy mode */
 	g->mig.gpc_count = g->ops.priv_ring.get_gpc_count(g);
+	nvgpu_assert(g->mig.gpc_count > 0U);
 	g->mig.num_gpu_instances = 1U;
 	g->mig.current_gpu_instance_config_id = 0U;
 	g->mig.is_nongr_engine_sharable = false;
@@ -57,6 +58,7 @@ int nvgpu_init_gr_manager(struct gk20a *g)
 	g->mig.gpcgrp_gpc_count[0] = gr_syspipe->num_gpc;
 	if (g->ops.gr.config.get_gpc_mask != NULL) {
 		gr_syspipe->gpc_mask = g->ops.gr.config.get_gpc_mask(g);
+		nvgpu_assert(gr_syspipe->gpc_mask != 0U);
 	} else {
 		gr_syspipe->gpc_mask = nvgpu_safe_sub_u32(
 			BIT32(gr_syspipe->num_gpc),
@@ -92,6 +94,7 @@ int nvgpu_init_gr_manager(struct gk20a *g)
 	if (g->ops.gr.init.get_max_subctx_count != NULL) {
 		gr_syspipe->max_veid_count_per_tsg =
 			g->ops.gr.init.get_max_subctx_count();
+		nvgpu_assert(gr_syspipe->max_veid_count_per_tsg > 0U);
 	} else {
 		/*
 		 * For vgpu, NvGpu has to rely on chip constant
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index 5a93345b6..c7898ad00 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -586,6 +586,75 @@ static bool needs_init(struct gk20a *g, nvgpu_init_func_t func, u32 enable_flag)
 		nvgpu_is_enabled(g, enable_flag)) && (func != NULL);
 }
 
+static int nvgpu_early_init(struct gk20a *g)
+{
+	int err = 0;
+	size_t i;
+
+	/*
+	 * This cannot be static because we use the func ptrs as initializers
+	 * and static variables require constant literals for initializers.
+	 */
+	const struct nvgpu_init_table_t nvgpu_early_init_table[] = {
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_slcg_acb_load_gating_prod,
+					NO_FLAG),
+		/*
+		 * ECC support initialization is split into generic init
+		 * followed by per unit initialization and ends with sysfs
+		 * support init. This is done to setup ECC data structures
+		 * prior to enabling interrupts for corresponding units.
+		 */
+		NVGPU_INIT_TABLE_ENTRY(g->ops.ecc.ecc_init_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_device_init, NO_FLAG),
+#ifdef CONFIG_NVGPU_DGPU
+		NVGPU_INIT_TABLE_ENTRY(g->ops.bios.bios_sw_init, NO_FLAG),
+#endif
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_interrupt_setup, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.bus.init_hw, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.priv_ring.enable_priv_ring,
+				   NO_FLAG),
+		/* TBD: move this after graphics init in which blcg/slcg is
+		 * enabled. This function removes SlowdownOnBoot which applies
+		 * 32x divider on gpcpll bypass path. The purpose of slowdown is
+		 * to save power during boot but it also significantly slows
+		 * down gk20a init on simulation and emulation. We should remove
+		 * SOB after graphics power saving features (blcg/slcg) are
+		 * enabled. For now, do it here.
+		 */
+		NVGPU_INIT_TABLE_ENTRY(g->ops.clk.init_clk_support, NO_FLAG),
+#ifdef CONFIG_NVGPU_DGPU
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fbpa_ecc, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.fb.init_fbpa, NO_FLAG),
+#endif
+		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fb_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.ltc.init_ltc_support, NO_FLAG),
+		NVGPU_INIT_TABLE_ENTRY(g->ops.grmgr.init_gr_manager, NO_FLAG),
+	};
+
+	for (i = 0; i < ARRAY_SIZE(nvgpu_early_init_table); i++) {
+		if (!needs_init(g, nvgpu_early_init_table[i].func,
+				nvgpu_early_init_table[i].enable_flag)) {
+			nvgpu_log_info(g,
+				"Skipping initializing %s (enable_flag=%u func=%p)",
+				   nvgpu_early_init_table[i].name,
+				   nvgpu_early_init_table[i].enable_flag,
+				   nvgpu_early_init_table[i].func);
+		} else {
+			nvgpu_log_info(g, "Initializing %s",
+					   nvgpu_early_init_table[i].name);
+			err = nvgpu_early_init_table[i].func(g);
+			if (err != 0) {
+				nvgpu_err(g, "Failed initialization for: %s",
+					  nvgpu_early_init_table[i].name);
+				goto done;
+			}
+		}
+	}
+
+done:
+	return err;
+}
+
 int nvgpu_early_poweron(struct gk20a *g)
 {
 	int err = 0;
@@ -596,21 +665,40 @@ int nvgpu_early_poweron(struct gk20a *g)
 		goto done;
 	}
 
+#ifdef CONFIG_NVGPU_DGPU
 	/*
-	 * Initialize the GPU's device list. Needed before NVLINK
-	 * init since the NVLINK IOCTRL block is enumerated in the
-	 * device list.
+	 * Before probing the GPU make sure the GPU's state is cleared. This is
+	 * relevant for rebind operations.
 	 */
-	err = nvgpu_device_init(g);
-	if (err != 0) {
-		nvgpu_err(g, "nvgpu_device_init failed[%d]", err);
-		goto done;
+	if ((g->ops.xve.reset_gpu != NULL) && !g->gpu_reset_done) {
+		g->ops.xve.reset_gpu(g);
+		g->gpu_reset_done = true;
 	}
+#endif
 
-	err = g->ops.grmgr.init_gr_manager(g);
+	/*
+	 * The nvgpu poweron sequence is split into two stages:
+	 * - nvgpu_early_init() - Initializes the sub units
+	 *   which are required to be initialized before the grmgr init.
+	 *   To create the dev node, grmgr init and the units it depends on
+	 *   need to move to the early stage of GPU power on.
+	 *   After a successful nvgpu_early_init() sequence,
+	 *   NvGpu can identify the number of MIG instances required
+	 *   for each physical GPU.
+	 * - nvgpu_finalize_poweron() - Initializes the sub units which
+	 *   can be initialized at the later stage of GPU power on sequence.
+	 *
+	 * grmgr init depends on the following HAL sub units,
+	 * device - To get the device caps.
+	 * priv_ring - To get the gpc count and other MIG config programming.
+	 * fb - MIG config programming.
+	 * ltc - MIG config programming.
+	 * bios, bus, ecc and clk - dependent module of priv_ring/fb/ltc.
+	 *
+	 */
+	err = nvgpu_early_init(g);
 	if (err != 0) {
-		nvgpu_device_cleanup(g);
-		nvgpu_err(g, "g->ops.grmgr.init_gr_manager failed[%d]", err);
+		nvgpu_err(g, "nvgpu_early_init failed[%d]", err);
 		goto done;
 	}
 
@@ -626,15 +714,6 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 	 * and static variables require constant literals for initializers.
 	 */
 	const struct nvgpu_init_table_t nvgpu_init_table[] = {
-		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_slcg_acb_load_gating_prod,
-					NO_FLAG),
-		/*
-		 * ECC support initialization is split into generic init
-		 * followed by per unit initialization and ends with sysfs
-		 * support init. This is done to setup ECC data structures
-		 * prior to enabling interrupts for corresponding units.
-		 */
-		NVGPU_INIT_TABLE_ENTRY(g->ops.ecc.ecc_init_support, NO_FLAG),
 		/*
 		 * Do this early so any early VMs that get made are capable of
 		 * mapping buffers.
@@ -650,39 +729,19 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 		NVGPU_INIT_TABLE_ENTRY(g->ops.acr.acr_init,
 				       NVGPU_SEC_PRIVSECURITY),
 		NVGPU_INIT_TABLE_ENTRY(&nvgpu_sw_quiesce_init_support, NO_FLAG),
-#ifdef CONFIG_NVGPU_DGPU
-		NVGPU_INIT_TABLE_ENTRY(g->ops.bios.bios_sw_init, NO_FLAG),
-#endif
-		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_interrupt_setup, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.bus.init_hw, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.priv_ring.enable_priv_ring,
-				       NO_FLAG),
-		/* TBD: move this after graphics init in which blcg/slcg is
-		 * enabled. This function removes SlowdownOnBoot which applies
-		 * 32x divider on gpcpll bypass path. The purpose of slowdown is
-		 * to save power during boot but it also significantly slows
-		 * down gk20a init on simulation and emulation. We should remove
-		 * SOB after graphics power saving features (blcg/slcg) are
-		 * enabled. For now, do it here.
-		 */
-		NVGPU_INIT_TABLE_ENTRY(g->ops.clk.init_clk_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.nvlink.init,
 				       NVGPU_SUPPORT_NVLINK),
-#ifdef CONFIG_NVGPU_DGPU
-		NVGPU_INIT_TABLE_ENTRY(nvgpu_init_fbpa_ecc, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.fb.init_fbpa, NO_FLAG),
-#endif
 
 #ifdef CONFIG_NVGPU_DEBUGGER
 		NVGPU_INIT_TABLE_ENTRY(g->ops.ptimer.config_gr_tick_freq,
 				       NO_FLAG),
 #endif
+
 #ifdef CONFIG_NVGPU_DGPU
 		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fb_mem_unlock, NO_FLAG),
 #endif
+
 		NVGPU_INIT_TABLE_ENTRY(g->ops.fifo.reset_enable_hw, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(&nvgpu_init_fb_support, NO_FLAG),
-		NVGPU_INIT_TABLE_ENTRY(g->ops.ltc.init_ltc_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.mm.init_mm_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.fifo.fifo_init_support, NO_FLAG),
 		NVGPU_INIT_TABLE_ENTRY(g->ops.therm.elcg_init_idle_filters,
@@ -758,17 +817,6 @@ int nvgpu_finalize_poweron(struct gk20a *g)
 
 	nvgpu_log_fn(g, " ");
 
-#ifdef CONFIG_NVGPU_DGPU
-	/*
-	 * Before probing the GPU make sure the GPU's state is cleared. This is
-	 * relevant for rebind operations.
-	 */
-	if ((g->ops.xve.reset_gpu != NULL) && !g->gpu_reset_done) {
-		g->ops.xve.reset_gpu(g);
-		g->gpu_reset_done = true;
-	}
-#endif
-
 	for (i = 0; i < ARRAY_SIZE(nvgpu_init_table); i++) {
 		if (!needs_init(g, nvgpu_init_table[i].func,
 				nvgpu_init_table[i].enable_flag)) {
diff --git a/drivers/gpu/nvgpu/hal/xve/xve_gp106.c b/drivers/gpu/nvgpu/hal/xve/xve_gp106.c
index d269e1da4..2147eb6a2 100644
--- a/drivers/gpu/nvgpu/hal/xve/xve_gp106.c
+++ b/drivers/gpu/nvgpu/hal/xve/xve_gp106.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -72,8 +72,14 @@ void xve_reset_gpu_gp106(struct gk20a *g)
 
 	/*
 	 * Don't access GPU until _after_ it's back out of reset!
+	 *
+	 * TODO: Need to identify the maximum xve_reset SW timeout value.
+	 * 100ms is not sufficient in all worst-case scenarios.
+	 * Need to replace this with Function Level Reset (FLR) to reset as
+	 * much of the chip as possible.
+	 * If FLR is not supported, use the XVE sw reset logic.
 	 */
-	nvgpu_msleep(100);
+	nvgpu_msleep(200);
 	g->ops.xve.xve_writel(g, xve_reset_r(), 0);
 }
 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h
index d5daaf26c..844fabda9 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_init.h
@@ -89,6 +89,24 @@ struct nvgpu_ref;
  * the GPU dev node in the early stage of GPU power on sequence.
  * Each sub-unit is responsible for HW initialization.
  *
+ * The nvgpu poweron sequence is split into two stages:
+ * - nvgpu_early_init() - Initializes the sub units
+ *   which are required to be initialized before the grmgr init.
+ *   To create the dev node, grmgr init and the units it depends on
+ *   need to move to the early stage of GPU power on.
+ *   After a successful nvgpu_early_init() sequence,
+ *   NvGpu can identify the number of MIG instances required
+ *   for each physical GPU.
+ * - nvgpu_finalize_poweron() - Initializes the sub units which
+ *   can be initialized at the later stage of GPU power on sequence.
+ *
+ * grmgr init depends on the following HAL sub units,
+ * device - To get the device caps.
+ * priv_ring - To get the gpc count and other MIG config programming.
+ * fb - MIG config programming.
+ * ltc - MIG config programming.
+ * bios, bus, ecc and clk - dependent module of priv_ring/fb/ltc.
+ *
  * @return 0 in case of success, < 0 in case of failure.
  */
 int nvgpu_early_poweron(struct gk20a *g);