gpu: nvgpu: split GR ECC initialization

Split GR ECC initialization into GPC/TPC ECC init and FECS ECC init. FECS ECC errors during acr_construct_execute need to be reported and handled, hence the FECS ECC counters must be initialized before acr_construct_execute. The GPC/TPC ECC counters depend on the GR config, which is initialized only after acr_construct_execute. nvgpu_gr_intr_init_support is moved to nvgpu_gr_prepare_sw. The FECS ECC interrupt is enabled by default, hence it is not enabled through gr_fecs_host_int_enable_r in nvgpu_gr_prepare_sw. JIRA NVGPU-4439 Change-Id: Ifc9912f0578015a6ba1e9d38765c42633632b15f Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2261987 Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com> Reviewed-by: Vinod Gopalakrishnakurup <vinodg@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 10:34:43 +03:00 · 2019-12-13 20:01:39 +05:30
parent 2fe78b4a31
commit a73ca0b70e
11 changed files with 90 additions and 19 deletions
--- a/drivers/gpu/nvgpu/common/gr/gr.c
+++ b/drivers/gpu/nvgpu/common/gr/gr.c
@@ -503,15 +503,10 @@ static int gr_init_setup_sw(struct gk20a *g)
 		goto clean_up;
 	}

-	gr->intr = nvgpu_gr_intr_init_support(g);
-	if (gr->intr == NULL) {
-		err = -ENOMEM;
-		goto clean_up;
-	}
-
-	if (g->ops.gr.ecc.init != NULL && !g->ecc.initialized) {
-		err = g->ops.gr.ecc.init(g);
+	if (g->ops.gr.ecc.gpc_tpc_ecc_init != NULL && !g->ecc.initialized) {
+		err = g->ops.gr.ecc.gpc_tpc_ecc_init(g);
 		if (err != 0) {
+			nvgpu_err(g, "failed to init gr gpc/tpc ecc");
 			goto clean_up;
 		}
 	}
@@ -583,8 +578,36 @@ int nvgpu_gr_prepare_sw(struct gk20a *g)
 		if (gr->falcon == NULL) {
 			nvgpu_err(g, "failed to init gr falcon");
 			err = -ENOMEM;
+			goto exit;
 		}
 	}
+
+	if (gr->intr == NULL) {
+		gr->intr = nvgpu_gr_intr_init_support(g);
+		if (gr->intr == NULL) {
+			nvgpu_err(g, "failed to init gr intr support");
+			err = -ENOMEM;
+			goto exit;
+		}
+	}
+
+	/*
+	 * Initialize FECS ECC counters here before acr_construct_execute as the
+	 * FECS ECC errors during FECS load need to be handled and reported
+	 * using the ECC counters.
+	 */
+	if (g->ops.gr.ecc.fecs_ecc_init != NULL && !g->ecc.initialized) {
+		err = g->ops.gr.ecc.fecs_ecc_init(g);
+		if (err != 0) {
+			nvgpu_err(g, "failed to init gr fecs ecc");
+
+			nvgpu_gr_intr_remove_support(g, gr->intr);
+			gr->intr = NULL;
+			goto exit;
+		}
+	}
+
+exit:
 	return err;
 }

--- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h
+++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h
@@ -32,7 +32,9 @@ struct nvgpu_hw_err_inject_info_desc;

 void gv11b_ecc_detect_enabled_units(struct gk20a *g);

-int gv11b_gr_ecc_init(struct gk20a *g);
+int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g);
+int gv11b_gr_fecs_ecc_init(struct gk20a *g);
+

 #ifdef CONFIG_NVGPU_INJECT_HWERR
 void gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,
--- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b_fusa.c
@@ -251,7 +251,7 @@ init_gpc_done:
 	return err;
 }

-int gv11b_gr_ecc_init(struct gk20a *g)
+int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g)
 {
 	int err;

@@ -265,6 +265,18 @@ int gv11b_gr_ecc_init(struct gk20a *g)
 		goto done;
 	}

+done:
+	if (err != 0) {
+		nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
+		nvgpu_ecc_free(g);
+	}
+
+	return err;
+}
+int gv11b_gr_fecs_ecc_init(struct gk20a *g)
+{
+	int err;
+
 	err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_uncorrected_err_count);
 	if (err != 0) {
 		goto done;
--- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
@@ -287,7 +287,7 @@ static const struct gpu_ops gp10b_ops = {
 #endif /* CONFIG_NVGPU_DEBUGGER */
 		.ecc = {
 			.detect = gp10b_ecc_detect_enabled_units,
-			.init = gp10b_gr_ecc_init,
+			.gpc_tpc_ecc_init = gp10b_gr_ecc_init,
 		},
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -343,7 +343,8 @@ static const struct gpu_ops gv11b_ops = {
 #endif /* CONFIG_NVGPU_DEBUGGER */
 		.ecc = {
 			.detect = gv11b_ecc_detect_enabled_units,
-			.init = gv11b_gr_ecc_init,
+			.gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init,
+			.fecs_ecc_init = gv11b_gr_fecs_ecc_init,
 #ifdef CONFIG_NVGPU_INJECT_HWERR
 			.get_mmu_err_desc =
 				gv11b_gr_intr_get_mmu_err_desc,
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -381,7 +381,8 @@ static const struct gpu_ops tu104_ops = {
 #endif /* CONFIG_NVGPU_DEBUGGER */
 		.ecc = {
 			.detect = NULL,
-			.init = gv11b_gr_ecc_init,
+			.gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init,
+			.fecs_ecc_init = gv11b_gr_fecs_ecc_init,
 		},
 		.ctxsw_prog = {
 			.hw_get_fecs_header_size =
--- a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h
@@ -85,11 +85,23 @@ struct gops_gr_ecc {
 	 * @param g [in]		Pointer to GPU driver struct.
 	 *
 	 * This function allocates memory to track the ecc error counts
-	 * for GR unit and subunits of GR (like falcon/sm/gpccs/etc).
+	 * for GR unit and subunits of GR (like GPCs, TPCs etc).
 	 *
 	 * @return 0 in case of success, < 0 in case of failure.
 	 */
-	int (*init)(struct gk20a *g);
+	int (*gpc_tpc_ecc_init)(struct gk20a *g);
+
+	/**
+	 * @brief Initialize FECS ECC support in GR unit.
+	 *
+	 * @param g [in]		Pointer to GPU driver struct.
+	 *
+	 * This function allocates memory to track the ecc error counts
+	 * for FECS in GR.
+	 *
+	 * @return 0 in case of success, < 0 in case of failure.
+	 */
+	int (*fecs_ecc_init)(struct gk20a *g);

 	/**
 	 * @brief Detect ECC enabled units in GR engine.
--- a/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h
@@ -160,7 +160,9 @@ void nvgpu_gr_init(struct gk20a *g);
 * that is required to enable GR engine h/w in #nvgpu_gr_enable_hw().
 *
 * This initialization includes reading netlist ucode and allocating
- * memory for internal data structures required to enable h/w.
+ * memory for internal data structures required to enable h/w. This
+ * function also allocates memory for the FECS ECC error counters and
+ * the GR interrupt structure.
 *
 * Note that all rest of the s/w initialization is completed in
 * #nvgpu_gr_init_support() function.