diff --git a/drivers/gpu/nvgpu/common/gr/gr.c b/drivers/gpu/nvgpu/common/gr/gr.c index 4653483a4..0cfd1e41f 100644 --- a/drivers/gpu/nvgpu/common/gr/gr.c +++ b/drivers/gpu/nvgpu/common/gr/gr.c @@ -503,15 +503,10 @@ static int gr_init_setup_sw(struct gk20a *g) goto clean_up; } - gr->intr = nvgpu_gr_intr_init_support(g); - if (gr->intr == NULL) { - err = -ENOMEM; - goto clean_up; - } - - if (g->ops.gr.ecc.init != NULL && !g->ecc.initialized) { - err = g->ops.gr.ecc.init(g); + if (g->ops.gr.ecc.gpc_tpc_ecc_init != NULL && !g->ecc.initialized) { + err = g->ops.gr.ecc.gpc_tpc_ecc_init(g); if (err != 0) { + nvgpu_err(g, "failed to init gr gpc/tpc ecc"); goto clean_up; } } @@ -583,8 +578,36 @@ int nvgpu_gr_prepare_sw(struct gk20a *g) if (gr->falcon == NULL) { nvgpu_err(g, "failed to init gr falcon"); err = -ENOMEM; + goto exit; } } + + if (gr->intr == NULL) { + gr->intr = nvgpu_gr_intr_init_support(g); + if (gr->intr == NULL) { + nvgpu_err(g, "failed to init gr intr support"); + err = -ENOMEM; + goto exit; + } + } + + /* + * Initialize FECS ECC counters here before acr_construct_execute as the + * FECS ECC errors during FECS load need to be handled and reported + * using the ECC counters. 
+ */ + if (g->ops.gr.ecc.fecs_ecc_init != NULL && !g->ecc.initialized) { + err = g->ops.gr.ecc.fecs_ecc_init(g); + if (err != 0) { + nvgpu_err(g, "failed to init gr fecs ecc"); + + nvgpu_gr_intr_remove_support(g, gr->intr); + gr->intr = NULL; + goto exit; + } + } + +exit: return err; } diff --git a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h index a61f72101..1774632c5 100644 --- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b.h @@ -32,7 +32,9 @@ struct nvgpu_hw_err_inject_info_desc; void gv11b_ecc_detect_enabled_units(struct gk20a *g); -int gv11b_gr_ecc_init(struct gk20a *g); +int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g); +int gv11b_gr_fecs_ecc_init(struct gk20a *g); + #ifdef CONFIG_NVGPU_INJECT_HWERR void gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b_fusa.c index 0b2cd9f20..92e1c30d6 100644 --- a/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b_fusa.c +++ b/drivers/gpu/nvgpu/hal/gr/ecc/ecc_gv11b_fusa.c @@ -251,7 +251,7 @@ init_gpc_done: return err; } -int gv11b_gr_ecc_init(struct gk20a *g) +int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g) { int err; @@ -265,6 +265,18 @@ int gv11b_gr_ecc_init(struct gk20a *g) goto done; } +done: + if (err != 0) { + nvgpu_err(g, "ecc counter allocate failed, err=%d", err); + nvgpu_ecc_free(g); + } + + return err; +} +int gv11b_gr_fecs_ecc_init(struct gk20a *g) +{ + int err; + err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_uncorrected_err_count); if (err != 0) { goto done; diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index bdb0e4a5c..b307e8ad0 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -287,7 +287,7 @@ static const struct gpu_ops gp10b_ops = { #endif /* CONFIG_NVGPU_DEBUGGER */ .ecc = { .detect = gp10b_ecc_detect_enabled_units, - .init = gp10b_gr_ecc_init, + 
 .gpc_tpc_ecc_init = gp10b_gr_ecc_init, }, .ctxsw_prog = { .hw_get_fecs_header_size = diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 8a14a9710..55d9cf5bd 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -343,7 +343,8 @@ static const struct gpu_ops gv11b_ops = { #endif /* CONFIG_NVGPU_DEBUGGER */ .ecc = { .detect = gv11b_ecc_detect_enabled_units, - .init = gv11b_gr_ecc_init, + .gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init, + .fecs_ecc_init = gv11b_gr_fecs_ecc_init, #ifdef CONFIG_NVGPU_INJECT_HWERR .get_mmu_err_desc = gv11b_gr_intr_get_mmu_err_desc, diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index db5e0d455..eea101bc1 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -381,7 +381,8 @@ static const struct gpu_ops tu104_ops = { #endif /* CONFIG_NVGPU_DEBUGGER */ .ecc = { .detect = NULL, - .init = gv11b_gr_ecc_init, + .gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init, + .fecs_ecc_init = gv11b_gr_fecs_ecc_init, }, .ctxsw_prog = { .hw_get_fecs_header_size = diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h index c4e13e213..6d807e625 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gops_gr.h @@ -85,11 +85,23 @@ struct gops_gr_ecc { * @param g [in] Pointer to GPU driver struct. * * This function allocates memory to track the ecc error counts - * for GR unit and subunits of GR (like falcon/sm/gpccs/etc). + * for GR unit and subunits of GR (like GPCs, TPCs etc). * * @return 0 in case of success, < 0 in case of failure. */ - int (*init)(struct gk20a *g); + int (*gpc_tpc_ecc_init)(struct gk20a *g); + + /** + * @brief Initialize GR FECS ECC support. + * + * @param g [in] Pointer to GPU driver struct. + * + * This function allocates memory to track the ecc error counts + * for FECS in GR. 
+ * + * @return 0 in case of success, < 0 in case of failure. + */ + int (*fecs_ecc_init)(struct gk20a *g); /** * @brief Detect ECC enabled units in GR engine. diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h b/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h index c4f828a4c..dec0f4f85 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gr/gr.h @@ -160,7 +160,9 @@ void nvgpu_gr_init(struct gk20a *g); * that is required to enable GR engine h/w in #nvgpu_gr_enable_hw(). * * This initialization includes reading netlist ucode and allocating - * memory for internal data structures required to enable h/w. + * memory for internal data structures required to enable h/w. This + * function allocates memory for FECS ECC error counters and GR + * interrupt structure. * * Note that all rest of the s/w initialization is completed in * #nvgpu_gr_init_support() function. diff --git a/userspace/units/gr/init/nvgpu-gr-init.c b/userspace/units/gr/init/nvgpu-gr-init.c index 86905735c..e84d435c7 100644 --- a/userspace/units/gr/init/nvgpu-gr-init.c +++ b/userspace/units/gr/init/nvgpu-gr-init.c @@ -53,14 +53,26 @@ static int gr_init_ecc_fail_alloc(struct gk20a *g) { - int err, i, loop = 28; + int err, i, loop = 26; struct nvgpu_posix_fault_inj *kmem_fi = nvgpu_kmem_get_fault_injection(); struct nvgpu_gr_config *save_gr_config = g->gr->config; for (i = 0; i < loop; i++) { nvgpu_posix_enable_fault_injection(kmem_fi, true, i); - err = g->ops.gr.ecc.init(g); + err = g->ops.gr.ecc.gpc_tpc_ecc_init(g); + if (err == 0) { + return UNIT_FAIL; + } + nvgpu_posix_enable_fault_injection(kmem_fi, false, 0); + g->ops.ecc.ecc_init_support(g); + } + + loop = 2; + + for (i = 0; i < loop; i++) { + nvgpu_posix_enable_fault_injection(kmem_fi, true, i); + err = g->ops.gr.ecc.fecs_ecc_init(g); if (err == 0) { return UNIT_FAIL; } diff --git a/userspace/units/gr/nvgpu-gr.c b/userspace/units/gr/nvgpu-gr.c index 7eb4e839f..8fedd8278 100644 --- a/userspace/units/gr/nvgpu-gr.c +++ 
b/userspace/units/gr/nvgpu-gr.c @@ -98,6 +98,11 @@ int test_gr_init_prepare(struct unit_module *m, struct gk20a *g, void *args) { int err; + err = g->ops.ecc.ecc_init_support(g); + if (err != 0) { + unit_return_fail(m, "ecc init failed\n"); + } + err = nvgpu_gr_prepare_sw(g); if (err != 0) { unit_return_fail(m, "nvgpu_gr_prepare_sw returned fail\n"); diff --git a/userspace/units/gr/nvgpu-gr.h b/userspace/units/gr/nvgpu-gr.h index e0bb5c409..71decf514 100644 --- a/userspace/units/gr/nvgpu-gr.h +++ b/userspace/units/gr/nvgpu-gr.h @@ -170,7 +170,8 @@ int test_gr_remove_support(struct unit_module *m, struct gk20a *g, void *args); * Input: #test_gr_init_setup, #test_gr_init_prepare and #test_gr_init_support * must have been executed successfully. * - * Targets: gv11b_gr_ecc_init, gv11b_ecc_detect_enabled_units. + * Targets: gv11b_gr_gpc_tpc_ecc_init, gv11b_gr_fecs_ecc_init and + * gv11b_ecc_detect_enabled_units. * * Steps: * - Array with various combinations setting register bits for