gpu: nvgpu: split GR ECC initialization

Split GR ECC initialization into GPC/TPC ECC init and FECS ECC init.
FECS ECC errors during acr_construct_execute need to be reported and
handled, hence the FECS ECC counters are required to be initialized
before acr_construct_execute.

GPC/TPC ECC counters are dependent on the GR config that will be
initialized only after acr_construct_execute.

nvgpu_gr_intr_init_support is moved to nvgpu_gr_prepare_sw.

The FECS ECC interrupt is enabled by default, hence the interrupt is not
enabled through gr_fecs_host_int_enable_r in nvgpu_gr_prepare_sw.

JIRA NVGPU-4439

Change-Id: Ifc9912f0578015a6ba1e9d38765c42633632b15f
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2261987
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Vinod Gopalakrishnakurup <vinodg@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Sagar Kamble
2019-12-13 20:01:39 +05:30
committed by Alex Waterman
parent 2fe78b4a31
commit a73ca0b70e
11 changed files with 90 additions and 19 deletions

View File

@@ -503,15 +503,10 @@ static int gr_init_setup_sw(struct gk20a *g)
goto clean_up;
}
gr->intr = nvgpu_gr_intr_init_support(g);
if (gr->intr == NULL) {
err = -ENOMEM;
goto clean_up;
}
if (g->ops.gr.ecc.init != NULL && !g->ecc.initialized) {
err = g->ops.gr.ecc.init(g);
if (g->ops.gr.ecc.gpc_tpc_ecc_init != NULL && !g->ecc.initialized) {
err = g->ops.gr.ecc.gpc_tpc_ecc_init(g);
if (err != 0) {
nvgpu_err(g, "failed to init gr gpc/tpc ecc");
goto clean_up;
}
}
@@ -583,8 +578,36 @@ int nvgpu_gr_prepare_sw(struct gk20a *g)
if (gr->falcon == NULL) {
nvgpu_err(g, "failed to init gr falcon");
err = -ENOMEM;
goto exit;
}
}
if (gr->intr == NULL) {
gr->intr = nvgpu_gr_intr_init_support(g);
if (gr->intr == NULL) {
nvgpu_err(g, "failed to init gr intr support");
err = -ENOMEM;
goto exit;
}
}
/*
* Initialize FECS ECC counters here before acr_construct_execute as the
* FECS ECC errors during FECS load need to be handled and reported
* using the ECC counters.
*/
if (g->ops.gr.ecc.fecs_ecc_init != NULL && !g->ecc.initialized) {
err = g->ops.gr.ecc.fecs_ecc_init(g);
if (err != 0) {
nvgpu_err(g, "failed to init gr fecs ecc");
nvgpu_gr_intr_remove_support(g, gr->intr);
gr->intr = NULL;
goto exit;
}
}
exit:
return err;
}

View File

@@ -32,7 +32,9 @@ struct nvgpu_hw_err_inject_info_desc;
void gv11b_ecc_detect_enabled_units(struct gk20a *g);
int gv11b_gr_ecc_init(struct gk20a *g);
int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g);
int gv11b_gr_fecs_ecc_init(struct gk20a *g);
#ifdef CONFIG_NVGPU_INJECT_HWERR
void gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,

View File

@@ -251,7 +251,7 @@ init_gpc_done:
return err;
}
int gv11b_gr_ecc_init(struct gk20a *g)
int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g)
{
int err;
@@ -265,6 +265,18 @@ int gv11b_gr_ecc_init(struct gk20a *g)
goto done;
}
done:
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
nvgpu_ecc_free(g);
}
return err;
}
int gv11b_gr_fecs_ecc_init(struct gk20a *g)
{
int err;
err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_uncorrected_err_count);
if (err != 0) {
goto done;

View File

@@ -287,7 +287,7 @@ static const struct gpu_ops gp10b_ops = {
#endif /* CONFIG_NVGPU_DEBUGGER */
.ecc = {
.detect = gp10b_ecc_detect_enabled_units,
.init = gp10b_gr_ecc_init,
.gpc_tpc_ecc_init = gp10b_gr_ecc_init,
},
.ctxsw_prog = {
.hw_get_fecs_header_size =

View File

@@ -343,7 +343,8 @@ static const struct gpu_ops gv11b_ops = {
#endif /* CONFIG_NVGPU_DEBUGGER */
.ecc = {
.detect = gv11b_ecc_detect_enabled_units,
.init = gv11b_gr_ecc_init,
.gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init,
.fecs_ecc_init = gv11b_gr_fecs_ecc_init,
#ifdef CONFIG_NVGPU_INJECT_HWERR
.get_mmu_err_desc =
gv11b_gr_intr_get_mmu_err_desc,

View File

@@ -381,7 +381,8 @@ static const struct gpu_ops tu104_ops = {
#endif /* CONFIG_NVGPU_DEBUGGER */
.ecc = {
.detect = NULL,
.init = gv11b_gr_ecc_init,
.gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init,
.fecs_ecc_init = gv11b_gr_fecs_ecc_init,
},
.ctxsw_prog = {
.hw_get_fecs_header_size =

View File

@@ -85,11 +85,23 @@ struct gops_gr_ecc {
* @param g [in] Pointer to GPU driver struct.
*
* This function allocates memory to track the ecc error counts
* for GR unit and subunits of GR (like falcon/sm/gpccs/etc).
* for GR unit and subunits of GR (like GPCs, TPCs etc).
*
* @return 0 in case of success, < 0 in case of failure.
*/
int (*init)(struct gk20a *g);
int (*gpc_tpc_ecc_init)(struct gk20a *g);
/**
* @brief Initialize FECS ECC support.
*
* @param g [in] Pointer to GPU driver struct.
*
* This function allocates memory to track the ecc error counts
* for FECS in GR.
*
* @return 0 in case of success, < 0 in case of failure.
*/
int (*fecs_ecc_init)(struct gk20a *g);
/**
* @brief Detect ECC enabled units in GR engine.

View File

@@ -160,7 +160,9 @@ void nvgpu_gr_init(struct gk20a *g);
* that is required to enable GR engine h/w in #nvgpu_gr_enable_hw().
*
* This initialization includes reading netlist ucode and allocating
* memory for internal data structures required to enable h/w.
* memory for internal data structures required to enable h/w. This
* function allocates memory for FECS ECC error counters and GR
* interrupt structure.
*
* Note that the rest of the s/w initialization is completed in
* #nvgpu_gr_init_support() function.