mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 10:34:43 +03:00
gpu: nvgpu: split GR ECC initialization
Split GR ECC initialization into GPC/TPC ECC init and FECS ECC init. FECS ECC errors during acr_construct_execute need to be reported and handled, hence the FECS ECC counters are required to be initialized before acr_construct_execute. The GPC/TPC ECC counters are dependent on the GR config, which will be initialized only after acr_construct_execute.

nvgpu_gr_intr_init_support is moved to nvgpu_gr_prepare_sw. The FECS ECC interrupt is enabled by default, hence the interrupt is not enabled through gr_fecs_host_int_enable_r in nvgpu_gr_prepare_sw.

JIRA NVGPU-4439

Change-Id: Ifc9912f0578015a6ba1e9d38765c42633632b15f
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2261987
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Vinod Gopalakrishnakurup <vinodg@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
Alex Waterman
parent
2fe78b4a31
commit
a73ca0b70e
@@ -503,15 +503,10 @@ static int gr_init_setup_sw(struct gk20a *g)
|
||||
goto clean_up;
|
||||
}
|
||||
|
||||
gr->intr = nvgpu_gr_intr_init_support(g);
|
||||
if (gr->intr == NULL) {
|
||||
err = -ENOMEM;
|
||||
goto clean_up;
|
||||
}
|
||||
|
||||
if (g->ops.gr.ecc.init != NULL && !g->ecc.initialized) {
|
||||
err = g->ops.gr.ecc.init(g);
|
||||
if (g->ops.gr.ecc.gpc_tpc_ecc_init != NULL && !g->ecc.initialized) {
|
||||
err = g->ops.gr.ecc.gpc_tpc_ecc_init(g);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "failed to init gr gpc/tpc ecc");
|
||||
goto clean_up;
|
||||
}
|
||||
}
|
||||
@@ -583,8 +578,36 @@ int nvgpu_gr_prepare_sw(struct gk20a *g)
|
||||
if (gr->falcon == NULL) {
|
||||
nvgpu_err(g, "failed to init gr falcon");
|
||||
err = -ENOMEM;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if (gr->intr == NULL) {
|
||||
gr->intr = nvgpu_gr_intr_init_support(g);
|
||||
if (gr->intr == NULL) {
|
||||
nvgpu_err(g, "failed to init gr intr support");
|
||||
err = -ENOMEM;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize FECS ECC counters here before acr_construct_execute as the
|
||||
* FECS ECC errors during FECS load need to be handled and reported
|
||||
* using the ECC counters.
|
||||
*/
|
||||
if (g->ops.gr.ecc.fecs_ecc_init != NULL && !g->ecc.initialized) {
|
||||
err = g->ops.gr.ecc.fecs_ecc_init(g);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "failed to init gr fecs ecc");
|
||||
|
||||
nvgpu_gr_intr_remove_support(g, gr->intr);
|
||||
gr->intr = NULL;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
exit:
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,9 @@ struct nvgpu_hw_err_inject_info_desc;
|
||||
|
||||
void gv11b_ecc_detect_enabled_units(struct gk20a *g);
|
||||
|
||||
int gv11b_gr_ecc_init(struct gk20a *g);
|
||||
int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g);
|
||||
int gv11b_gr_fecs_ecc_init(struct gk20a *g);
|
||||
|
||||
|
||||
#ifdef CONFIG_NVGPU_INJECT_HWERR
|
||||
void gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,
|
||||
|
||||
@@ -251,7 +251,7 @@ init_gpc_done:
|
||||
return err;
|
||||
}
|
||||
|
||||
int gv11b_gr_ecc_init(struct gk20a *g)
|
||||
int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g)
|
||||
{
|
||||
int err;
|
||||
|
||||
@@ -265,6 +265,18 @@ int gv11b_gr_ecc_init(struct gk20a *g)
|
||||
goto done;
|
||||
}
|
||||
|
||||
done:
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
||||
nvgpu_ecc_free(g);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
int gv11b_gr_fecs_ecc_init(struct gk20a *g)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_uncorrected_err_count);
|
||||
if (err != 0) {
|
||||
goto done;
|
||||
|
||||
@@ -287,7 +287,7 @@ static const struct gpu_ops gp10b_ops = {
|
||||
#endif /* CONFIG_NVGPU_DEBUGGER */
|
||||
.ecc = {
|
||||
.detect = gp10b_ecc_detect_enabled_units,
|
||||
.init = gp10b_gr_ecc_init,
|
||||
.gpc_tpc_ecc_init = gp10b_gr_ecc_init,
|
||||
},
|
||||
.ctxsw_prog = {
|
||||
.hw_get_fecs_header_size =
|
||||
|
||||
@@ -343,7 +343,8 @@ static const struct gpu_ops gv11b_ops = {
|
||||
#endif /* CONFIG_NVGPU_DEBUGGER */
|
||||
.ecc = {
|
||||
.detect = gv11b_ecc_detect_enabled_units,
|
||||
.init = gv11b_gr_ecc_init,
|
||||
.gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init,
|
||||
.fecs_ecc_init = gv11b_gr_fecs_ecc_init,
|
||||
#ifdef CONFIG_NVGPU_INJECT_HWERR
|
||||
.get_mmu_err_desc =
|
||||
gv11b_gr_intr_get_mmu_err_desc,
|
||||
|
||||
@@ -381,7 +381,8 @@ static const struct gpu_ops tu104_ops = {
|
||||
#endif /* CONFIG_NVGPU_DEBUGGER */
|
||||
.ecc = {
|
||||
.detect = NULL,
|
||||
.init = gv11b_gr_ecc_init,
|
||||
.gpc_tpc_ecc_init = gv11b_gr_gpc_tpc_ecc_init,
|
||||
.fecs_ecc_init = gv11b_gr_fecs_ecc_init,
|
||||
},
|
||||
.ctxsw_prog = {
|
||||
.hw_get_fecs_header_size =
|
||||
|
||||
@@ -85,11 +85,23 @@ struct gops_gr_ecc {
|
||||
* @param g [in] Pointer to GPU driver struct.
|
||||
*
|
||||
* This function allocates memory to track the ecc error counts
|
||||
* for GR unit and subunits of GR (like falcon/sm/gpccs/etc).
|
||||
* for GR unit and subunits of GR (like GPCs, TPCs etc).
|
||||
*
|
||||
* @return 0 in case of success, < 0 in case of failure.
|
||||
*/
|
||||
int (*init)(struct gk20a *g);
|
||||
int (*gpc_tpc_ecc_init)(struct gk20a *g);
|
||||
|
||||
/**
|
||||
* @brief Initialize GR unit FECS ECC support.
|
||||
*
|
||||
* @param g [in] Pointer to GPU driver struct.
|
||||
*
|
||||
* This function allocates memory to track the ecc error counts
|
||||
* for FECS in GR.
|
||||
*
|
||||
* @return 0 in case of success, < 0 in case of failure.
|
||||
*/
|
||||
int (*fecs_ecc_init)(struct gk20a *g);
|
||||
|
||||
/**
|
||||
* @brief Detect ECC enabled units in GR engine.
|
||||
|
||||
@@ -160,7 +160,9 @@ void nvgpu_gr_init(struct gk20a *g);
|
||||
* that is required to enable GR engine h/w in #nvgpu_gr_enable_hw().
|
||||
*
|
||||
* This initialization includes reading netlist ucode and allocating
|
||||
* memory for internal data structures required to enable h/w.
|
||||
* memory for internal data structures required to enable h/w. This
|
||||
* function allocates memory for FECS ECC error counters and GR
|
||||
* interrupt structure.
|
||||
*
|
||||
* Note that the rest of the s/w initialization is completed in
|
||||
* #nvgpu_gr_init_support() function.
|
||||
|
||||
@@ -53,14 +53,26 @@
|
||||
|
||||
static int gr_init_ecc_fail_alloc(struct gk20a *g)
|
||||
{
|
||||
int err, i, loop = 28;
|
||||
int err, i, loop = 26;
|
||||
struct nvgpu_posix_fault_inj *kmem_fi =
|
||||
nvgpu_kmem_get_fault_injection();
|
||||
struct nvgpu_gr_config *save_gr_config = g->gr->config;
|
||||
|
||||
for (i = 0; i < loop; i++) {
|
||||
nvgpu_posix_enable_fault_injection(kmem_fi, true, i);
|
||||
err = g->ops.gr.ecc.init(g);
|
||||
err = g->ops.gr.ecc.gpc_tpc_ecc_init(g);
|
||||
if (err == 0) {
|
||||
return UNIT_FAIL;
|
||||
}
|
||||
nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
|
||||
g->ops.ecc.ecc_init_support(g);
|
||||
}
|
||||
|
||||
loop = 2;
|
||||
|
||||
for (i = 0; i < loop; i++) {
|
||||
nvgpu_posix_enable_fault_injection(kmem_fi, true, i);
|
||||
err = g->ops.gr.ecc.fecs_ecc_init(g);
|
||||
if (err == 0) {
|
||||
return UNIT_FAIL;
|
||||
}
|
||||
|
||||
@@ -98,6 +98,11 @@ int test_gr_init_prepare(struct unit_module *m, struct gk20a *g, void *args)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = g->ops.ecc.ecc_init_support(g);
|
||||
if (err != 0) {
|
||||
unit_return_fail(m, "ecc init failed\n");
|
||||
}
|
||||
|
||||
err = nvgpu_gr_prepare_sw(g);
|
||||
if (err != 0) {
|
||||
unit_return_fail(m, "nvgpu_gr_prepare_sw returned fail\n");
|
||||
|
||||
@@ -170,7 +170,8 @@ int test_gr_remove_support(struct unit_module *m, struct gk20a *g, void *args);
|
||||
* Input: #test_gr_init_setup, #test_gr_init_prepare and #test_gr_init_support
|
||||
* must have been executed successfully.
|
||||
*
|
||||
* Targets: gv11b_gr_ecc_init, gv11b_ecc_detect_enabled_units.
|
||||
* Targets: gv11b_gr_gpc_tpc_ecc_init, gv11b_gr_fecs_ecc_init and
|
||||
* gv11b_ecc_detect_enabled_units.
|
||||
*
|
||||
* Steps:
|
||||
* - Array with various combinations setting register bits for
|
||||
|
||||
Reference in New Issue
Block a user