gpu: nvgpu: fix ecc counter free

ECC counter structures are freed without removing the node from the
stats_list. This can lead to invalid access due to dangling pointers.

Update the ecc counter free logic to remove each counter from stats_list,
free it only after validating it, and set it to NULL upon free.

Also updated some of the ecc init paths where error was not propa-
gated to callers and full ecc counters deallocation was not done.

Now, calling unit ecc_free from any context (with counters alloc-
ated or not) is harmless as requisite checks are in place.

bug 3326612
bug 3345977

Change-Id: I05eb6ed226cff9197ad37776912da9dcb7e0716d
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2565264
Tested-by: Ashish Mhetre <amhetre@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: svc_kernel_abi <svc_kernel_abi@nvidia.com>
Reviewed-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
Sagar Kamble
2021-06-17 11:34:36 +05:30
committed by mobile promotions
parent 2887d06e3b
commit 40064ef1ec
33 changed files with 546 additions and 218 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -36,5 +36,6 @@ struct nvgpu_hw_err_inject_info_desc *
void ga10b_ecc_detect_enabled_units(struct gk20a *g);
int ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g);
void ga10b_gr_gpc_tpc_ecc_deinit(struct gk20a *g);
#endif /* NVGPU_ECC_GA10B_H */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -158,11 +158,40 @@ void ga10b_ecc_detect_enabled_units(struct gk20a *g)
}
}
int ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g)
static int _ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g)
{
gv11b_gr_gpc_tpc_ecc_init(g);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_rams_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_rams_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_rams_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_rams_ecc_uncorrected_err_count);
return 0;
}
/*
 * Set up the GPC/TPC ECC counters for ga10b.
 *
 * Runs the gv11b GPC/TPC counter init first and then the ga10b-specific
 * sm_rams counter init. If either step fails, the failure is logged and
 * ga10b_gr_gpc_tpc_ecc_deinit() is called to unwind whatever was set up.
 *
 * Returns 0 on success, or the non-zero error code reported by the failing
 * init step. The error must be handed back to the caller: returning 0 after
 * the counters have been torn down would report success for an init that
 * left no counters behind.
 */
int ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g)
{
	int err;

	err = gv11b_gr_gpc_tpc_ecc_init(g);
	if (err != 0) {
		goto done;
	}

	err = _ga10b_gr_gpc_tpc_ecc_init(g);
	if (err != 0) {
		goto done;
	}

done:
	if (err != 0) {
		nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
		/*
		 * gv11b_gr_gpc_tpc_ecc_init() already runs its own deinit on
		 * failure, so the gv11b part may be deinitialised twice here.
		 * NOTE(review): the change description states deinit is
		 * harmless whether or not counters are allocated - confirm
		 * against the DEINIT macro definitions.
		 */
		ga10b_gr_gpc_tpc_ecc_deinit(g);
	}

	return err;
}
/*
 * Counterpart of ga10b_gr_gpc_tpc_ecc_init(): runs the gv11b GPC/TPC ECC
 * deinit and then invokes the per-TPC DEINIT macro for the two sm_rams
 * counters that the ga10b-specific init sets up.
 *
 * Also used as the unwind step on the init error path.
 * NOTE(review): the DEINIT macro is defined outside this file; per the
 * change description it is expected to tolerate counters that were never
 * allocated - confirm against the macro definition.
 */
void ga10b_gr_gpc_tpc_ecc_deinit(struct gk20a *g)
{
gv11b_gr_gpc_tpc_ecc_deinit(g);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_rams_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_rams_ecc_uncorrected_err_count);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -126,25 +126,25 @@ void gp10b_ecc_detect_enabled_units(struct gk20a *g)
static int gp10b_ecc_init_tpc_sm(struct gk20a *g)
{
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_single_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_double_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_shm_ecc_sec_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_shm_ecc_sed_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_shm_ecc_ded_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_single_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_double_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_shm_ecc_sec_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_shm_ecc_sed_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_shm_ecc_ded_count);
return 0;
}
static int gp10b_ecc_init_tpc_tex(struct gk20a *g)
{
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_sec_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_ded_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_sec_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_ded_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_sec_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_ded_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_sec_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_ded_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_sec_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_ded_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_sec_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_ded_pipe0_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_sec_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_ded_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_sec_pipe1_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_ded_pipe1_count);
return 0;
}
@@ -170,8 +170,36 @@ int gp10b_gr_ecc_init(struct gk20a *g)
err = gp10b_ecc_init_tpc(g);
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
nvgpu_ecc_free(g);
gp10b_gr_ecc_deinit(g);
}
return err;
}
/*
 * Invoke the per-TPC DEINIT macro for the five SM counters (LRF single and
 * double, SHM sec/sed/ded) - the same counter names that
 * gp10b_ecc_init_tpc_sm() initialises.
 *
 * NOTE(review): the macro definition is not visible here; presumably it
 * frees the counter storage for each TPC - confirm.
 */
static void gp10b_ecc_deinit_tpc_sm(struct gk20a *g)
{
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_single_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_double_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_shm_ecc_sec_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_shm_ecc_sed_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_shm_ecc_ded_count);
}
/*
 * Invoke the per-TPC DEINIT macro for the eight TEX counters (total and
 * unique sec/ded counts for pipe0 and pipe1) - the same counter names that
 * gp10b_ecc_init_tpc_tex() initialises.
 *
 * NOTE(review): the macro definition is not visible here; presumably it
 * frees the counter storage for each TPC - confirm.
 */
static void gp10b_ecc_deinit_tpc_tex(struct gk20a *g)
{
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_sec_pipe0_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_ded_pipe0_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_sec_pipe0_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_ded_pipe0_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_sec_pipe1_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_ded_pipe1_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_sec_pipe1_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_ded_pipe1_count);
}
/*
 * Deinitialise the gp10b GR ECC counters by running the SM helper and then
 * the TEX helper. gp10b_gr_ecc_init() calls this when counter allocation
 * fails.
 */
void gp10b_gr_ecc_deinit(struct gk20a *g)
{
gp10b_ecc_deinit_tpc_sm(g);
gp10b_ecc_deinit_tpc_tex(g);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -27,5 +27,6 @@ struct gk20a;
void gp10b_ecc_detect_enabled_units(struct gk20a *g);
int gp10b_gr_ecc_init(struct gk20a *g);
void gp10b_gr_ecc_deinit(struct gk20a *g);
#endif /* NVGPU_ECC_GP10B_H */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -35,6 +35,8 @@ void gv11b_ecc_detect_enabled_units(struct gk20a *g);
int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g);
int gv11b_gr_fecs_ecc_init(struct gk20a *g);
void gv11b_gr_gpc_tpc_ecc_deinit(struct gk20a *g);
void gv11b_gr_fecs_ecc_deinit(struct gk20a *g);
#ifdef CONFIG_NVGPU_INJECT_HWERR
void gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -177,20 +177,20 @@ void gv11b_ecc_detect_enabled_units(struct gk20a *g)
static int gv11b_ecc_init_sm_corrected_err_count(struct gk20a *g)
{
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_tag_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_cbu_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_data_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_icache_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_tag_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_cbu_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_data_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_icache_ecc_corrected_err_count);
return 0;
}
static int gv11b_ecc_init_sm_uncorrected_err_count(struct gk20a *g)
{
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_tag_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_cbu_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_data_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_icache_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_tag_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_cbu_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_data_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_icache_ecc_uncorrected_err_count);
return 0;
}
@@ -199,8 +199,8 @@ static int gv11b_ecc_init_tpc(struct gk20a *g)
{
int ret;
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_single_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_double_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_single_err_count);
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_double_err_count);
ret = gv11b_ecc_init_sm_corrected_err_count(g);
if (ret != 0) {
@@ -268,22 +268,23 @@ int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g)
done:
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
nvgpu_ecc_free(g);
gv11b_gr_gpc_tpc_ecc_deinit(g);
}
return err;
}
int gv11b_gr_fecs_ecc_init(struct gk20a *g)
{
int err;
nvgpu_log(g, gpu_dbg_gr, " ");
err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_uncorrected_err_count);
err = NVGPU_ECC_COUNTER_INIT_PER_GR(fecs_ecc_uncorrected_err_count);
if (err != 0) {
goto done;
}
err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_corrected_err_count);
err = NVGPU_ECC_COUNTER_INIT_PER_GR(fecs_ecc_corrected_err_count);
if (err != 0) {
goto done;
}
@@ -291,8 +292,60 @@ int gv11b_gr_fecs_ecc_init(struct gk20a *g)
done:
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
nvgpu_ecc_free(g);
gv11b_gr_fecs_ecc_deinit(g);
}
return err;
}
/*
 * Invoke the per-TPC DEINIT macro for the four SM corrected-error counters
 * (L1 tag, CBU, L1 data, icache) - the same counter names that
 * gv11b_ecc_init_sm_corrected_err_count() initialises.
 *
 * NOTE(review): the macro definition is not visible here; presumably it
 * frees the counter storage for each TPC - confirm.
 */
static void gv11b_ecc_deinit_sm_corrected_err_count(struct gk20a *g)
{
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_tag_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_cbu_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_data_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_icache_ecc_corrected_err_count);
}
/*
 * Invoke the per-TPC DEINIT macro for the four SM uncorrected-error counters
 * (L1 tag, CBU, L1 data, icache) - the same counter names that
 * gv11b_ecc_init_sm_uncorrected_err_count() initialises.
 *
 * NOTE(review): the macro definition is not visible here; presumably it
 * frees the counter storage for each TPC - confirm.
 */
static void gv11b_ecc_deinit_sm_uncorrected_err_count(struct gk20a *g)
{
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_tag_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_cbu_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_data_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_icache_ecc_uncorrected_err_count);
}
/*
 * Deinitialise the gv11b per-TPC counters: the two SM LRF counters directly,
 * then the SM corrected-error and uncorrected-error groups through their
 * helpers.
 */
static void gv11b_ecc_deinit_tpc(struct gk20a *g)
{
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_single_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_double_err_count);
gv11b_ecc_deinit_sm_corrected_err_count(g);
gv11b_ecc_deinit_sm_uncorrected_err_count(g);
}
/*
 * Invoke the per-GPC DEINIT macro for the six GPC-level counters: GCC L1.5,
 * GPCCS and MMU L1 TLB, each with a corrected and an uncorrected count.
 *
 * NOTE(review): neither the macro definition nor the matching init code is
 * visible here; presumably this undoes the per-GPC counter allocation -
 * confirm.
 */
static void gv11b_ecc_deinit_gpc(struct gk20a *g)
{
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gcc_l15_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gcc_l15_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gpccs_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gpccs_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(mmu_l1tlb_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(mmu_l1tlb_ecc_corrected_err_count);
}
/*
 * Deinitialise the gv11b GPC/TPC ECC counters: per-TPC counters first, then
 * per-GPC counters. gv11b_gr_gpc_tpc_ecc_init() calls this when counter
 * allocation fails, and the ga10b deinit calls it as its first step.
 */
void gv11b_gr_gpc_tpc_ecc_deinit(struct gk20a *g)
{
/* Blank message under gpu_dbg_gr; presumably a function-entry trace. */
nvgpu_log(g, gpu_dbg_gr, " ");
gv11b_ecc_deinit_tpc(g);
gv11b_ecc_deinit_gpc(g);
}
/*
 * Invoke the per-GR DEINIT macro for the two FECS counters (uncorrected and
 * corrected) - the same counter names that gv11b_gr_fecs_ecc_init()
 * initialises. That init function calls this when counter allocation fails.
 *
 * NOTE(review): the macro definition is not visible here; presumably it
 * frees the per-GR counter storage - confirm.
 */
void gv11b_gr_fecs_ecc_deinit(struct gk20a *g)
{
/* Blank message under gpu_dbg_gr; presumably a function-entry trace. */
nvgpu_log(g, gpu_dbg_gr, " ");
NVGPU_ECC_COUNTER_DEINIT_PER_GR(fecs_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_DEINIT_PER_GR(fecs_ecc_corrected_err_count);
}