mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-25 11:04:51 +03:00
gpu: nvgpu: fix ecc counter free
ECC counter structures are freed without removing the node from the stats_list. This can lead to invalid access due to dangling pointers. Update the ecc counter free logic to set them to NULL upon free, to remove them from stats_list and free them by validation. Also updated some of the ecc init paths where error was not propagated to callers and full ecc counters deallocation was not done. Now, calling unit ecc_free from any context (with counters allocated or not) is harmless as requisite checks are in place. bug 3326612 bug 3345977 Change-Id: I05eb6ed226cff9197ad37776912da9dcb7e0716d Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2565264 Tested-by: Ashish Mhetre <amhetre@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: svc_kernel_abi <svc_kernel_abi@nvidia.com> Reviewed-by: Konsta Holtta <kholtta@nvidia.com> Reviewed-by: Deepak Nibade <dnibade@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
This commit is contained in:
committed by
mobile promotions
parent
2887d06e3b
commit
40064ef1ec
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -36,5 +36,6 @@ struct nvgpu_hw_err_inject_info_desc *
|
||||
|
||||
void ga10b_ecc_detect_enabled_units(struct gk20a *g);
|
||||
int ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g);
|
||||
void ga10b_gr_gpc_tpc_ecc_deinit(struct gk20a *g);
|
||||
|
||||
#endif /* NVGPU_ECC_GA10B_H */
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -158,11 +158,40 @@ void ga10b_ecc_detect_enabled_units(struct gk20a *g)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Set up the GA10B-specific per-TPC SM RAMs ECC counters (corrected and
 * uncorrected) and return 0 once the end of the body is reached.
 *
 * NOTE(review): this hunk shows two signatures, a bare
 * gv11b_gr_gpc_tpc_ecc_init() call, and each counter listed both through
 * NVGPU_ECC_COUNTER_INIT_PER_TPC and through the _OR_RETURN variant. These
 * look like the removed and added sides of the diff with the +/- markers
 * lost -- confirm against the repository. The macro definitions are not
 * visible here; presumably the _OR_RETURN variant returns from this function
 * on allocation failure -- TODO confirm.
 */
int ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g)
|
||||
static int _ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g)
|
||||
{
|
||||
gv11b_gr_gpc_tpc_ecc_init(g);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_rams_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_rams_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_rams_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_rams_ecc_uncorrected_err_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Allocate the GPC/TPC ECC counters used on GA10B.
 *
 * Runs the GV11B counter setup first and then adds the GA10B-specific
 * counters via _ga10b_gr_gpc_tpc_ecc_init(). If either step reports a
 * non-zero error, the failure is logged, ga10b_gr_gpc_tpc_ecc_deinit() is
 * called to release whatever was set up so far, and the error is returned.
 *
 * Returns 0 on success, otherwise the non-zero error code of the step that
 * failed. (Previously this function returned 0 unconditionally, hiding
 * allocation failures from the caller.)
 */
int ga10b_gr_gpc_tpc_ecc_init(struct gk20a *g)
{
	int err;

	err = gv11b_gr_gpc_tpc_ecc_init(g);
	if (err != 0) {
		goto done;
	}

	err = _ga10b_gr_gpc_tpc_ecc_init(g);

done:
	if (err != 0) {
		nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
		ga10b_gr_gpc_tpc_ecc_deinit(g);
	}

	/* Propagate the failure instead of reporting success. */
	return err;
}
|
||||
|
||||
/*
 * Tear down the GPC/TPC ECC counters for GA10B: first the GV11B set through
 * gv11b_gr_gpc_tpc_ecc_deinit(), then the two GA10B-specific per-TPC SM RAMs
 * counters (corrected and uncorrected).
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_TPC is defined elsewhere;
 * presumably it tolerates counters that were never allocated -- confirm.
 */
void ga10b_gr_gpc_tpc_ecc_deinit(struct gk20a *g)
|
||||
{
|
||||
gv11b_gr_gpc_tpc_ecc_deinit(g);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_rams_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_rams_ecc_uncorrected_err_count);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -126,25 +126,25 @@ void gp10b_ecc_detect_enabled_units(struct gk20a *g)
|
||||
|
||||
/*
 * Set up the per-TPC SM ECC counters on GP10B: LRF single/double error
 * counts and SHM SEC/SED/DED counts. Returns 0 once the end of the body is
 * reached.
 *
 * NOTE(review): every counter appears twice below, once through
 * NVGPU_ECC_COUNTER_INIT_PER_TPC and once through the _OR_RETURN variant.
 * These look like the removed and added sides of the diff with the +/-
 * markers lost -- confirm. Failure behaviour of the macros is not visible
 * here.
 */
static int gp10b_ecc_init_tpc_sm(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_single_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_double_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_shm_ecc_sec_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_shm_ecc_sed_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_shm_ecc_ded_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_single_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_double_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_shm_ecc_sec_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_shm_ecc_sed_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_shm_ecc_ded_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Set up the per-TPC TEX ECC counters on GP10B: total and unique SEC/DED
 * counts for pipe 0 and pipe 1. Returns 0 once the end of the body is
 * reached.
 *
 * NOTE(review): every counter appears twice below, once through
 * NVGPU_ECC_COUNTER_INIT_PER_TPC and once through the _OR_RETURN variant.
 * These look like the removed and added sides of the diff with the +/-
 * markers lost -- confirm. Failure behaviour of the macros is not visible
 * here.
 */
static int gp10b_ecc_init_tpc_tex(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_sec_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_ded_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_sec_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_ded_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_sec_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_ecc_total_ded_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_sec_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(tex_unique_ecc_ded_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_sec_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_ded_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_sec_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_ded_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_sec_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_ecc_total_ded_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_sec_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(tex_unique_ecc_ded_pipe1_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -170,8 +170,36 @@ int gp10b_gr_ecc_init(struct gk20a *g)
|
||||
err = gp10b_ecc_init_tpc(g);
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
||||
nvgpu_ecc_free(g);
|
||||
gp10b_gr_ecc_deinit(g);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Tear down the per-TPC SM ECC counters on GP10B (LRF single/double and SHM
 * SEC/SED/DED), mirroring the counters named in gp10b_ecc_init_tpc_sm().
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_TPC is defined elsewhere;
 * presumably it is safe on counters that were never allocated -- confirm.
 */
static void gp10b_ecc_deinit_tpc_sm(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_single_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_double_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_shm_ecc_sec_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_shm_ecc_sed_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_shm_ecc_ded_count);
|
||||
}
|
||||
|
||||
/*
 * Tear down the per-TPC TEX ECC counters on GP10B (total and unique SEC/DED
 * counts for pipe 0 and pipe 1), mirroring the counters named in
 * gp10b_ecc_init_tpc_tex().
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_TPC is defined elsewhere;
 * presumably it is safe on counters that were never allocated -- confirm.
 */
static void gp10b_ecc_deinit_tpc_tex(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_sec_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_ded_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_sec_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_ded_pipe0_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_sec_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_ecc_total_ded_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_sec_pipe1_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(tex_unique_ecc_ded_pipe1_count);
|
||||
}
|
||||
|
||||
/*
 * Public teardown entry point for the GP10B GR ECC counters: releases the
 * SM counters and then the TEX counters via the two static helpers.
 */
void gp10b_gr_ecc_deinit(struct gk20a *g)
|
||||
{
|
||||
gp10b_ecc_deinit_tpc_sm(g);
|
||||
|
||||
gp10b_ecc_deinit_tpc_tex(g);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -27,5 +27,6 @@ struct gk20a;
|
||||
|
||||
void gp10b_ecc_detect_enabled_units(struct gk20a *g);
|
||||
int gp10b_gr_ecc_init(struct gk20a *g);
|
||||
void gp10b_gr_ecc_deinit(struct gk20a *g);
|
||||
|
||||
#endif /* NVGPU_ECC_GP10B_H */
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -35,6 +35,8 @@ void gv11b_ecc_detect_enabled_units(struct gk20a *g);
|
||||
int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g);
|
||||
int gv11b_gr_fecs_ecc_init(struct gk20a *g);
|
||||
|
||||
void gv11b_gr_gpc_tpc_ecc_deinit(struct gk20a *g);
|
||||
void gv11b_gr_fecs_ecc_deinit(struct gk20a *g);
|
||||
|
||||
#ifdef CONFIG_NVGPU_INJECT_HWERR
|
||||
void gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -177,20 +177,20 @@ void gv11b_ecc_detect_enabled_units(struct gk20a *g)
|
||||
|
||||
/*
 * Set up the per-TPC SM corrected-error ECC counters on GV11B: L1 tag, CBU,
 * L1 data and icache. Returns 0 once the end of the body is reached.
 *
 * NOTE(review): every counter appears twice below, once through
 * NVGPU_ECC_COUNTER_INIT_PER_TPC and once through the _OR_RETURN variant.
 * These look like the removed and added sides of the diff with the +/-
 * markers lost -- confirm. Failure behaviour of the macros is not visible
 * here.
 */
static int gv11b_ecc_init_sm_corrected_err_count(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_tag_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_cbu_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_data_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_icache_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_tag_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_cbu_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_data_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_icache_ecc_corrected_err_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Set up the per-TPC SM uncorrected-error ECC counters on GV11B: L1 tag,
 * CBU, L1 data and icache. Returns 0 once the end of the body is reached.
 *
 * NOTE(review): every counter appears twice below, once through
 * NVGPU_ECC_COUNTER_INIT_PER_TPC and once through the _OR_RETURN variant.
 * These look like the removed and added sides of the diff with the +/-
 * markers lost -- confirm. Failure behaviour of the macros is not visible
 * here.
 */
static int gv11b_ecc_init_sm_uncorrected_err_count(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_tag_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_cbu_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_l1_data_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_icache_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_tag_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_cbu_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_l1_data_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_icache_ecc_uncorrected_err_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -199,8 +199,8 @@ static int gv11b_ecc_init_tpc(struct gk20a *g)
|
||||
{
|
||||
int ret;
|
||||
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_single_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC(sm_lrf_ecc_double_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_single_err_count);
|
||||
NVGPU_ECC_COUNTER_INIT_PER_TPC_OR_RETURN(sm_lrf_ecc_double_err_count);
|
||||
|
||||
ret = gv11b_ecc_init_sm_corrected_err_count(g);
|
||||
if (ret != 0) {
|
||||
@@ -268,22 +268,23 @@ int gv11b_gr_gpc_tpc_ecc_init(struct gk20a *g)
|
||||
done:
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
||||
nvgpu_ecc_free(g);
|
||||
gv11b_gr_gpc_tpc_ecc_deinit(g);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int gv11b_gr_fecs_ecc_init(struct gk20a *g)
|
||||
{
|
||||
int err;
|
||||
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_uncorrected_err_count);
|
||||
err = NVGPU_ECC_COUNTER_INIT_PER_GR(fecs_ecc_uncorrected_err_count);
|
||||
if (err != 0) {
|
||||
goto done;
|
||||
}
|
||||
err = NVGPU_ECC_COUNTER_INIT_GR(fecs_ecc_corrected_err_count);
|
||||
err = NVGPU_ECC_COUNTER_INIT_PER_GR(fecs_ecc_corrected_err_count);
|
||||
if (err != 0) {
|
||||
goto done;
|
||||
}
|
||||
@@ -291,8 +292,60 @@ int gv11b_gr_fecs_ecc_init(struct gk20a *g)
|
||||
done:
|
||||
if (err != 0) {
|
||||
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
|
||||
nvgpu_ecc_free(g);
|
||||
gv11b_gr_fecs_ecc_deinit(g);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Tear down the per-TPC SM corrected-error ECC counters on GV11B (L1 tag,
 * CBU, L1 data, icache), mirroring
 * gv11b_ecc_init_sm_corrected_err_count().
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_TPC is defined elsewhere;
 * presumably it is safe on counters that were never allocated -- confirm.
 */
static void gv11b_ecc_deinit_sm_corrected_err_count(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_tag_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_cbu_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_data_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_icache_ecc_corrected_err_count);
|
||||
}
|
||||
|
||||
/*
 * Tear down the per-TPC SM uncorrected-error ECC counters on GV11B (L1 tag,
 * CBU, L1 data, icache), mirroring
 * gv11b_ecc_init_sm_uncorrected_err_count().
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_TPC is defined elsewhere;
 * presumably it is safe on counters that were never allocated -- confirm.
 */
static void gv11b_ecc_deinit_sm_uncorrected_err_count(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_tag_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_cbu_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_l1_data_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_icache_ecc_uncorrected_err_count);
|
||||
}
|
||||
|
||||
/*
 * Tear down all per-TPC ECC counters on GV11B: the two SM LRF counters
 * directly, then the SM corrected and uncorrected counter groups through
 * their dedicated helpers.
 */
static void gv11b_ecc_deinit_tpc(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_single_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_TPC(sm_lrf_ecc_double_err_count);
|
||||
|
||||
gv11b_ecc_deinit_sm_corrected_err_count(g);
|
||||
gv11b_ecc_deinit_sm_uncorrected_err_count(g);
|
||||
}
|
||||
|
||||
/*
 * Tear down the per-GPC ECC counters on GV11B: GCC L1.5, GPCCS and MMU
 * L1 TLB, each in corrected and uncorrected form.
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_GPC is defined elsewhere;
 * presumably it is safe on counters that were never allocated -- confirm.
 */
static void gv11b_ecc_deinit_gpc(struct gk20a *g)
|
||||
{
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gcc_l15_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gcc_l15_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gpccs_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(gpccs_ecc_corrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(mmu_l1tlb_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GPC(mmu_l1tlb_ecc_corrected_err_count);
|
||||
}
|
||||
|
||||
/*
 * Public teardown entry point for the GV11B GPC/TPC ECC counters: emits a
 * GR debug log entry, then releases the per-TPC counters followed by the
 * per-GPC counters.
 */
void gv11b_gr_gpc_tpc_ecc_deinit(struct gk20a *g)
|
||||
{
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
gv11b_ecc_deinit_tpc(g);
|
||||
|
||||
gv11b_ecc_deinit_gpc(g);
|
||||
}
|
||||
|
||||
/*
 * Public teardown entry point for the GV11B FECS ECC counters: emits a GR
 * debug log entry, then releases the uncorrected and corrected FECS
 * counters.
 *
 * NOTE(review): NVGPU_ECC_COUNTER_DEINIT_PER_GR is defined elsewhere;
 * presumably it is safe on counters that were never allocated -- confirm.
 */
void gv11b_gr_fecs_ecc_deinit(struct gk20a *g)
|
||||
{
|
||||
nvgpu_log(g, gpu_dbg_gr, " ");
|
||||
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GR(fecs_ecc_uncorrected_err_count);
|
||||
NVGPU_ECC_COUNTER_DEINIT_PER_GR(fecs_ecc_corrected_err_count);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user