gpu: nvgpu: fix ecc counter free

ECC counter structures are freed without removing the node from the
stats_list. This can lead to invalid access due to dangling pointers.

Update the ECC counter free logic to remove each counter from stats_list,
free it only after validating it, and set its pointer to NULL afterwards.

Also update some of the ECC init paths where the error was not propagated
to callers and the ECC counters were not fully deallocated.

Calling a unit's ecc_free from any context (with counters allocated or
not) is now harmless, as the requisite checks are in place.

bug 3326612
bug 3345977

Change-Id: I05eb6ed226cff9197ad37776912da9dcb7e0716d
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2565264
Tested-by: Ashish Mhetre <amhetre@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: svc_kernel_abi <svc_kernel_abi@nvidia.com>
Reviewed-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
Sagar Kamble
2021-06-17 11:34:36 +05:30
committed by mobile promotions
parent 2887d06e3b
commit 40064ef1ec
33 changed files with 546 additions and 218 deletions

View File

@@ -1,7 +1,7 @@
/*
* GA10B FB ECC
*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -50,61 +50,52 @@ int ga10b_fb_ecc_init(struct gk20a *g)
err = gv11b_fb_ecc_init(g);
if (err != 0) {
goto init_fb_gv11b_counters_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_uncorrected_unique_err_count);
if (err != 0) {
goto init_l2tlb_ecc_uncorrected_unique_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_corrected_unique_err_count);
if (err != 0) {
goto init_l2tlb_ecc_corrected_unique_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_uncorrected_unique_err_count);
if (err != 0) {
goto init_hubtlb_ecc_uncorrected_unique_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_corrected_unique_err_count);
if (err != 0) {
goto init_hubtlb_ecc_corrected_unique_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_fillunit_ecc_uncorrected_unique_err_count);
if (err != 0) {
goto init_fillunit_ecc_uncorrected_unique_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_fillunit_ecc_corrected_unique_err_count);
if (err != 0) {
goto init_fillunit_ecc_corrected_unique_fail;
goto init_fb_ecc_err;
}
return 0;
init_fb_ecc_err:
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
ga10b_fb_ecc_free(g);
}
init_fillunit_ecc_corrected_unique_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_uncorrected_unique_err_count);
init_fillunit_ecc_uncorrected_unique_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_corrected_unique_err_count);
init_hubtlb_ecc_corrected_unique_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_uncorrected_unique_err_count);
init_hubtlb_ecc_uncorrected_unique_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_corrected_unique_err_count);
init_l2tlb_ecc_corrected_unique_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_uncorrected_unique_err_count);
init_l2tlb_ecc_uncorrected_unique_fail:
gv11b_fb_ecc_free(g);
init_fb_gv11b_counters_fail:
return err;
}
/*
 * ga10b_fb_ecc_free - release the six GA10B mmu_*_unique_err_count FB ECC
 * counters, then call gv11b_fb_ecc_free() for the remaining FB counters.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so the
 * body appears to show both sides of the change at once: the
 * NVGPU_ECC_COUNTER_FREE_FB() calls and the nvgpu_kfree() calls name the same
 * six counters. Presumably only one of the two groups exists in the tree at
 * any given revision - confirm against the real source file before reading
 * this as a double release.
 */
void ga10b_fb_ecc_free(struct gk20a *g)
{
/* Only referenced by the nvgpu_kfree() group below. */
struct nvgpu_ecc *ecc = &g->ecc;
/* Macro-based release of the GA10B-specific unique-error counters. */
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_corrected_unique_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_uncorrected_unique_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_corrected_unique_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_uncorrected_unique_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_corrected_unique_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_uncorrected_unique_err_count);
/* Direct nvgpu_kfree() of the same six counter pointers (see NOTE above). */
nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_corrected_unique_err_count);
nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_uncorrected_unique_err_count);
nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_corrected_unique_err_count);
nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_uncorrected_unique_err_count);
nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_corrected_unique_err_count);
nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_uncorrected_unique_err_count);
/* The GV11B free routine is invoked last, after the GA10B counters. */
gv11b_fb_ecc_free(g);
}

View File

@@ -1,7 +1,7 @@
/*
* GV11B FB ECC
*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -36,8 +36,8 @@ struct gk20a;
/*
 * Initialise one FB ECC counter. Expands to nvgpu_ecc_counter_init() with the
 * address of g->ecc.fb.<stat> and the stringified field name. The expansion
 * uses an identifier named `g` that is not a macro parameter, so one must be
 * in scope at the call site.
 */
#define NVGPU_ECC_COUNTER_INIT_FB(stat) \
nvgpu_ecc_counter_init(g, &g->ecc.fb.stat, #stat)
/*
 * Release one FB ECC counter.
 *
 * NOTE(review): two definitions of this macro are visible here, one expanding
 * to nvgpu_kfree() on the pointer value and one to nvgpu_ecc_counter_deinit()
 * on the pointer's address. This listing is a rendered diff without +/-
 * markers, so these are presumably the pre- and post-change forms rather than
 * a real redefinition - confirm against the actual header.
 */
#define NVGPU_ECC_COUNTER_FREE_FB(stat) \
nvgpu_kfree(g, g->ecc.fb.stat)
#define NVGPU_ECC_COUNTER_FREE_FB(stat) \
nvgpu_ecc_counter_deinit(g, &g->ecc.fb.stat)
/* Allocate the GV11B FB ECC counters; returns an int status code. */
int gv11b_fb_ecc_init(struct gk20a *g);
/* Release the GV11B FB ECC counters. */
void gv11b_fb_ecc_free(struct gk20a *g);

View File

@@ -1,7 +1,7 @@
/*
* GV11B FB ECC
*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -36,57 +36,49 @@ int gv11b_fb_ecc_init(struct gk20a *g)
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_uncorrected_err_count);
if (err != 0) {
goto init_l2tlb_ecc_uncorrected_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_corrected_err_count);
if (err != 0) {
goto init_l2tlb_ecc_corrected_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_uncorrected_err_count);
if (err != 0) {
goto init_hubtlb_ecc_uncorrected_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_corrected_err_count);
if (err != 0) {
goto init_hubtlb_ecc_corrected_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(
mmu_fillunit_ecc_uncorrected_err_count);
if (err != 0) {
goto init_fillunit_ecc_uncorrected_fail;
goto init_fb_ecc_err;
}
err = NVGPU_ECC_COUNTER_INIT_FB(
mmu_fillunit_ecc_corrected_err_count);
if (err != 0) {
goto init_fillunit_ecc_corrected_fail;
goto init_fb_ecc_err;
}
return 0;
init_fb_ecc_err:
if (err != 0) {
nvgpu_err(g, "ecc counter allocate failed, err=%d", err);
gv11b_fb_ecc_free(g);
}
init_fillunit_ecc_corrected_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_uncorrected_err_count);
init_fillunit_ecc_uncorrected_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_corrected_err_count);
init_hubtlb_ecc_corrected_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_uncorrected_err_count);
init_hubtlb_ecc_uncorrected_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_corrected_err_count);
init_l2tlb_ecc_corrected_fail:
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_uncorrected_err_count);
init_l2tlb_ecc_uncorrected_fail:
return err;
}
/*
 * gv11b_fb_ecc_free - release the six GV11B FB MMU ECC error counters
 * (l2tlb, hubtlb and fillunit; corrected and uncorrected).
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so the
 * body appears to show both sides of the change at once: the nvgpu_kfree()
 * calls and the NVGPU_ECC_COUNTER_FREE_FB() calls name the same six counters.
 * Presumably only one of the two groups exists in the tree at any given
 * revision - confirm against the real source file before reading this as a
 * double release.
 */
void gv11b_fb_ecc_free(struct gk20a *g)
{
/* Only referenced by the nvgpu_kfree() group below. */
struct nvgpu_ecc *ecc = &g->ecc;
/* Direct nvgpu_kfree() of the counter pointers. */
nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_corrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_uncorrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_corrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_uncorrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_corrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_uncorrected_err_count);
/* Macro-based release of the same six counters (see NOTE above). */
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_uncorrected_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_corrected_err_count);
NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_uncorrected_err_count);
}
void gv11b_fb_ecc_l2tlb_error_mask(u32 *corrected_error_mask,