gpu: nvgpu: update fb unit ecc init, handling

The ecc init and error handling for the fb unit are refactored to improve
reusability for nvgpu-next.

The following changes have been done:
- fb.ecc:
  This is a new subunit within fb and contains the following functions:
  - init: Moved from fb.fb_ecc_init.
  - free: Moved from fb.fb_ecc_free.
  - l2tlb_error_mask: Fetch the bit masks of the corrected and uncorrected
    errors supported by the unit.
- fb.intr:
  This unit has been updated to include the following ecc interrupt and error
  handlers:
  - handle_ecc: Top level interrupt handler for fb ecc errors.
  - handle_ecc_l2tlb: Handle errors within l2tlb memory.
  - handle_ecc_hubtlb: Handle errors within hubtlb memory.
  - handle_ecc_fillunit: Handle errors within fillunit memory

Jira: NVGPU-5032

Change-Id: I1a26c1823eb992e0e0175250b969f1186dff6e62
Signed-off-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2333271
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Antony Clince Alex
2020-04-22 21:43:51 +05:30
committed by Alex Waterman
parent 8f715117d4
commit 50dcfe1637
24 changed files with 348 additions and 131 deletions

View File

@@ -638,6 +638,7 @@ fb_fusa:
hal/fb/fb_gm20b.h,
hal/fb/fb_gv11b_fusa.c,
hal/fb/fb_gv11b.h,
hal/fb/ecc/fb_ecc_gv11b.h, hal/fb/ecc/fb_ecc_gv11b_fusa.c,
hal/fb/intr/fb_intr_gv11b.h, hal/fb/intr/fb_intr_gv11b_fusa.c,
hal/fb/fb_mmu_fault_gv11b.h, hal/fb/fb_mmu_fault_gv11b_fusa.c,
hal/fb/intr/fb_intr_ecc_gv11b.h, hal/fb/intr/fb_intr_ecc_gv11b_fusa.c ]

View File

@@ -615,6 +615,7 @@ nvgpu-y += \
hal/fb/fb_gm20b_fusa.o \
hal/fb/fb_gv11b_fusa.o \
hal/fb/fb_mmu_fault_gv11b_fusa.o \
hal/fb/ecc/fb_ecc_gv11b_fusa.o \
hal/fb/intr/fb_intr_ecc_gv11b_fusa.o \
hal/fb/intr/fb_intr_gv11b_fusa.o \
hal/fifo/channel_gk20a_fusa.o \

View File

@@ -181,6 +181,7 @@ srcs += hal/mm/mm_gv11b_fusa.c \
hal/fb/fb_gm20b_fusa.c \
hal/fb/fb_gv11b_fusa.c \
hal/fb/fb_mmu_fault_gv11b_fusa.c \
hal/fb/ecc/fb_ecc_gv11b_fusa.c \
hal/fb/intr/fb_intr_ecc_gv11b_fusa.c \
hal/fb/intr/fb_intr_gv11b_fusa.c \
hal/fifo/channel_gk20a_fusa.c \

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -58,8 +58,8 @@ void nvgpu_ecc_free(struct gk20a *g)
nvgpu_gr_ecc_free(g);
nvgpu_ltc_ecc_free(g);
if (g->ops.fb.fb_ecc_free != NULL) {
g->ops.fb.fb_ecc_free(g);
if (g->ops.fb.ecc.free != NULL) {
g->ops.fb.ecc.free(g);
}
#ifdef CONFIG_NVGPU_DGPU

View File

@@ -557,8 +557,8 @@ static int nvgpu_init_mm_setup_sw(struct gk20a *g)
return err;
}
if ((g->ops.fb.fb_ecc_init != NULL) && !g->ecc.initialized) {
err = g->ops.fb.fb_ecc_init(g);
if ((g->ops.fb.ecc.init != NULL) && !g->ecc.initialized) {
err = g->ops.fb.ecc.init(g);
if (err != 0) {
return err;
}

View File

@@ -0,0 +1,47 @@
/*
* GV11B FB ECC
*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_FB_ECC_GV11B_H
#define NVGPU_FB_ECC_GV11B_H
struct gk20a;
/**
 * @brief Allocate and initialize an ECC counter for a memory within FB.
 *
 * @param stat [in] Name of the struct nvgpu_ecc_stat pointer member of
 *                  g->ecc.fb to set up. The name is also stringified and
 *                  passed to nvgpu_ecc_counter_init() as its last argument.
 *
 * Expands to a call to nvgpu_ecc_counter_init() and evaluates to that call's
 * return value. A variable named g must be in scope where the macro is used.
 */
#define NVGPU_ECC_COUNTER_INIT_FB(stat) \
	nvgpu_ecc_counter_init(g, &g->ecc.fb.stat, #stat)

/**
 * @brief Release an ECC counter of a memory within FB.
 *
 * @param stat [in] Name of the pointer member of g->ecc.fb to release.
 *
 * Frees the counter with nvgpu_kfree() and then clears the member, so the
 * stale address is never handed to nvgpu_kfree() a second time (for example
 * by gv11b_fb_ecc_free() running after a partially failed init).
 * NOTE(review): this relies on nvgpu_kfree() accepting NULL, as kfree() and
 * free() do - confirm.
 *
 * Statement-only macro; a variable named g must be in scope where it is used.
 */
#define NVGPU_ECC_COUNTER_FREE_FB(stat) \
	do { \
		nvgpu_kfree(g, g->ecc.fb.stat); \
		g->ecc.fb.stat = NULL; \
	} while (false)
/**
 * @brief Allocate the FB ECC error counters.
 *
 * @param g [in] Pointer to GPU driver struct.
 *
 * Sets up the corrected and uncorrected counters for the l2tlb, hubtlb and
 * fillunit memories through NVGPU_ECC_COUNTER_INIT_FB().
 *
 * @return 0 on success, otherwise the non-zero value returned by the counter
 *         initialization that failed.
 */
int gv11b_fb_ecc_init(struct gk20a *g);
/**
 * @brief Free the FB ECC error counters.
 *
 * @param g [in] Pointer to GPU driver struct.
 */
void gv11b_fb_ecc_free(struct gk20a *g);
/**
 * @brief Fetch the l2tlb corrected and uncorrected error masks.
 *
 * @param corrected_error_mask [out] Receives the l2tlb corrected error mask.
 * @param uncorrected_error_mask [out] Receives the l2tlb uncorrected error
 *                                     mask.
 *
 * Both pointers are written unconditionally.
 */
void gv11b_fb_ecc_l2tlb_error_mask(u32 *corrected_error_mask,
u32 *uncorrected_error_mask);
#endif /* NVGPU_FB_ECC_GV11B_H */

View File

@@ -0,0 +1,101 @@
/*
* GV11B FB ECC
*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include "fb_ecc_gv11b.h"
#include <nvgpu/hw/gv11b/hw_fb_gv11b.h>
/*
 * Allocate the six FB ECC error counters (l2tlb, hubtlb and fillunit, each
 * with an uncorrected and a corrected counter).
 *
 * The counters are set up one after another. When a step fails, every counter
 * set up by the earlier steps is released again, newest first, and the error
 * reported by the failing step is returned. Each unwind label is named after
 * the first counter it releases.
 *
 * Returns 0 on success.
 */
int gv11b_fb_ecc_init(struct gk20a *g)
{
	int err;

	err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_uncorrected_err_count);
	if (err != 0) {
		/* Nothing has been set up yet, so there is nothing to unwind. */
		return err;
	}

	err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_corrected_err_count);
	if (err != 0) {
		goto free_l2tlb_uncorrected;
	}

	err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_uncorrected_err_count);
	if (err != 0) {
		goto free_l2tlb_corrected;
	}

	err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_corrected_err_count);
	if (err != 0) {
		goto free_hubtlb_uncorrected;
	}

	err = NVGPU_ECC_COUNTER_INIT_FB(
			mmu_fillunit_ecc_uncorrected_err_count);
	if (err != 0) {
		goto free_hubtlb_corrected;
	}

	err = NVGPU_ECC_COUNTER_INIT_FB(
			mmu_fillunit_ecc_corrected_err_count);
	if (err != 0) {
		goto free_fillunit_uncorrected;
	}

	return 0;

free_fillunit_uncorrected:
	NVGPU_ECC_COUNTER_FREE_FB(mmu_fillunit_ecc_uncorrected_err_count);
free_hubtlb_corrected:
	NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_corrected_err_count);
free_hubtlb_uncorrected:
	NVGPU_ECC_COUNTER_FREE_FB(mmu_hubtlb_ecc_uncorrected_err_count);
free_l2tlb_corrected:
	NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_corrected_err_count);
free_l2tlb_uncorrected:
	NVGPU_ECC_COUNTER_FREE_FB(mmu_l2tlb_ecc_uncorrected_err_count);
	return err;
}
/*
 * Free the six FB ECC error counters held in g->ecc.fb.
 *
 * Each pointer is cleared right after it is freed. g->ecc outlives this call,
 * so leaving the old addresses in place would turn a second call (or a call
 * that follows the unwind path of a failed init) into a double free.
 * NOTE(review): passing NULL to nvgpu_kfree() is assumed to be harmless, as
 * with kfree()/free() - confirm.
 */
void gv11b_fb_ecc_free(struct gk20a *g)
{
	struct nvgpu_ecc *ecc = &g->ecc;

	nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_corrected_err_count);
	ecc->fb.mmu_l2tlb_ecc_corrected_err_count = NULL;

	nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_uncorrected_err_count);
	ecc->fb.mmu_l2tlb_ecc_uncorrected_err_count = NULL;

	nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_corrected_err_count);
	ecc->fb.mmu_hubtlb_ecc_corrected_err_count = NULL;

	nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_uncorrected_err_count);
	ecc->fb.mmu_hubtlb_ecc_uncorrected_err_count = NULL;

	nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_corrected_err_count);
	ecc->fb.mmu_fillunit_ecc_corrected_err_count = NULL;

	nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_uncorrected_err_count);
	ecc->fb.mmu_fillunit_ecc_uncorrected_err_count = NULL;
}
/*
 * Report the l2tlb ECC status bits this chip treats as corrected and as
 * uncorrected errors.
 *
 * corrected_error_mask   [out] receives the l2tlb SA data corrected-error mask.
 * uncorrected_error_mask [out] receives the l2tlb SA data uncorrected-error
 *                              mask.
 *
 * Both output pointers are written unconditionally.
 */
void gv11b_fb_ecc_l2tlb_error_mask(u32 *corrected_error_mask,
		u32 *uncorrected_error_mask)
{
	const u32 sa_data_corrected =
		fb_mmu_l2tlb_ecc_status_corrected_err_l2tlb_sa_data_m();
	const u32 sa_data_uncorrected =
		fb_mmu_l2tlb_ecc_status_uncorrected_err_l2tlb_sa_data_m();

	*corrected_error_mask = sa_data_corrected;
	*uncorrected_error_mask = sa_data_uncorrected;
}

View File

@@ -1,7 +1,7 @@
/*
* GV11B FB
*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -35,16 +35,4 @@ struct nvgpu_cbc;
void gv11b_fb_cbc_configure(struct gk20a *g, struct nvgpu_cbc *cbc);
#endif
/*
* @brief Allocate and initialize counters for memories within FB.
*
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
*
*/
#define NVGPU_ECC_COUNTER_INIT_FB(stat) \
nvgpu_ecc_counter_init(g, &g->ecc.fb.stat, #stat)
int gv11b_fb_ecc_init(struct gk20a *g);
void gv11b_fb_ecc_free(struct gk20a *g);
#endif /* NVGPU_FB_GV11B_H */

View File

@@ -1,7 +1,7 @@
/*
* GV11B FB
*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -115,47 +115,3 @@ void gv11b_fb_init_fs_state(struct gk20a *g)
nvgpu_writel(g, fb_priv_mmu_phy_secure_r(), U32_MAX);
}
}
/*
 * Allocate the six FB ECC error counters, in the order l2tlb, hubtlb,
 * fillunit (uncorrected before corrected for each memory).
 *
 * Every failure jumps to init_fb_done, which only returns err: counters set
 * up by earlier steps are not released in this function.
 *
 * Returns 0 on success, otherwise the non-zero value of the failing step.
 */
int gv11b_fb_ecc_init(struct gk20a *g)
{
int err = 0;
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_uncorrected_err_count);
if (err != 0) {
goto init_fb_done;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_l2tlb_ecc_corrected_err_count);
if (err != 0) {
goto init_fb_done;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_uncorrected_err_count);
if (err != 0) {
goto init_fb_done;
}
err = NVGPU_ECC_COUNTER_INIT_FB(mmu_hubtlb_ecc_corrected_err_count);
if (err != 0) {
goto init_fb_done;
}
err = NVGPU_ECC_COUNTER_INIT_FB(
mmu_fillunit_ecc_uncorrected_err_count);
if (err != 0) {
goto init_fb_done;
}
/* Last step: its result is returned directly through init_fb_done. */
err = NVGPU_ECC_COUNTER_INIT_FB(
mmu_fillunit_ecc_corrected_err_count);
init_fb_done:
return err;
}
/*
 * Free the six FB ECC error counters held in g->ecc.fb with nvgpu_kfree().
 * The pointers themselves are left unchanged after the free.
 */
void gv11b_fb_ecc_free(struct gk20a *g)
{
struct nvgpu_ecc *ecc = &g->ecc;
nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_corrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_l2tlb_ecc_uncorrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_corrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_hubtlb_ecc_uncorrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_corrected_err_count);
nvgpu_kfree(g, ecc->fb.mmu_fillunit_ecc_uncorrected_err_count);
}

View File

@@ -1,7 +1,7 @@
/*
* GV11B FB INTR ECC
*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -33,6 +33,10 @@ struct nvgpu_hw_err_inject_info;
struct nvgpu_hw_err_inject_info_desc;
void gv11b_fb_intr_handle_ecc(struct gk20a *g);
void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status);
void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status);
void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status);
#ifdef CONFIG_NVGPU_INJECT_HWERR
struct nvgpu_hw_err_inject_info_desc *
gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g);

View File

@@ -1,7 +1,7 @@
/*
* GV11B ECC INTR
*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -32,29 +32,31 @@
#include <nvgpu/hw/gv11b/hw_fb_gv11b.h>
static void gv11b_fb_intr_handle_ecc_l2tlb_errs(struct gk20a *g,
u32 ecc_status, u32 ecc_addr)
u32 ecc_status, u32 ecc_addr)
{
if ((ecc_status &
fb_mmu_l2tlb_ecc_status_corrected_err_l2tlb_sa_data_m())
!= 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED,
u32 corrected_error_mask = 0U;
u32 uncorrected_error_mask = 0U;
g->ops.fb.ecc.l2tlb_error_mask(&corrected_error_mask,
&uncorrected_error_mask);
if ((ecc_status & corrected_error_mask) != 0U) {
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_l2tlb_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
}
if ((ecc_status &
fb_mmu_l2tlb_ecc_status_uncorrected_err_l2tlb_sa_data_m())
!= 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
if ((ecc_status & uncorrected_error_mask) != 0U) {
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
}
}
static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status)
void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status)
{
u32 ecc_addr, corrected_cnt, uncorrected_cnt;
u32 corrected_delta, uncorrected_delta;
@@ -121,27 +123,28 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status)
}
static void gv11b_fb_intr_handle_ecc_hubtlb_errs(struct gk20a *g,
u32 ecc_status, u32 ecc_addr)
u32 ecc_status, u32 ecc_addr)
{
if ((ecc_status &
fb_mmu_hubtlb_ecc_status_corrected_err_sa_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED,
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_hubtlb_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
}
if ((ecc_status &
fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
}
}
static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status)
void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status)
{
u32 ecc_addr, corrected_cnt, uncorrected_cnt;
u32 corrected_delta, uncorrected_delta;
@@ -193,6 +196,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status)
g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter,
uncorrected_delta);
gv11b_fb_intr_handle_ecc_hubtlb_errs(g, ecc_status, ecc_addr);
if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
@@ -208,12 +212,12 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status)
}
static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
u32 ecc_status, u32 ecc_addr)
u32 ecc_status, u32 ecc_addr)
{
if ((ecc_status &
fb_mmu_fillunit_ecc_status_corrected_err_pte_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PTE_DATA_ECC_CORRECTED,
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_PTE_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc pte data error");
@@ -221,16 +225,16 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m())
!= 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pte data error");
}
if ((ecc_status &
fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PDE0_DATA_ECC_CORRECTED,
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_PDE0_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc pde0 data error");
@@ -238,15 +242,15 @@ static void gv11b_fb_intr_handle_ecc_fillunit_errors(struct gk20a *g,
if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m())
!= 0U) {
nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
nvgpu_report_fb_ecc_err(g,
GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc pde0 data error");
}
}
static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status)
void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status)
{
u32 ecc_addr, corrected_cnt, uncorrected_cnt;
u32 corrected_delta, uncorrected_delta;
@@ -321,16 +325,16 @@ void gv11b_fb_intr_handle_ecc(struct gk20a *g)
status = nvgpu_readl(g, fb_mmu_l2tlb_ecc_status_r());
if (status != 0U) {
gv11b_fb_intr_handle_ecc_l2tlb(g, status);
g->ops.fb.intr.handle_ecc_l2tlb(g, status);
}
status = nvgpu_readl(g, fb_mmu_hubtlb_ecc_status_r());
if (status != 0U) {
gv11b_fb_intr_handle_ecc_hubtlb(g, status);
g->ops.fb.intr.handle_ecc_hubtlb(g, status);
}
status = nvgpu_readl(g, fb_mmu_fillunit_ecc_status_r());
if (status != 0U) {
gv11b_fb_intr_handle_ecc_fillunit(g, status);
g->ops.fb.intr.handle_ecc_fillunit(g, status);
}
}

View File

@@ -31,7 +31,6 @@
#include "hal/fb/fb_mmu_fault_gv11b.h"
#include "fb_intr_gv11b.h"
#include "fb_intr_ecc_gv11b.h"
#include <nvgpu/hw/gv11b/hw_fb_gv11b.h>
@@ -90,7 +89,7 @@ void gv11b_fb_intr_isr(struct gk20a *g, u32 intr_unit_bitmask)
}
if ((niso_intr &
fb_niso_intr_mmu_ecc_uncorrected_error_notify_pending_f()) != 0U) {
gv11b_fb_intr_handle_ecc(g);
g->ops.fb.intr.handle_ecc(g);
}
if ((niso_intr &
(fb_niso_intr_mmu_other_fault_notify_m() |

View File

@@ -28,7 +28,6 @@
#include "hal/fb/fb_mmu_fault_tu104.h"
#include "hal/mc/mc_tu104.h"
#include "fb_intr_ecc_gv11b.h"
#include "fb_intr_tu104.h"
#include "nvgpu/hw/tu104/hw_fb_tu104.h"
@@ -95,7 +94,7 @@ void tu104_fb_intr_isr(struct gk20a *g, u32 intr_unit_bitmask)
if (intr_tu104_vector_intr_pending(g,
fb_mmu_int_vector_ecc_error_vector_v(ecc_error))) {
gv11b_fb_intr_handle_ecc(g);
g->ops.fb.intr.handle_ecc(g);
}
if (intr_tu104_vector_intr_pending(g,

View File

@@ -80,6 +80,7 @@
#include "hal/fb/fb_gp10b.h"
#include "hal/fb/fb_gv11b.h"
#include "hal/fb/fb_mmu_fault_gv11b.h"
#include "hal/fb/ecc/fb_ecc_gv11b.h"
#include "hal/fb/intr/fb_intr_gv11b.h"
#include "hal/fb/intr/fb_intr_ecc_gv11b.h"
#include "hal/fuse/fuse_gm20b.h"
@@ -824,8 +825,6 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
#endif
},
.fb = {
.fb_ecc_init = gv11b_fb_ecc_init,
.fb_ecc_free = gv11b_fb_ecc_free,
#ifdef CONFIG_NVGPU_INJECT_HWERR
.get_hubmmu_err_desc =
gv11b_fb_intr_get_hubmmu_err_desc,
@@ -882,12 +881,21 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
.is_fault_buf_enabled = gv11b_fb_is_fault_buf_enabled,
.fault_buf_set_state_hw = gv11b_fb_fault_buf_set_state_hw,
.fault_buf_configure_hw = gv11b_fb_fault_buf_configure_hw,
.ecc = {
.init = gv11b_fb_ecc_init,
.free = gv11b_fb_ecc_free,
.l2tlb_error_mask = gv11b_fb_ecc_l2tlb_error_mask,
},
.intr = {
.enable = gv11b_fb_intr_enable,
.disable = gv11b_fb_intr_disable,
.isr = gv11b_fb_intr_isr,
.is_mmu_fault_pending =
gv11b_fb_intr_is_mmu_fault_pending,
.handle_ecc = gv11b_fb_intr_handle_ecc,
.handle_ecc_l2tlb = gv11b_fb_intr_handle_ecc_l2tlb,
.handle_ecc_hubtlb = gv11b_fb_intr_handle_ecc_hubtlb,
.handle_ecc_fillunit = gv11b_fb_intr_handle_ecc_fillunit,
},
},
.cg = {

View File

@@ -67,6 +67,8 @@
#include "hal/fb/fb_tu104.h"
#include "hal/fb/fb_mmu_fault_gv11b.h"
#include "hal/fb/fb_mmu_fault_tu104.h"
#include "hal/fb/ecc/fb_ecc_gv11b.h"
#include "hal/fb/intr/fb_intr_ecc_gv11b.h"
#include "hal/fb/intr/fb_intr_tu104.h"
#include "hal/ptimer/ptimer_gk20a.h"
#include "hal/ptimer/ptimer_gp10b.h"
@@ -845,8 +847,6 @@ static const struct gpu_ops tu104_ops = {
#endif
},
.fb = {
.fb_ecc_init = gv11b_fb_ecc_init,
.fb_ecc_free = gv11b_fb_ecc_free,
.fbpa_ecc_init = tu104_fbpa_ecc_init,
.fbpa_ecc_free = tu104_fbpa_ecc_free,
.init_hw = gv11b_fb_init_hw,
@@ -907,12 +907,21 @@ static const struct gpu_ops tu104_ops = {
.get_vidmem_size = tu104_fb_get_vidmem_size,
#endif
.apply_pdb_cache_war = tu104_fb_apply_pdb_cache_war,
.ecc = {
.init = gv11b_fb_ecc_init,
.free = gv11b_fb_ecc_free,
.l2tlb_error_mask = gv11b_fb_ecc_l2tlb_error_mask,
},
.intr = {
.enable = tu104_fb_intr_enable,
.disable = tu104_fb_intr_disable,
.isr = tu104_fb_intr_isr,
.is_mmu_fault_pending =
tu104_fb_intr_is_mmu_fault_pending,
.handle_ecc = gv11b_fb_intr_handle_ecc,
.handle_ecc_l2tlb = gv11b_fb_intr_handle_ecc_l2tlb,
.handle_ecc_hubtlb = gv11b_fb_intr_handle_ecc_hubtlb,
.handle_ecc_fillunit = gv11b_fb_intr_handle_ecc_fillunit,
}
},
.nvdec = {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -222,6 +222,9 @@ struct nvgpu_ecc {
struct nvgpu_ecc_stat *mmu_fillunit_ecc_corrected_err_count;
/** hubmmu fillunit uncorrected error count. */
struct nvgpu_ecc_stat *mmu_fillunit_ecc_uncorrected_err_count;
#if defined(CONFIG_NVGPU_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
#include "include/nvgpu/nvgpu_next_ecc.h"
#endif
} fb;
/**

View File

@@ -86,16 +86,44 @@ struct gops_fb_intr {
* @return true in case of mmu faults pending, false otherwise.
*/
bool (*is_mmu_fault_pending)(struct gk20a *g);
/**
* @brief Handle fb ecc error interrupts.
*
* @param g [in] Pointer to GPU driver struct.
*
* This function handles ecc errors generated from memories within
* the fb.
*/
void (*handle_ecc)(struct gk20a *g);
/**
* @brief Handle l2tlb ecc errors.
*
* @param g [in] Pointer to GPU driver struct.
* @param status [in] Value read from the l2tlb ecc status register.
*
* This function handles ecc faults in l2tlb memory.
*/
void (*handle_ecc_l2tlb)(struct gk20a *g, u32 status);
/**
* @brief Handle hubmmu tlb ecc errors.
*
* @param g [in] Pointer to GPU driver struct.
* @param status [in] Value read from the hubtlb ecc status register.
*
* This function handles ecc faults in hubmmu tlb memory.
*/
void (*handle_ecc_hubtlb)(struct gk20a *g, u32 status);
/**
* @brief Handle hubmmu fillunit ecc errors.
*
* @param g [in] Pointer to GPU driver struct.
* @param status [in] Value read from the fillunit ecc status register.
*
* This function handles ecc faults in hubmmu fillunit memory.
*/
void (*handle_ecc_fillunit)(struct gk20a *g, u32 status);
};
/**
* common.fb unit hal operations.
*
* This structure stores common.fb unit hal pointers.
*
* @see gpu_ops
*/
struct gops_fb {
struct gops_fb_ecc {
/**
* @brief Initialize FB unit ECC support.
*
@@ -106,7 +134,7 @@ struct gops_fb {
*
* @return 0 in case of success, < 0 in case of failure.
*/
int (*fb_ecc_init)(struct gk20a *g);
int (*init)(struct gk20a *g);
/**
* @brief Free FB unit ECC support.
@@ -116,7 +144,31 @@ struct gops_fb {
* This function deallocates memory allocated for ecc error counts
* for FB unit.
*/
void (*fb_ecc_free)(struct gk20a *g);
void (*free)(struct gk20a *g);
/**
* @brief Fetch bitmask for l2tlb corrected, uncorrected errors.
*
* @param corrected_error_mask [out] Pointer to write corrected error
* mask.
* @param uncorrected_error_mask [out] Pointer to write uncorrected
* error mask.
*
* Fetches a bit mask of all the corrected, uncorrected errors supported
* by l2tlb.
*/
void (*l2tlb_error_mask)(u32 *corrected_error_mask,
u32 *uncorrected_error_mask);
};
/**
* common.fb unit hal operations.
*
* This structure stores common.fb unit hal pointers.
*
* @see gpu_ops
*/
struct gops_fb {
/**
* @brief Initializes frame buffer h/w configuration.
@@ -306,6 +358,8 @@ struct gops_fb {
*/
void (*fault_buf_set_state_hw)(struct gk20a *g, u32 index, u32 state);
struct gops_fb_ecc ecc;
struct gops_fb_intr intr;
/** @cond DOXYGEN_SHOULD_SKIP_THIS */

View File

@@ -463,6 +463,29 @@ void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_id, u64 err_addr, u64 err_count);
/**
 * @brief Report an FB MMU (HUBMMU) ECC error to 3LSS.
 *
 * @param g [in]		- The GPU driver struct.
 * @param err_id [in]		- Error index.
 *				  - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED
 *				  - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
 * @param err_addr [in]		- Error address, i.e. the location at which the
 *				  correctable or uncorrectable error occurred.
 * @param err_count [in]	- Error count.
 *
 * Convenience wrapper that forwards to nvgpu_report_ecc_err() with the
 * hardware unit fixed to NVGPU_ERR_MODULE_HUBMMU and the instance fixed to 0.
 *
 * @return None
 */
static inline void nvgpu_report_fb_ecc_err(struct gk20a *g, u32 err_id,
		u64 err_addr, u64 err_count)
{
	/* Every report from this helper uses HUBMMU instance 0. */
	const u32 hubmmu_inst = 0U;

	nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU, hubmmu_inst,
			err_id, err_addr, err_count);
}
/**
* @brief Report CTXSW error to 3LSS.
*

View File

@@ -115,6 +115,11 @@ gv11b_device_info_parse_data
gv11b_elcg_init_idle_filters
gv11b_fb_ecc_free
gv11b_fb_ecc_init
gv11b_fb_ecc_l2tlb_error_mask
gv11b_fb_intr_handle_ecc
gv11b_fb_intr_handle_ecc_l2tlb
gv11b_fb_intr_handle_ecc_hubtlb
gv11b_fb_intr_handle_ecc_fillunit
gv11b_fb_fault_buf_configure_hw
gv11b_fb_fault_buf_set_state_hw
gv11b_fb_fault_buffer_get_ptr_update

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -192,7 +192,7 @@ int test_ecc_free(struct unit_module *m, struct gk20a *g,
* - "nvgpu_ecc_free" should skip freeing ecc counters for fb, fpba,
* pmu and return without faulting.
*/
g->ops.fb.fb_ecc_free = NULL;
g->ops.fb.ecc.free = NULL;
g->ops.pmu.ecc_free = NULL;
g->ecc.ltc.ecc_sec_count = nvgpu_kzalloc(g,
sizeof(*g->ecc.ltc.ecc_sec_count));
@@ -210,7 +210,7 @@ int test_ecc_free(struct unit_module *m, struct gk20a *g,
* - fb and pmu ecc HALs have ecc free handles are set.
* - "nvgpu_ecc_free" should return without faulting.
*/
g->ops.fb.fb_ecc_free = mock_ecc_free;
g->ops.fb.ecc.free = mock_ecc_free;
g->ops.pmu.ecc_free = mock_ecc_free;
g->ecc.ltc.ecc_sec_count = nvgpu_kzalloc(g,
sizeof(*g->ecc.ltc.ecc_sec_count));

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -31,7 +31,9 @@
#include "hal/mc/mc_gp10b.h"
#include "hal/fb/fb_gm20b.h"
#include "hal/fb/fb_gv11b.h"
#include "hal/fb/ecc/fb_ecc_gv11b.h"
#include "hal/fb/intr/fb_intr_gv11b.h"
#include "hal/fb/intr/fb_intr_ecc_gv11b.h"
#include <nvgpu/hw/gv11b/hw_fb_gv11b.h>
#include "fb_fusa.h"
@@ -46,8 +48,13 @@ int fb_gv11b_init_test(struct unit_module *m, struct gk20a *g, void *args)
g->ops.ecc.ecc_init_support = nvgpu_ecc_init_support;
g->ops.fb.init_hw = gv11b_fb_init_hw;
g->ops.fb.init_fs_state = gv11b_fb_init_fs_state;
g->ops.fb.fb_ecc_init = gv11b_fb_ecc_init;
g->ops.fb.fb_ecc_free = gv11b_fb_ecc_free;
g->ops.fb.ecc.init = gv11b_fb_ecc_init;
g->ops.fb.ecc.free = gv11b_fb_ecc_free;
g->ops.fb.ecc.l2tlb_error_mask = gv11b_fb_ecc_l2tlb_error_mask;
/* ECC interrupt handler HALs; each assignment is its own statement. */
g->ops.fb.intr.handle_ecc = gv11b_fb_intr_handle_ecc;
g->ops.fb.intr.handle_ecc_l2tlb = gv11b_fb_intr_handle_ecc_l2tlb;
g->ops.fb.intr.handle_ecc_hubtlb = gv11b_fb_intr_handle_ecc_hubtlb;
g->ops.fb.intr.handle_ecc_fillunit = gv11b_fb_intr_handle_ecc_fillunit;
/* Other HALs */
g->ops.mc.intr_stall_unit_config = mc_gp10b_intr_stall_unit_config;
@@ -94,19 +101,19 @@ int fb_gv11b_init_test(struct unit_module *m, struct gk20a *g, void *args)
*/
for (int i = 0; i < 5; i++) {
nvgpu_posix_enable_fault_injection(kmem_fi, true, i);
err = g->ops.fb.fb_ecc_init(g);
err = g->ops.fb.ecc.init(g);
nvgpu_posix_enable_fault_injection(kmem_fi, false, 0);
if (err != -ENOMEM) {
unit_return_fail(m, "gv11b_fb_ecc_init did not fail as expected (%d)\n", i);
}
}
err = g->ops.fb.fb_ecc_init(g);
err = g->ops.fb.ecc.init(g);
if (err != 0) {
unit_return_fail(m, "gv11b_fb_ecc_init failed\n");
}
g->ops.fb.fb_ecc_free(g);
g->ops.fb.ecc.free(g);
return UNIT_SUCCESS;
}

View File

@@ -33,7 +33,9 @@
#include "hal/fb/fb_gm20b.h"
#include "hal/fb/fb_gv11b.h"
#include "hal/fb/fb_mmu_fault_gv11b.h"
#include "hal/fb/ecc/fb_ecc_gv11b.h"
#include "hal/fb/intr/fb_intr_gv11b.h"
#include "hal/fb/intr/fb_intr_ecc_gv11b.h"
#include <nvgpu/hw/gv11b/hw_fb_gv11b.h>
#include <nvgpu/hw/gv11b/hw_mc_gv11b.h>
@@ -45,8 +47,13 @@
int fb_intr_gv11b_init_test(struct unit_module *m, struct gk20a *g, void *args)
{
/* HALs under test */
g->ops.fb.fb_ecc_init = gv11b_fb_ecc_init;
g->ops.fb.fb_ecc_free = gv11b_fb_ecc_free;
g->ops.fb.ecc.init = gv11b_fb_ecc_init;
g->ops.fb.ecc.free = gv11b_fb_ecc_free;
g->ops.fb.ecc.l2tlb_error_mask = gv11b_fb_ecc_l2tlb_error_mask;
g->ops.fb.intr.handle_ecc = gv11b_fb_intr_handle_ecc;
g->ops.fb.intr.handle_ecc_l2tlb = gv11b_fb_intr_handle_ecc_l2tlb;
g->ops.fb.intr.handle_ecc_hubtlb = gv11b_fb_intr_handle_ecc_hubtlb;
g->ops.fb.intr.handle_ecc_fillunit = gv11b_fb_intr_handle_ecc_fillunit;
return UNIT_SUCCESS;
}
@@ -159,7 +166,7 @@ int fb_intr_gv11b_ecc_test(struct unit_module *m, struct gk20a *g, void *args)
unit_return_fail(m, "Invalid subcase\n");
}
g->ops.fb.fb_ecc_init(g);
g->ops.fb.ecc.init(g);
/* Set the interrupt status as corrected */
nvgpu_writel(g, p->status_reg, p->corrected_status);
@@ -205,7 +212,7 @@ int fb_intr_gv11b_ecc_test(struct unit_module *m, struct gk20a *g, void *args)
/* Clear interrupt status */
nvgpu_writel(g, p->status_reg, 0);
g->ops.fb.fb_ecc_free(g);
g->ops.fb.ecc.free(g);
return UNIT_SUCCESS;
}

View File

@@ -155,7 +155,7 @@ int test_init_mm(struct unit_module *m, struct gk20a *g, void *args)
gv11b_fb_read_mmu_fault_buffer_size;
g->ops.fb.init_hw = gv11b_fb_init_hw;
g->ops.fb.intr.enable = gv11b_fb_intr_enable;
g->ops.fb.fb_ecc_init = NULL;
g->ops.fb.ecc.init = NULL;
err = nvgpu_init_mm_support(g);
if (err != 0) {

View File

@@ -278,10 +278,10 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args)
ARBITRARY_ERROR, 16);
/* Making g->ops.fb.ecc.init fail */
g->ops.fb.fb_ecc_init = int_empty_hal;
g->ops.fb.ecc.init = int_empty_hal;
errors += nvgpu_init_mm_support_inject_error(m, g, ERROR_TYPE_HAL, 2,
ARBITRARY_ERROR, 17);
g->ops.fb.fb_ecc_init = NULL;
g->ops.fb.ecc.init = NULL;
/*
* Extra cases for branch coverage: change support flags to test