gpu: nvgpu: remove usage of hw headers from SDL

This patch does the following:
(1) Removes the usage of hw headers in the SDL unit. For this purpose, it
    moves the initialization of the error-injection descriptors and the
    injection functions for errors that can be injected with hw support
    into the respective HAL units, and passes the required information to
    SDL via the HAL layers.
(2) Renames (i) PWR to PMU, and (ii) nvgpu_report_ecc_parity_err to
    nvgpu_report_ecc_err.
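
For illustration, a minimal sketch of the resulting flow, assuming a
hypothetical SDL-side helper (the helper itself is not part of this
change): SDL consumes only the injection tables returned through the HAL
getters added here, e.g. g->ops.fb.get_hubmmu_err_desc(), so it no longer
needs any chip-specific hw_*.h header.

    #include <nvgpu/nvgpu_err.h>  /* nvgpu_hw_err_inject_info(_desc) */

    /*
     * Hypothetical helper: inject the idx-th HUBMMU error listed in the
     * descriptor table published by the HAL. struct gk20a and gpu_ops
     * come from the usual nvgpu headers.
     */
    static int sdl_inject_hubmmu_err(struct gk20a *g, u32 idx, u32 error_info)
    {
            struct nvgpu_hw_err_inject_info_desc *desc =
                    g->ops.fb.get_hubmmu_err_desc(g);

            if (desc == NULL || idx >= desc->info_size) {
                    return -EINVAL;
            }

            /* Register address/value are resolved inside the HAL unit. */
            return desc->info_ptr[idx].inject_hw_fault(g,
                    &desc->info_ptr[idx], error_info);
    }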

Jira NVGPU-3235

Change-Id: I69290af78c09fbb5b792058e7bc6cc8b6ba340c9
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2112837
Reviewed-by: Raghuram Kothakota <rkothakota@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:    Rajesh Devaraj
Date:      2019-05-06 15:46:40 +05:30
Committer: mobile promotions
Parent:    90aee0086f
Commit:    05ed37ae3a
16 changed files with 470 additions and 57 deletions

View File

@@ -31,6 +31,46 @@
#include <nvgpu/hw/gv11b/hw_fb_gv11b.h>
static struct nvgpu_hw_err_inject_info hubmmu_ecc_err_desc[] = {
NVGPU_ECC_ERR("hubmmu_l2tlb_sa_data_ecc_uncorrected",
gv11b_fb_intr_inject_hubmmu_ecc_error,
fb_mmu_l2tlb_ecc_control_r,
fb_mmu_l2tlb_ecc_control_inject_uncorrected_err_f),
NVGPU_ECC_ERR("hubmmu_tlb_sa_data_ecc_uncorrected",
gv11b_fb_intr_inject_hubmmu_ecc_error,
fb_mmu_hubtlb_ecc_control_r,
fb_mmu_hubtlb_ecc_control_inject_uncorrected_err_f),
NVGPU_ECC_ERR("hubmmu_pte_data_ecc_uncorrected",
gv11b_fb_intr_inject_hubmmu_ecc_error,
fb_mmu_fillunit_ecc_control_r,
fb_mmu_fillunit_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc hubmmu_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g)
{
hubmmu_err_desc.info_ptr = hubmmu_ecc_err_desc;
hubmmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(hubmmu_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &hubmmu_err_desc;
}
int gv11b_fb_intr_inject_hubmmu_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err,
u32 error_info)
{
unsigned int reg_addr = err->get_reg_addr();
nvgpu_info(g, "Injecting HUBMMU fault %s", err->name);
nvgpu_writel(g, reg_addr, err->get_reg_val(1U));
return 0;
}
static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status)
{
u32 ecc_addr, corrected_cnt, uncorrected_cnt;
@@ -83,7 +123,7 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status)
if ((ecc_status &
fb_mmu_l2tlb_ecc_status_corrected_err_l2tlb_sa_data_m())
!= 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_l2tlb_ecc_corrected_err_count[0].counter);
@@ -92,7 +132,7 @@ static void gv11b_fb_intr_handle_ecc_l2tlb(struct gk20a *g, u32 ecc_status)
if ((ecc_status &
fb_mmu_l2tlb_ecc_status_uncorrected_err_l2tlb_sa_data_m())
!= 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_l2tlb_ecc_uncorrected_err_count[0].counter);
@@ -161,7 +201,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status)
if ((ecc_status &
fb_mmu_hubtlb_ecc_status_corrected_err_sa_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_hubtlb_ecc_corrected_err_count[0].counter);
@@ -169,7 +209,7 @@ static void gv11b_fb_intr_handle_ecc_hubtlb(struct gk20a *g, u32 ecc_status)
}
if ((ecc_status &
fb_mmu_hubtlb_ecc_status_uncorrected_err_sa_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_hubtlb_ecc_uncorrected_err_count[0].counter);
@@ -239,7 +279,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status)
if ((ecc_status &
fb_mmu_fillunit_ecc_status_corrected_err_pte_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PTE_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter);
@@ -248,7 +288,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status)
if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pte_data_m())
!= 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);
@@ -256,7 +296,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status)
}
if ((ecc_status &
fb_mmu_fillunit_ecc_status_corrected_err_pde0_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PDE0_DATA_ECC_CORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_corrected_err_count[0].counter);
@@ -265,7 +305,7 @@ static void gv11b_fb_intr_handle_ecc_fillunit(struct gk20a *g, u32 ecc_status)
if ((ecc_status &
fb_mmu_fillunit_ecc_status_uncorrected_err_pde0_data_m())
!= 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_HUBMMU,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_HUBMMU,
0, GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
ecc_addr,
g->ecc.fb.mmu_fillunit_ecc_uncorrected_err_count[0].counter);

View File

@@ -25,8 +25,17 @@
#ifndef NVGPU_FB_INTR_ECC_GV11B_H
#define NVGPU_FB_INTR_ECC_GV11B_H
#include <nvgpu/types.h>
#include <nvgpu/nvgpu_err.h>
struct gk20a;
struct nvgpu_hw_err_inject_info;
struct nvgpu_hw_err_inject_info_desc;
void gv11b_fb_intr_handle_ecc(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc *
gv11b_fb_intr_get_hubmmu_err_desc(struct gk20a *g);
int gv11b_fb_intr_inject_hubmmu_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
#endif /* NVGPU_FB_INTR_ECC_GV11B_H */

View File

@@ -28,6 +28,203 @@
#include "ecc_gv11b.h"
static struct nvgpu_hw_err_inject_info fecs_ecc_err_desc[] = {
NVGPU_ECC_ERR("falcon_imem_ecc_corrected",
gv11b_gr_intr_inject_fecs_ecc_error,
gr_fecs_falcon_ecc_control_r,
gr_fecs_falcon_ecc_control_inject_corrected_err_f),
NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected",
gv11b_gr_intr_inject_fecs_ecc_error,
gr_fecs_falcon_ecc_control_r,
gr_fecs_falcon_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc fecs_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_fecs_err_desc(struct gk20a *g)
{
fecs_err_desc.info_ptr = fecs_ecc_err_desc;
fecs_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(fecs_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &fecs_err_desc;
}
int gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info)
{
nvgpu_info(g, "Injecting FECS fault %s", err->name);
nvgpu_writel(g, err->get_reg_addr(), err->get_reg_val(1U));
return 0;
}
static struct nvgpu_hw_err_inject_info gpccs_ecc_err_desc[] = {
NVGPU_ECC_ERR("falcon_imem_ecc_corrected",
gv11b_gr_intr_inject_gpccs_ecc_error,
gr_gpccs_falcon_ecc_control_r,
gr_gpccs_falcon_ecc_control_inject_corrected_err_f),
NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected",
gv11b_gr_intr_inject_gpccs_ecc_error,
gr_gpccs_falcon_ecc_control_r,
gr_gpccs_falcon_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc gpccs_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_gpccs_err_desc(struct gk20a *g)
{
gpccs_err_desc.info_ptr = gpccs_ecc_err_desc;
gpccs_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(gpccs_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &gpccs_err_desc;
}
int gv11b_gr_intr_inject_gpccs_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info)
{
unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
unsigned int gpc = (error_info & 0xFFU);
unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride;
nvgpu_info(g, "Injecting GPCCS fault %s for gpc: %d", err->name, gpc);
nvgpu_writel(g, reg_addr, err->get_reg_val(1U));
return 0;
}
static struct nvgpu_hw_err_inject_info sm_ecc_err_desc[] = {
NVGPU_ECC_ERR("l1_tag_ecc_corrected",
gv11b_gr_intr_inject_sm_ecc_error,
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r,
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_inject_corrected_err_f),
NVGPU_ECC_ERR("l1_tag_ecc_uncorrected",
gv11b_gr_intr_inject_sm_ecc_error,
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r,
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_inject_uncorrected_err_f),
NVGPU_ECC_ERR("cbu_ecc_uncorrected",
gv11b_gr_intr_inject_sm_ecc_error,
gr_pri_gpc0_tpc0_sm_cbu_ecc_control_r,
gr_pri_gpc0_tpc0_sm_cbu_ecc_control_inject_uncorrected_err_f),
NVGPU_ECC_ERR("lrf_ecc_uncorrected",
gv11b_gr_intr_inject_sm_ecc_error,
gr_pri_gpc0_tpc0_sm_lrf_ecc_control_r,
gr_pri_gpc0_tpc0_sm_lrf_ecc_control_inject_uncorrected_err_f),
NVGPU_ECC_ERR("l1_data_ecc_uncorrected",
gv11b_gr_intr_inject_sm_ecc_error,
gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_r,
gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_inject_uncorrected_err_f),
NVGPU_ECC_ERR("icache_l0_data_ecc_uncorrected",
gv11b_gr_intr_inject_sm_ecc_error,
gr_pri_gpc0_tpc0_sm_icache_ecc_control_r,
gr_pri_gpc0_tpc0_sm_icache_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc sm_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_sm_err_desc(struct gk20a *g)
{
sm_err_desc.info_ptr = sm_ecc_err_desc;
sm_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(sm_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &sm_err_desc;
}
int gv11b_gr_intr_inject_sm_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err,
u32 error_info)
{
unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
unsigned int tpc_stride =
nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
unsigned int gpc = (error_info & 0xFF00U) >> 8U;
unsigned int tpc = (error_info & 0xFFU);
unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride
+ tpc * tpc_stride;
nvgpu_info(g, "Injecting SM fault %s for gpc: %d, tpc: %d",
err->name, gpc, tpc);
nvgpu_writel(g, reg_addr, err->get_reg_val(1U));
return 0;
}
static struct nvgpu_hw_err_inject_info mmu_ecc_err_desc[] = {
NVGPU_ECC_ERR("l1tlb_sa_data_ecc_uncorrected",
gv11b_gr_intr_inject_mmu_ecc_error,
gr_gpc0_mmu_l1tlb_ecc_control_r,
gr_gpc0_mmu_l1tlb_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc mmu_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_mmu_err_desc(struct gk20a *g)
{
mmu_err_desc.info_ptr = mmu_ecc_err_desc;
mmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(mmu_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &mmu_err_desc;
}
int gv11b_gr_intr_inject_mmu_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info)
{
unsigned int gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
unsigned int gpc = (error_info & 0xFFU);
unsigned int reg_addr = err->get_reg_addr() + gpc * gpc_stride;
nvgpu_info(g, "Injecting MMU fault %s for gpc: %d", err->name, gpc);
nvgpu_writel(g, reg_addr, err->get_reg_val(1U));
return 0;
}
static struct nvgpu_hw_err_inject_info gcc_ecc_err_desc[] = {
NVGPU_ECC_ERR("l15_ecc_uncorrected",
gv11b_gr_intr_inject_gcc_ecc_error,
gr_pri_gpc0_gcc_l15_ecc_control_r,
gr_pri_gpc0_gcc_l15_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc gcc_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_gcc_err_desc(struct gk20a *g)
{
gcc_err_desc.info_ptr = gcc_ecc_err_desc;
gcc_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(gcc_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &gcc_err_desc;
}
int gv11b_gr_intr_inject_gcc_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info)
{
unsigned int gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_GPC_STRIDE);
unsigned int gpc = (error_info & 0xFFU);
unsigned int reg_addr = err->get_reg_addr()
+ gpc * gpc_stride;
nvgpu_info(g, "Injecting GCC fault %s for gpc: %d", err->name, gpc);
nvgpu_writel(g, reg_addr, err->get_reg_val(1U));
return 0;
}
void gv11b_ecc_detect_enabled_units(struct gk20a *g)
{
bool opt_ecc_en = g->ops.fuse.is_opt_ecc_enable(g);

View File

@@ -23,9 +23,34 @@
#ifndef NVGPU_ECC_GV11B_H
#define NVGPU_ECC_GV11B_H
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_hw_err_inject_info;
struct nvgpu_hw_err_inject_info_desc;
void gv11b_ecc_detect_enabled_units(struct gk20a *g);
int gv11b_ecc_init(struct gk20a *g);
int gv11b_gr_intr_inject_fecs_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_fecs_err_desc(struct gk20a *g);
int gv11b_gr_intr_inject_gpccs_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_gpccs_err_desc(struct gk20a *g);
int gv11b_gr_intr_inject_sm_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_sm_err_desc(struct gk20a *g);
int gv11b_gr_intr_inject_mmu_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_mmu_err_desc(struct gk20a *g);
int gv11b_gr_intr_inject_gcc_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
struct nvgpu_hw_err_inject_info_desc *
gv11b_gr_intr_get_gcc_err_desc(struct gk20a *g);
#endif /* NVGPU_ECC_GV11B_H */

View File

@@ -154,7 +154,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
if ((l1_tag_ecc_status &
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() |
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m())) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_TAG_ECC_CORRECTED, 0,
@@ -162,7 +162,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_TAG_MISS_FIFO_ECC_CORRECTED, 0,
@@ -170,7 +170,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED, 0,
@@ -195,7 +195,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
if ((l1_tag_ecc_status &
(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() |
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m())) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
@@ -203,7 +203,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0,
@@ -211,7 +211,7 @@ static void gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0,
@@ -293,7 +293,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_corrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_LRF_ECC_CORRECTED, 0,
@@ -314,7 +314,7 @@ static void gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_uncorrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_LRF_ECC_UNCORRECTED, 0,
@@ -387,7 +387,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
g->ecc.gr.sm_cbu_ecc_corrected_err_count[gpc][tpc].counter +=
cbu_corrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_CBU_ECC_CORRECTED,
@@ -408,7 +408,7 @@ static void gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
cbu_uncorrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_CBU_ECC_UNCORRECTED,
@@ -477,7 +477,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
g->ecc.gr.sm_l1_data_ecc_corrected_err_count[gpc][tpc].counter +=
l1_data_corrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_DATA_ECC_CORRECTED,
@@ -498,7 +498,7 @@ static void gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
l1_data_uncorrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_L1_DATA_ECC_UNCORRECTED,
@@ -575,7 +575,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
0);
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L0_DATA_ECC_CORRECTED,
@@ -583,7 +583,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L0_PREDECODE_ECC_CORRECTED,
@@ -591,7 +591,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L1_DATA_ECC_CORRECTED,
@@ -599,7 +599,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED,
@@ -623,7 +623,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
0);
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
@@ -631,7 +631,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
@@ -639,7 +639,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
@@ -647,7 +647,7 @@ static void gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
}
if ((icache_ecc_status &
gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_SM,
(gpc << 8) | tpc,
GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,

View File

@@ -50,28 +50,28 @@ static void gv11b_gr_intr_handle_fecs_ecc_error(struct gk20a *g)
fecs_ecc_status.uncorrected_delta;
if (fecs_ecc_status.imem_corrected_err) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
}
if (fecs_ecc_status.imem_uncorrected_err) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
}
if (fecs_ecc_status.dmem_corrected_err) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_DMEM_ECC_CORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
}
if (fecs_ecc_status.dmem_uncorrected_err) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_FECS, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
fecs_ecc_status.ecc_addr,
g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
@@ -320,7 +320,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
);
}
*corrected_err += gcc_l15_corrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GCC, gpc,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc,
GPU_GCC_L15_ECC_CORRECTED,
0, *corrected_err);
nvgpu_writel(g,
@@ -342,7 +342,7 @@ void gv11b_gr_intr_handle_gcc_exception(struct gk20a *g, u32 gpc,
);
}
*uncorrected_err += gcc_l15_uncorrected_err_count_delta;
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GCC, gpc,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GCC, gpc,
GPU_GCC_L15_ECC_UNCORRECTED,
0, *uncorrected_err);
nvgpu_writel(g,
@@ -430,7 +430,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) !=
0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED,
0, (u32)*corrected_err);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
@@ -438,7 +438,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
0, (u32)*uncorrected_err);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
@@ -446,7 +446,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) !=
0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED,
0, (u32)*corrected_err);
nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
@@ -454,7 +454,7 @@ void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_MMU, gpc,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_MMU, gpc,
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
0, (u32)*uncorrected_err);
nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
@@ -537,28 +537,28 @@ void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
ecc_addr, (u32)*corrected_err);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr, (u32)*uncorrected_err);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED,
ecc_addr, (u32)*corrected_err);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_GPCCS,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_GPCCS,
gpc, GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr, (u32)*uncorrected_err);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");

View File

@@ -64,6 +64,7 @@
#include "hal/fb/fb_gv11b.h"
#include "hal/fb/fb_mmu_fault_gv11b.h"
#include "hal/fb/intr/fb_intr_gv11b.h"
#include "hal/fb/intr/fb_intr_ecc_gv11b.h"
#include "hal/fuse/fuse_gm20b.h"
#include "hal/fuse/fuse_gp10b.h"
#include "hal/ptimer/ptimer_gk20a.h"
@@ -179,6 +180,8 @@ static void gv11b_init_gpu_characteristics(struct gk20a *g)
static const struct gpu_ops gv11b_ops = {
.ltc = {
.get_ltc_err_desc =
gv11b_ltc_get_err_desc,
.determine_L2_size_bytes = gp10b_determine_L2_size_bytes,
#ifdef NVGPU_GRAPHICS
.set_zbc_s_entry = gv11b_ltc_set_zbc_stencil_entry,
@@ -295,6 +298,16 @@ static const struct gpu_ops gv11b_ops = {
.ecc = {
.detect = gv11b_ecc_detect_enabled_units,
.init = gv11b_ecc_init,
.get_mmu_err_desc =
gv11b_gr_intr_get_mmu_err_desc,
.get_gcc_err_desc =
gv11b_gr_intr_get_gcc_err_desc,
.get_sm_err_desc =
gv11b_gr_intr_get_sm_err_desc,
.get_gpccs_err_desc =
gv11b_gr_intr_get_gpccs_err_desc,
.get_fecs_err_desc =
gv11b_gr_intr_get_fecs_err_desc,
},
.ctxsw_prog = {
.hw_get_fecs_header_size =
@@ -651,6 +664,8 @@ static const struct gpu_ops gv11b_ops = {
.is_valid_compute = gv11b_class_is_valid_compute,
},
.fb = {
.get_hubmmu_err_desc =
gv11b_fb_intr_get_hubmmu_err_desc,
.init_hw = gv11b_fb_init_hw,
.init_fs_state = gv11b_fb_init_fs_state,
.cbc_configure = gv11b_fb_cbc_configure,
@@ -998,6 +1013,8 @@ static const struct gpu_ops gv11b_ops = {
.elcg_init_idle_filters = gv11b_elcg_init_idle_filters,
},
.pmu = {
.get_pmu_err_desc =
gv11b_pmu_intr_get_err_desc,
/*
* Basic init ops are must, as PMU engine used by ACR to
* load & bootstrap GR LS falcons without LS PMU, remaining

View File

@@ -154,7 +154,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
if ((ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_RSTG_ECC_CORRECTED, ecc_addr,
@@ -163,7 +163,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
}
if ((ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED, ecc_addr,
@@ -172,7 +172,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
}
if ((ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_TSTG_ECC_CORRECTED, ecc_addr,
@@ -181,7 +181,7 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
}
if ((ecc_status &
ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED, ecc_addr,
@@ -193,13 +193,13 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) != 0U) {
if ((dstg_ecc_addr &
ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED, ecc_addr,
g->ecc.ltc.ecc_sec_count[ltc][slice].counter);
} else {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_BE_ECC_CORRECTED, ecc_addr,
@@ -209,13 +209,13 @@ static void gv11b_ltc_intr_handle_lts_interrupts(struct gk20a *g,
}
if ((ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) != 0U) {
if ((dstg_ecc_addr & ltc_ltc0_lts0_dstg_ecc_address_info_ram_m()) == 0U) {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED, ecc_addr,
g->ecc.ltc.ecc_ded_count[ltc][slice].counter);
} else {
(void) nvgpu_report_ecc_parity_err(g,
(void) nvgpu_report_ecc_err(g,
NVGPU_ERR_MODULE_LTC,
(ltc << 8U) | slice,
GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED, ecc_addr,

View File

@@ -33,6 +33,46 @@
#include <nvgpu/utils.h>
static struct nvgpu_hw_err_inject_info ltc_ecc_err_desc[] = {
NVGPU_ECC_ERR("cache_rstg_ecc_corrected",
gv11b_ltc_inject_ecc_error,
ltc_ltc0_lts0_l1_cache_ecc_control_r,
ltc_ltc0_lts0_l1_cache_ecc_control_inject_corrected_err_f),
NVGPU_ECC_ERR("cache_rstg_ecc_uncorrected",
gv11b_ltc_inject_ecc_error,
ltc_ltc0_lts0_l1_cache_ecc_control_r,
ltc_ltc0_lts0_l1_cache_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc ltc_err_desc;
struct nvgpu_hw_err_inject_info_desc * gv11b_ltc_get_err_desc(struct gk20a *g)
{
ltc_err_desc.info_ptr = ltc_ecc_err_desc;
ltc_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(ltc_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &ltc_err_desc;
}
int gv11b_ltc_inject_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info)
{
unsigned int ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
unsigned int lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE);
unsigned int ltc = (error_info & 0xFF00U) >> 8U;
unsigned int lts = (error_info & 0xFFU);
unsigned int reg_addr = err->get_reg_addr() + ltc * ltc_stride +
lts * lts_stride;
nvgpu_info(g, "Injecting LTC fault %s for ltc: %d, lts: %d",
err->name, ltc, lts);
nvgpu_writel(g, reg_addr, err->get_reg_val(1U));
return 0;
}
#ifdef NVGPU_GRAPHICS
/*
* Sets the ZBC stencil for the passed index.

View File

@@ -22,7 +22,13 @@
#ifndef LTC_GV11B_H
#define LTC_GV11B_H
#include <nvgpu/types.h>
#include <nvgpu/nvgpu_err.h>
struct gk20a;
struct nvgpu_hw_err_inject_info;
struct nvgpu_hw_err_inject_info_desc;
#ifdef NVGPU_GRAPHICS
void gv11b_ltc_set_zbc_stencil_entry(struct gk20a *g,
@@ -30,5 +36,8 @@ void gv11b_ltc_set_zbc_stencil_entry(struct gk20a *g,
u32 index);
#endif /* NVGPU_GRAPHICS */
void gv11b_ltc_init_fs_state(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * gv11b_ltc_get_err_desc(struct gk20a *g);
int gv11b_ltc_inject_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
#endif

View File

@@ -39,6 +39,39 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr);
#define ALIGN_4KB 12
static struct nvgpu_hw_err_inject_info pmu_ecc_err_desc[] = {
NVGPU_ECC_ERR("falcon_imem_ecc_corrected",
gv11b_pmu_inject_ecc_error,
pwr_pmu_falcon_ecc_control_r,
pwr_pmu_falcon_ecc_control_inject_corrected_err_f),
NVGPU_ECC_ERR("falcon_imem_ecc_uncorrected",
gv11b_pmu_inject_ecc_error,
pwr_pmu_falcon_ecc_control_r,
pwr_pmu_falcon_ecc_control_inject_uncorrected_err_f),
};
static struct nvgpu_hw_err_inject_info_desc pmu_err_desc;
struct nvgpu_hw_err_inject_info_desc *
gv11b_pmu_intr_get_err_desc(struct gk20a *g)
{
pmu_err_desc.info_ptr = pmu_ecc_err_desc;
pmu_err_desc.info_size = nvgpu_safe_cast_u64_to_u32(
sizeof(pmu_ecc_err_desc) /
sizeof(struct nvgpu_hw_err_inject_info));
return &pmu_err_desc;
}
int gv11b_pmu_inject_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info)
{
nvgpu_info(g, "Injecting PMU fault %s", err->name);
nvgpu_writel(g, err->get_reg_addr(), err->get_reg_val(1U));
return 0;
}
#ifdef NVGPU_FEATURE_LS_PMU
/* PROD settings for ELPG sequencing registers*/
static struct pg_init_sequence_list _pginitseq_gv11b[] = {
@@ -443,7 +476,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
@@ -451,7 +484,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
@@ -460,7 +493,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_DMEM_ECC_CORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
@@ -468,7 +501,7 @@ static int gv11b_pmu_correct_ecc(struct gk20a *g, u32 ecc_status, u32 ecc_addr)
}
if ((ecc_status &
pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
(void) nvgpu_report_ecc_parity_err(g, NVGPU_ERR_MODULE_PMU, 0,
(void) nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr,
g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);

View File

@@ -23,9 +23,12 @@
#ifndef PMU_GV11B_H
#define PMU_GV11B_H
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_hw_err_inject_info;
struct nvgpu_hw_err_inject_info_desc;
bool gv11b_pmu_is_debug_mode_en(struct gk20a *g);
void gv11b_pmu_flcn_setup_boot_config(struct gk20a *g);
@@ -43,5 +46,8 @@ void gv11b_clear_pmu_bar0_host_err_status(struct gk20a *g);
int gv11b_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status,
u32 *etype);
bool gv11b_pmu_validate_mem_integrity(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * gv11b_pmu_intr_get_err_desc(struct gk20a *g);
int gv11b_pmu_inject_ecc_error(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 error_info);
#endif /* PMU_GV11B_H */

View File

@@ -214,6 +214,8 @@ enum nvgpu_event_id_type {
struct gpu_ops {
struct {
u64 (*determine_L2_size_bytes)(struct gk20a *gk20a);
struct nvgpu_hw_err_inject_info_desc * (*get_ltc_err_desc)
(struct gk20a *g);
#ifdef NVGPU_GRAPHICS
void (*set_zbc_color_entry)(struct gk20a *g,
u32 *color_val_l2,
@@ -411,6 +413,16 @@ struct gpu_ops {
struct {
void (*detect)(struct gk20a *g);
int (*init)(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * (*get_mmu_err_desc)
(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * (*get_gcc_err_desc)
(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * (*get_sm_err_desc)
(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * (*get_gpccs_err_desc)
(struct gk20a *g);
struct nvgpu_hw_err_inject_info_desc * (*get_fecs_err_desc)
(struct gk20a *g);
} ecc;
struct {
u32 (*hw_get_fecs_header_size)(void);
@@ -847,6 +859,8 @@ struct gpu_ops {
} gpu_class;
struct {
struct nvgpu_hw_err_inject_info_desc * (*get_hubmmu_err_desc)
(struct gk20a *g);
void (*init_hw)(struct gk20a *g);
void (*cbc_configure)(struct gk20a *g, struct nvgpu_cbc *cbc);
void (*init_fs_state)(struct gk20a *g);
@@ -1317,6 +1331,8 @@ struct gpu_ops {
u32 (*idle_slowdown_disable)(struct gk20a *g);
} therm;
struct {
struct nvgpu_hw_err_inject_info_desc * (*get_pmu_err_desc)
(struct gk20a *g);
bool (*is_pmu_supported)(struct gk20a *g);
u32 (*falcon_base_addr)(void);
/* reset */

View File

@@ -175,6 +175,27 @@ struct gr_err_info {
struct gr_exception_info *exception_info;
};
#define NVGPU_ECC_ERR(err_name, inject_fn, addr, val) \
{ \
.name = (err_name), \
.inject_hw_fault = (inject_fn), \
.get_reg_addr = (addr), \
.get_reg_val = (val) \
}
struct nvgpu_hw_err_inject_info {
const char *name;
int (*inject_hw_fault)(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 err_info);
u32 (*get_reg_addr)(void);
u32 (*get_reg_val)(u32 val);
};
struct nvgpu_hw_err_inject_info_desc {
struct nvgpu_hw_err_inject_info *info_ptr;
u32 info_size;
};
/* Functions to report errors to 3LSS */
int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
u32 inst, u32 err_id, u32 intr_info);
@@ -182,7 +203,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
u32 inst, u32 err_id, u32 intr_info);
int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst,
int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u64 err_addr, u64 err_count);
int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,

View File

@@ -30,7 +30,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
return 0;
}
int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst,
int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u64 err_addr, u64 err_count)
{
return 0;

View File

@@ -50,7 +50,7 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
return 0;
}
int nvgpu_report_ecc_parity_err(struct gk20a *g, u32 hw_unit, u32 inst,
int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u64 err_addr, u64 err_count)
{
return 0;