gpu: nvgpu: Err injection utility support

The HSI error injection utility is an on-bench debug and test utility
which can be used by customers and SQA to test the end-to-end error
detection and reporting path.
Implement a callback function to integrate with this utility and allow
injecting GPU HSI related errors.
As part of the callback function hsierrrpt_inj(), invoke the driver's
error-reporting logic, which uses the EPD MISC_EC APIs. In the future,
we can enhance the callback function to trigger the driver's error
handling logic incrementally for different errors.

Bug 3413214

Change-Id: I2d050b6c850d6151b40095f243a6733b4ba74f47
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2727198
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Tejal Kudav
2022-06-09 05:34:54 +00:00
committed by mobile promotions
parent 8b4bc0e51c
commit 494dc19ee8
6 changed files with 96 additions and 2 deletions

View File

@@ -197,6 +197,12 @@ CONFIG_GK20A_DEVFREQ := y
CONFIG_GK20A_PM_QOS := n CONFIG_GK20A_PM_QOS := n
endif endif
ifeq ($(CONFIG_TEGRA_HSIERRRPTINJ),y)
ifeq ($(CONFIG_NVGPU_ENABLE_MISC_EC),y)
CONFIG_NVGPU_FSI_ERR_INJECTION := y
endif
endif
ifeq ($(CONFIG_GK20A_PMU),y) ifeq ($(CONFIG_GK20A_PMU),y)
ccflags-y += -DCONFIG_GK20A_PMU ccflags-y += -DCONFIG_GK20A_PMU
endif endif
@@ -308,3 +314,6 @@ endif
ifeq ($(CONFIG_NVGPU_ENABLE_MISC_EC),y) ifeq ($(CONFIG_NVGPU_ENABLE_MISC_EC),y)
ccflags-y += -DCONFIG_NVGPU_ENABLE_MISC_EC ccflags-y += -DCONFIG_NVGPU_ENABLE_MISC_EC
endif endif
ifeq ($(CONFIG_NVGPU_FSI_ERR_INJECTION),y)
ccflags-y += -DCONFIG_NVGPU_FSI_ERR_INJECTION
endif

View File

@@ -167,6 +167,16 @@ NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_TRACE
CONFIG_NVGPU_FALCON_DEBUG := 1 CONFIG_NVGPU_FALCON_DEBUG := 1
NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_FALCON_DEBUG NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_FALCON_DEBUG
# Enable FSI Error injection support on safety debug and regular build
# when the HSI error injection utility and NVGPU's MISC_EC support is
# enabled.
ifeq ($(CONFIG_TEGRA_HSIERRRPTINJ),1)
ifeq ($(CONFIG_NVGPU_ENABLE_MISC_EC),1)
CONFIG_NVGPU_FSI_ERR_INJECTION := 1
NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_FSI_ERR_INJECTION
endif
endif
# #
# Flags enabled only for regular build profile. # Flags enabled only for regular build profile.
# #

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,6 @@
#include <nvgpu/kmem.h> #include <nvgpu/kmem.h>
#include <nvgpu/log.h> #include <nvgpu/log.h>
#include <nvgpu/cic_mon.h> #include <nvgpu/cic_mon.h>
#include "cic_mon_priv.h" #include "cic_mon_priv.h"
int nvgpu_cic_mon_setup(struct gk20a *g) int nvgpu_cic_mon_setup(struct gk20a *g)
@@ -45,6 +44,21 @@ int nvgpu_cic_mon_setup(struct gk20a *g)
} }
g->cic_mon = cic_mon; g->cic_mon = cic_mon;
#ifdef CONFIG_NVGPU_FSI_ERR_INJECTION
err = nvgpu_cic_mon_reg_errinj_cb(g);
if (err != 0) {
nvgpu_err(g,
"Err inj callback registration failed: %d",
err);
/* Continue CIC init despite err inj utility
* registration failure, as the err inj support
* is meant only for debug purposes.
*/
err = 0;
}
#endif
cic_dbg(g, "CIC_MON unit initialization done."); cic_dbg(g, "CIC_MON unit initialization done.");
return err; return err;
} }

View File

@@ -643,4 +643,7 @@ void nvgpu_cic_mon_intr_nonstall_resume(struct gk20a *g);
void nvgpu_cic_mon_intr_enable(struct gk20a *g); void nvgpu_cic_mon_intr_enable(struct gk20a *g);
#ifdef CONFIG_NVGPU_FSI_ERR_INJECTION
int nvgpu_cic_mon_reg_errinj_cb(struct gk20a *g);
#endif
#endif /* NVGPU_CIC_MON_H */ #endif /* NVGPU_CIC_MON_H */

View File

@@ -21,7 +21,12 @@
#include <linux/tegra-epl.h> #include <linux/tegra-epl.h>
#include <nvgpu/timers.h> #include <nvgpu/timers.h>
#include "os/linux/os_linux.h" #include "os/linux/os_linux.h"
#ifdef CONFIG_NVGPU_FSI_ERR_INJECTION
#include <linux/tegra-hsierrrptinj.h>
#define NVGPU_FSI_REPORTER_ID 0x8016
#endif #endif
#endif
struct gk20a; struct gk20a;
@@ -99,3 +104,48 @@ int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g, u32 err_id)
return ret; return ret;
} }
#ifdef CONFIG_NVGPU_FSI_ERR_INJECTION
/* NvGPU context saved at callback registration and read by the callback. */
static struct gk20a *g_err_inj;

/**
 * Error injection callback registered with the HSI error injection utility.
 *
 * Validates the request and, when it is acceptable, forwards the requested
 * error code to nvgpu_cic_mon_report_err_safety_services().
 *
 * @param inst_id        Instance the request targets; only 0 is accepted.
 * @param err_rpt_frame  Error report frame. Its reporter_id must match
 *                       NVGPU_FSI_REPORTER_ID; its error_code is the error
 *                       that gets reported.
 *
 * @return 0 on success, -EINVAL when the reporter ID or instance ID is not
 *         the expected one, -EFAULT when reporting the error fails.
 */
static int nvgpu_cic_mon_inject_err_fsi(uint32_t inst_id,
		struct epl_error_report_frame err_rpt_frame)
{
	struct gk20a *g = g_err_inj;
	int status;

	/* Reject frames that were not addressed to the NvGPU reporter. */
	if (err_rpt_frame.reporter_id != NVGPU_FSI_REPORTER_ID) {
		nvgpu_err(g, "Invalid Input -> Reporter ID = %u",
			err_rpt_frame.reporter_id);
		return -EINVAL;
	}

	/* Only instance 0 is accepted by this callback. */
	if (inst_id != 0U) {
		nvgpu_err(g, "Invalid Input -> instance ID = %u", inst_id);
		return -EINVAL;
	}

	status = nvgpu_cic_mon_report_err_safety_services(g,
			err_rpt_frame.error_code);
	if (status == 0) {
		return 0;
	}

	/* Any reporting failure is surfaced to the utility as -EFAULT. */
	nvgpu_err(g, "Error injection failed for err_id: %u",
		err_rpt_frame.error_code);
	return -EFAULT;
}
/**
 * Register the NvGPU error injection callback with the HSI error injection
 * utility.
 *
 * The callback is registered for IP_GPU, instance 0.
 *
 * @param g  NvGPU context; saved in g_err_inj so the callback can use it
 *           when an error is injected.
 *
 * @return The value returned by hsierrrpt_reg_cb().
 */
int nvgpu_cic_mon_reg_errinj_cb(struct gk20a *g)
{
	const hsierrrpt_ipid_t gpu_ip_id = IP_GPU;
	const unsigned int gpu_inst_id = 0U;

	/* Keep the context around for use during error injection. */
	g_err_inj = g;

	return hsierrrpt_reg_cb(gpu_ip_id, gpu_inst_id,
			nvgpu_cic_mon_inject_err_fsi);
}
#endif

View File

@@ -56,3 +56,11 @@ int nvgpu_cic_mon_report_err_safety_services(struct gk20a *g,
(void)err_id; (void)err_id;
return 0; return 0;
} }
#ifdef CONFIG_NVGPU_FSI_ERR_INJECTION
/**
 * Stub registration of the error injection callback.
 *
 * This variant registers nothing and always reports success.
 *
 * @param g  NvGPU context; unused here.
 *
 * @return Always 0.
 */
int nvgpu_cic_mon_reg_errinj_cb(struct gk20a *g)
{
	/* Nothing to register in this variant; the argument is not used. */
	(void)g;

	return 0;
}
#endif