gpu: nvgpu: SWUD for SDL unit

This patch adds SWUD (SW Unit Design) document for SDL unit. In addition,
it re-names err_type to err_id in error reporting APIs related to ECC, GR,
PRI and MMU, to keep the name consistent with other APIs.

JIRA NVGPU-3758

Change-Id: I968218574aa78144497fc12bd6dab20d1be7aa1c
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2151092
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Rajesh Devaraj
2019-07-10 23:04:15 +05:30
committed by mobile promotions
parent 3659c2f0c1
commit 2d8791e866
4 changed files with 308 additions and 40 deletions

View File

@@ -35,6 +35,7 @@
* *
* - @ref unit-mm * - @ref unit-mm
* - @ref unit-fifo * - @ref unit-fifo
* - @ref unit-sdl
* - Etc, etc. * - Etc, etc.
* *
* nvgpu-driver Level Requirements Table * nvgpu-driver Level Requirements Table

View File

@@ -23,11 +23,22 @@
#ifndef NVGPU_NVGPU_ERR_H #ifndef NVGPU_NVGPU_ERR_H
#define NVGPU_NVGPU_ERR_H #define NVGPU_NVGPU_ERR_H
/**
* @file
*
* Define indices for HW units and errors. Define structures used to carry error
* information. Declare prototype for APIs that are used to report GPU HW errors
* to 3LSS.
*/
#include <nvgpu/types.h> #include <nvgpu/types.h>
struct gk20a; struct gk20a;
struct mmu_fault_info; struct mmu_fault_info;
/**
* This assigns an unique index for hw units in GPU.
*/
#define NVGPU_ERR_MODULE_HOST (0U) #define NVGPU_ERR_MODULE_HOST (0U)
#define NVGPU_ERR_MODULE_SM (1U) #define NVGPU_ERR_MODULE_SM (1U)
#define NVGPU_ERR_MODULE_FECS (2U) #define NVGPU_ERR_MODULE_FECS (2U)
@@ -41,6 +52,9 @@ struct mmu_fault_info;
#define NVGPU_ERR_MODULE_PRI (10U) #define NVGPU_ERR_MODULE_PRI (10U)
#define NVGPU_ERR_MODULE_CE (11U) #define NVGPU_ERR_MODULE_CE (11U)
/**
* This assigns an unique index for errors in HOST unit.
*/
#define GPU_HOST_PFIFO_BIND_ERROR (0U) #define GPU_HOST_PFIFO_BIND_ERROR (0U)
#define GPU_HOST_PFIFO_SCHED_ERROR (1U) #define GPU_HOST_PFIFO_SCHED_ERROR (1U)
#define GPU_HOST_PFIFO_CHSW_ERROR (2U) #define GPU_HOST_PFIFO_CHSW_ERROR (2U)
@@ -60,6 +74,9 @@ struct mmu_fault_info;
#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (16U) #define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (16U)
#define GPU_HOST_INVALID_ERROR (17U) #define GPU_HOST_INVALID_ERROR (17U)
/**
* This assigns an unique index for errors in SM unit.
*/
#define GPU_SM_L1_TAG_ECC_CORRECTED (0U) #define GPU_SM_L1_TAG_ECC_CORRECTED (0U)
#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U) #define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U)
#define GPU_SM_CBU_ECC_CORRECTED (2U) #define GPU_SM_CBU_ECC_CORRECTED (2U)
@@ -79,17 +96,34 @@ struct mmu_fault_info;
#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED (16U) #define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_CORRECTED (16U)
#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U) #define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U)
#define GPU_SM_MACHINE_CHECK_ERROR (18U) #define GPU_SM_MACHINE_CHECK_ERROR (18U)
struct gr_sm_mcerr_info {
u64 hww_warp_esr_pc; /* PC which triggered the machine check error */
u32 hww_warp_esr_status;/* Error status register */
u32 curr_ctx; /* Context which triggered error */
u32 chid; /* Channel to which the context belongs */
u32 tsgid; /* TSG to which the channel is bound */
u32 tpc, gpc, sm;
};
#define GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED (19U) #define GPU_SM_ICACHE_L1_PREDECODE_ECC_CORRECTED (19U)
#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U) #define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U)
/**
* This structure is used to store SM machine check related information.
*/
struct gr_sm_mcerr_info {
/** PC which triggered the machine check error. */
u64 hww_warp_esr_pc;
/** Error status register. */
u32 hww_warp_esr_status;
/** Context which triggered error. */
u32 curr_ctx;
/** Channel to which the context belongs. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
u32 tpc, gpc, sm;
};
/**
* This assigns an unique index for errors in FECS unit.
*/
#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U) #define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U) #define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_FECS_FALCON_DMEM_ECC_CORRECTED (2U) #define GPU_FECS_FALCON_DMEM_ECC_CORRECTED (2U)
@@ -99,6 +133,10 @@ struct gr_sm_mcerr_info {
#define GPU_FECS_FAULT_DURING_CTXSW (6U) #define GPU_FECS_FAULT_DURING_CTXSW (6U)
#define GPU_FECS_CTXSW_INIT_ERROR (7U) #define GPU_FECS_CTXSW_INIT_ERROR (7U)
#define GPU_FECS_INVALID_ERROR (8U) #define GPU_FECS_INVALID_ERROR (8U)
/**
* This structure is used to store CTXSW error related information.
*/
struct ctxsw_err_info { struct ctxsw_err_info {
u32 curr_ctx; u32 curr_ctx;
u32 ctxsw_status0; u32 ctxsw_status0;
@@ -107,11 +145,17 @@ struct ctxsw_err_info {
u32 mailbox_value; u32 mailbox_value;
}; };
/**
* This assigns an unique index for errors in GPCCS unit.
*/
#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U) #define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U) #define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED (2U) #define GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED (2U)
#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U) #define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
/**
* This assigns an unique index for errors in MMU unit.
*/
#define GPU_MMU_L1TLB_ECC_CORRECTED (0U) #define GPU_MMU_L1TLB_ECC_CORRECTED (0U)
#define GPU_MMU_L1TLB_ECC_UNCORRECTED (1U) #define GPU_MMU_L1TLB_ECC_UNCORRECTED (1U)
#define GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED (2U) #define GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED (2U)
@@ -119,15 +163,24 @@ struct ctxsw_err_info {
#define GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED (4U) #define GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED (4U)
#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (5U) #define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (5U)
/**
* This assigns an unique index for errors in GCC unit.
*/
#define GPU_GCC_L15_ECC_CORRECTED (0U) #define GPU_GCC_L15_ECC_CORRECTED (0U)
#define GPU_GCC_L15_ECC_UNCORRECTED (1U) #define GPU_GCC_L15_ECC_UNCORRECTED (1U)
/**
* This assigns an unique index for errors in PMU unit.
*/
#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U) #define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U) #define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_PMU_FALCON_DMEM_ECC_CORRECTED (2U) #define GPU_PMU_FALCON_DMEM_ECC_CORRECTED (2U)
#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U) #define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
#define GPU_PMU_BAR0_ERROR_TIMEOUT (4U) #define GPU_PMU_BAR0_ERROR_TIMEOUT (4U)
/**
* This assigns an unique index for errors in PGRAPH unit.
*/
#define GPU_PGRAPH_FE_EXCEPTION (0U) #define GPU_PGRAPH_FE_EXCEPTION (0U)
#define GPU_PGRAPH_MEMFMT_EXCEPTION (1U) #define GPU_PGRAPH_MEMFMT_EXCEPTION (1U)
#define GPU_PGRAPH_PD_EXCEPTION (2U) #define GPU_PGRAPH_PD_EXCEPTION (2U)
@@ -140,19 +193,31 @@ struct ctxsw_err_info {
#define GPU_PGRAPH_MPC_EXCEPTION (9U) #define GPU_PGRAPH_MPC_EXCEPTION (9U)
#define GPU_PGRAPH_ILLEGAL_ERROR (10U) #define GPU_PGRAPH_ILLEGAL_ERROR (10U)
/* Sub-errors in GPU_PGRAPH_ILLEGAL_ERROR */ /** Sub-errors in GPU_PGRAPH_ILLEGAL_ERROR. */
#define GPU_PGRAPH_ILLEGAL_NOTIFY (0U) #define GPU_PGRAPH_ILLEGAL_NOTIFY (0U)
#define GPU_PGRAPH_ILLEGAL_METHOD (1U) #define GPU_PGRAPH_ILLEGAL_METHOD (1U)
#define GPU_PGRAPH_ILLEGAL_CLASS (2U) #define GPU_PGRAPH_ILLEGAL_CLASS (2U)
#define GPU_PGRAPH_CLASS_ERROR (3U) #define GPU_PGRAPH_CLASS_ERROR (3U)
/**
* This structure is used to store GR exception related information.
*/
struct gr_exception_info { struct gr_exception_info {
u32 curr_ctx; /* Context which triggered the exception */ /** Context which triggered the exception. */
u32 chid; /* Channel bound to the context */ u32 curr_ctx;
u32 tsgid; /* TSG to which the channel is bound */
u32 status; /* Exception status */ /** Channel bound to the context. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
u32 status;
}; };
/**
* This assigns an unique index for errors in LTC unit.
*/
#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U) #define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U) #define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
#define GPU_LTC_CACHE_TSTG_ECC_CORRECTED (2U) #define GPU_LTC_CACHE_TSTG_ECC_CORRECTED (2U)
@@ -162,6 +227,9 @@ struct gr_exception_info {
#define GPU_LTC_CACHE_DSTG_BE_ECC_CORRECTED (6U) #define GPU_LTC_CACHE_DSTG_BE_ECC_CORRECTED (6U)
#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U) #define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
/**
* This assigns an unique index for errors in HUBMMU unit.
*/
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED (0U) #define GPU_HUBMMU_L2TLB_SA_DATA_ECC_CORRECTED (0U)
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U) #define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U)
#define GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED (2U) #define GPU_HUBMMU_TLB_SA_DATA_ECC_CORRECTED (2U)
@@ -172,27 +240,40 @@ struct gr_exception_info {
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U) #define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U)
#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U) #define GPU_HUBMMU_PAGE_FAULT_ERROR (8U)
/* Sub-errors in GPU_HUBMMU_PAGE_FAULT_ERROR */ /** Sub-errors in GPU_HUBMMU_PAGE_FAULT_ERROR. */
#define GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW (0U) #define GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW (0U)
#define GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY (1U) #define GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY (1U)
#define GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW (2U) #define GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW (2U)
#define GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY (3U) #define GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY (3U)
#define GPU_HUBMMU_OTHER_FAULT_NOTIFY (4U) #define GPU_HUBMMU_OTHER_FAULT_NOTIFY (4U)
/**
* This assigns an unique index for errors in PRI unit.
*/
#define GPU_PRI_TIMEOUT_ERROR (0U) #define GPU_PRI_TIMEOUT_ERROR (0U)
#define GPU_PRI_ACCESS_VIOLATION (1U) #define GPU_PRI_ACCESS_VIOLATION (1U)
/**
* This assigns an unique index for errors in CE unit.
*/
#define GPU_CE_LAUNCH_ERROR (0U) #define GPU_CE_LAUNCH_ERROR (0U)
#define GPU_CE_BLOCK_PIPE (1U) #define GPU_CE_BLOCK_PIPE (1U)
#define GPU_CE_NONBLOCK_PIPE (2U) #define GPU_CE_NONBLOCK_PIPE (2U)
#define GPU_CE_INVALID_CONFIG (3U) #define GPU_CE_INVALID_CONFIG (3U)
#define GPU_CE_METHOD_BUFFER_FAULT (4U) #define GPU_CE_METHOD_BUFFER_FAULT (4U)
/**
* This structure is used to store GR error related information.
*/
struct gr_err_info { struct gr_err_info {
struct gr_sm_mcerr_info *sm_mcerr_info; struct gr_sm_mcerr_info *sm_mcerr_info;
struct gr_exception_info *exception_info; struct gr_exception_info *exception_info;
}; };
/**
* This macro is used to initialize the members of nvgpu_hw_err_inject_info
* struct.
*/
#define NVGPU_ECC_ERR(err_name, inject_fn, addr, val) \ #define NVGPU_ECC_ERR(err_name, inject_fn, addr, val) \
{ \ { \
.name = (err_name), \ .name = (err_name), \
@@ -201,7 +282,12 @@ struct gr_err_info {
.get_reg_val = (val) \ .get_reg_val = (val) \
} }
/**
* This structure carries the information required for HW based error injection
* for a given error.
*/
struct nvgpu_hw_err_inject_info { struct nvgpu_hw_err_inject_info {
/** String representation of error. */
const char *name; const char *name;
int (*inject_hw_fault)(struct gk20a *g, int (*inject_hw_fault)(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 err_info); struct nvgpu_hw_err_inject_info *err, u32 err_info);
@@ -209,35 +295,216 @@ struct nvgpu_hw_err_inject_info {
u32 (*get_reg_val)(u32 val); u32 (*get_reg_val)(u32 val);
}; };
/**
* This structure contains a pointer to an array containing HW based error
* injection information and the size of that array.
*/
struct nvgpu_hw_err_inject_info_desc { struct nvgpu_hw_err_inject_info_desc {
struct nvgpu_hw_err_inject_info *info_ptr; struct nvgpu_hw_err_inject_info *info_ptr;
u32 info_size; u32 info_size;
}; };
/* Functions to report errors to 3LSS */ /**
* @brief Report error in HOST unit to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit (HOST).
* @param inst[in] - Instance ID.
* @param err_id[in] - Error index.
* @param intr_info[in] - Content of interrupt status register.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
u32 inst, u32 err_id, u32 intr_info); u32 inst, u32 err_id, u32 intr_info);
/**
* @brief Report error in CE unit to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit (CE).
* @param inst[in] - Instance ID.
* @param err_id[in] - Error index.
* @param intr_info[in] - Content of interrupt status register.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
u32 inst, u32 err_id, u32 intr_info); u32 inst, u32 err_id, u32 intr_info);
/**
* @brief Report ECC error to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit.
* @param inst[in] - Instance ID.
* @param err_id[in] - Error index.
* @param err_addr[in] - Error address.
* @param err_count[in] - Error count.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Validates slice ID and TPC ID for LTC and SM units, respectively.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u64 err_addr, u64 err_count); u32 err_id, u64 err_addr, u64 err_count);
/**
* @brief Report CTXSW error to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit (FECS).
* @param err_id[in] - Error index.
* @param data[in] - CTXSW error information.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
void *data); void *data);
/**
* @brief Report GR error to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit.
* @param inst[in] - Instance ID.
* @param err_id[in] - Error index.
* @param err_info[in] - Error information.
* @param sub_err_type[in] - Sub error type.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type); u32 err_id, struct gr_err_info *err_info, u32 sub_err_type);
/**
* @brief Report PMU error to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit (PMU).
* @param err_id[in] - Error index.
* @param sub_err_type[in] - Sub error type.
* @param status[in] - Error information.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id, int nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
u32 sub_err_type, u32 status); u32 sub_err_type, u32 status);
/**
* @brief Report PRI error to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit (PMU).
* @param err_id[in] - Error index.
* @param sub_err_type[in] - Sub error type.
* @param status[in] - Error information.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u32 err_addr, u32 err_code); u32 err_id, u32 err_addr, u32 err_code);
/**
* @brief Report MMU page fault error to 3LSS.
*
* @param g[in] - The GPU driver struct.
* @param hw_unit[in] - Index of HW unit (PMU).
* @param err_id[in] - Error index.
* @param fault_info[in] - MMU page fault information.
* @param sub_err_type[in] - Sub error type.
*
* - In case of linux/posix implementation, it simply returns 0 since SDL is not
* supported for them.
* - In case of nvgpu-qnx, it does the following:
* - Checks whether SDL is supported in the current GPU platform.
* - Validates the HW unit ID and error ID.
* - Forms error packet and checks whether it exceeds the max size.
* - Sends error packet to report error to 3LSS.
*
* @return 0 in case of success, <0 in case of failure.
* @retval -EINVAL in case of (1) invalid HW unit ID, (2) invalid error ID,
* (3) clock get time API failed, (4) the size of error packet exceeds
* maximum allowed size.
*/
int nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
u32 err_type, struct mmu_fault_info *fault_info, u32 err_id, struct mmu_fault_info *fault_info,
u32 status, u32 sub_err_type); u32 status, u32 sub_err_type);
#endif #endif /* NVGPU_NVGPU_ERR_H */

View File

@@ -32,13 +32,13 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
} }
int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u64 err_addr, u64 err_count) u32 err_id, u64 err_addr, u64 err_count)
{ {
return 0; return 0;
} }
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type) u32 err_id, struct gr_err_info *err_info, u32 sub_err_type)
{ {
return 0; return 0;
} }
@@ -56,7 +56,7 @@ int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
} }
int nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u32 err_addr, u32 err_code) u32 err_id, u32 err_addr, u32 err_code)
{ {
return 0; return 0;
} }
@@ -68,7 +68,7 @@ int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
} }
int nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
u32 err_type, struct mmu_fault_info *fault_info, u32 err_id, struct mmu_fault_info *fault_info,
u32 status, u32 sub_err_type) u32 status, u32 sub_err_type)
{ {
return 0; return 0;

View File

@@ -54,13 +54,13 @@ int nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
} }
int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u64 err_addr, u64 err_count) u32 err_id, u64 err_addr, u64 err_count)
{ {
return 0; return 0;
} }
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type) u32 err_id, struct gr_err_info *err_info, u32 sub_err_type)
{ {
return 0; return 0;
} }
@@ -78,7 +78,7 @@ int nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
} }
int nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst, int nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, u32 err_addr, u32 err_code) u32 err_id, u32 err_addr, u32 err_code)
{ {
return 0; return 0;
} }
@@ -90,7 +90,7 @@ int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
} }
int nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, int nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
u32 err_type, struct mmu_fault_info *fault_info, u32 err_id, struct mmu_fault_info *fault_info,
u32 status, u32 sub_err_type) u32 status, u32 sub_err_type)
{ {
return 0; return 0;