gpu: nvgpu: update sm ecc_status_error handling

Reuse the gv11b_gr_intr_handle_tpc_sm_ecc_exception
function for future chips to avoid code duplication.

Add the sm_ecc_status_errors HAL to read
the SM ecc_status errors.

Jira NVGPU-5033

Signed-off-by: Vinod G <vinodg@nvidia.com>
Change-Id: I4a25837d9b833a48307b9353b82ff6597f985e41
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2325537
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Vinod G
2020-04-07 18:16:23 -07:00
committed by Alex Waterman
parent 72d01afd0c
commit 6a7bf6cdc0
5 changed files with 557 additions and 186 deletions

View File

@@ -58,6 +58,38 @@ struct nvgpu_gr_isr_data;
#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE U32(0)
#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_TRUE U32(1)
#define SHIFT_8_BITS 8U
#define MAX_SM_ECC_ERR_COUNT 8U
/*
 * Kinds of SM ECC error. One of these values is passed as the err_type
 * argument of gv11b_gr_intr_sm_ecc_status_errors() to select which SM
 * ecc_status error type is being handled. Values are numbered explicitly
 * and contiguously from 0 to 5.
 */
enum nvgpu_gr_sm_ecc_error_types {
	SM_LRF_ECC_ERROR     = 0U,	/* SM LRF ECC error */
	SM_L1_DATA_ECC_ERROR = 1U,	/* SM L1 data ECC error */
	SM_L1_TAG_ERROR      = 2U,	/* SM L1 tag error */
	SM_CBU_ECC_ERROR     = 3U,	/* SM CBU ECC error */
	SM_ICACHE_ECC_ERROR  = 4U,	/* SM instruction cache ECC error */
	SM_RAMS_ECC_ERROR    = 5U	/* SM RAMs ECC error */
};
/*
 * Use this struct with each SM ecc_status_error type.
 *
 * It is the output parameter of gv11b_gr_intr_sm_ecc_status_errors():
 * it carries the corrected/uncorrected status read from an SM ecc_status
 * register together with the list of error ids to report to SDL.
 */
struct nvgpu_gr_sm_ecc_status {
/*
 * Total ecc errors reporting back to SDL
 * from each sm exception
 * NOTE(review): presumably the number of valid entries in err_id[];
 * confirm against the implementation.
 */
u32 err_count;
/*
 * Error index report to SDL.
 * Holds at most MAX_SM_ECC_ERR_COUNT (8) entries.
 */
u32 err_id[MAX_SM_ECC_ERR_COUNT];
/* Reported corrected error status from SM ecc_status */
u32 corrected_err_status;
/* Reported uncorrected error status from SM ecc_status */
u32 uncorrected_err_status;
};
int gv11b_gr_intr_handle_fecs_error(struct gk20a *g,
struct nvgpu_channel *ch_ptr,
struct nvgpu_gr_isr_data *isr_data);
@@ -109,6 +141,9 @@ u64 gv11b_gr_intr_get_sm_hww_warp_esr_pc(struct gk20a *g, u32 offset);
u32 gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val(void);
/*
 * gv11b implementation of the gr.intr sm_ecc_status_errors HAL, which
 * reads the SM ecc_status errors for one error type.
 *
 * g              - GPU driver instance.
 * ecc_status_reg - NOTE(review): presumably the raw value read from the
 *                  SM ecc_status register for err_type; confirm against
 *                  the caller.
 * err_type       - which SM ECC error type ecc_status_reg belongs to.
 * ecc_status     - output structure for the decoded status and the
 *                  error ids to report.
 *
 * NOTE(review): the meaning of the bool return value is not visible in
 * this header; confirm against the implementation before relying on it.
 */
bool gv11b_gr_intr_sm_ecc_status_errors(struct gk20a *g,
u32 ecc_status_reg, enum nvgpu_gr_sm_ecc_error_types err_type,
struct nvgpu_gr_sm_ecc_status *ecc_status);
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
void gv11b_gr_intr_set_shader_exceptions(struct gk20a *g, u32 data);
#endif

View File

File diff suppressed because it is too large Load Diff

View File

@@ -736,6 +736,8 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask,
.get_ctxsw_checksum_mismatch_mailbox_val =
gv11b_gr_intr_ctxsw_checksum_mismatch_mailbox_val,
.sm_ecc_status_errors =
gv11b_gr_intr_sm_ecc_status_errors,
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
.handle_tex_exception = NULL,
.set_shader_exceptions =

View File

@@ -754,6 +754,8 @@ static const struct gpu_ops tu104_ops = {
gv11b_gr_intr_get_sm_hww_global_esr,
.get_sm_no_lock_down_hww_global_esr_mask =
gv11b_gr_intr_get_sm_no_lock_down_hww_global_esr_mask,
.sm_ecc_status_errors =
gv11b_gr_intr_sm_ecc_status_errors,
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
.handle_tex_exception = NULL,
.set_shader_exceptions =

View File

@@ -45,6 +45,9 @@ struct nvgpu_fecs_ecc_status;
struct nvgpu_fecs_host_intr_status;
struct netlist_av_list;
struct nvgpu_hw_err_inject_info_desc;
/*
 * Forward declarations for the types used by the sm_ecc_status_errors
 * HAL in struct gops_gr_intr.
 *
 * NOTE(review): declaring an enum tag without its enumerator list is a
 * GNU C extension, not ISO C (an incomplete enum type is a constraint
 * violation under C11 6.7.2.3). Confirm every supported toolchain
 * accepts it, or include the header that defines the enum instead.
 */
struct nvgpu_gr_sm_ecc_status;
enum nvgpu_gr_sm_ecc_error_types;
#ifdef CONFIG_NVGPU_FECS_TRACE
struct nvgpu_gr_subctx;
@@ -504,6 +507,9 @@ struct gops_gr_intr {
u32 (*get_sm_no_lock_down_hww_global_esr_mask)(
struct gk20a *g);
u32 (*get_ctxsw_checksum_mismatch_mailbox_val)(void);
bool (*sm_ecc_status_errors)(struct gk20a *g, u32 ecc_status_reg,
enum nvgpu_gr_sm_ecc_error_types err_type,
struct nvgpu_gr_sm_ecc_status *ecc_status);
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
void (*handle_tex_exception)(struct gk20a *g,
u32 gpc, u32 tpc);