gpu: nvgpu: Move SDL err info structs to common

The SDL's error reporting code will be leveraged by the central interrupt
controller (CIC) or common.cic unit.
This is a base patch to move the SDL error reporting code from QNX
to common. Move the data structures used during error reporting to the
common header, nvgpu_err_info.h.

JIRA NVGPU-6522

Change-Id: Ie6b209323a14b9bb38e3402c2427fbcdaae52206
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2504726
Reviewed-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Tejal Kudav
2021-03-24 13:38:48 +00:00
committed by mobile promotions
parent d4c33de919
commit d67cea61f1
2 changed files with 336 additions and 1 deletions

View File

@@ -175,7 +175,8 @@ nvlink:
nvgpu_err: nvgpu_err:
safe: yes safe: yes
owner: Unknown owner: Unknown
sources: [ include/nvgpu/nvgpu_err.h ] sources: [ include/nvgpu/nvgpu_err.h,
include/nvgpu/nvgpu_err_info.h]
pramin: pramin:
safe: yes safe: yes

View File

@@ -0,0 +1,334 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation. Any
* use, reproduction, disclosure or distribution of this software and related
* documentation without an express license agreement from NVIDIA Corporation
* is strictly prohibited.
*/
#ifndef NVGPU_ERR_INFO_H
#define NVGPU_ERR_INFO_H
/**
* @file
*
* Declare the format of error message for various hw units in GPU and add
* designated initializers for them.
*/
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_hw_err_inject_info;
/**
 * gpu_err_header holds the fields required to identify the version of the
 * header, the sub-error type, the sub-unit ID, the error address and the
 * time stamp. It is the first member of every per-unit gpu_*_error_info
 * structure declared in this header.
 */
struct gpu_err_header {
/** Version of GPU error header. */
struct {
/** Major version number. */
u16 major;
/** Minor version number. */
u16 minor;
} version;
/** Sub error type corresponding to the error that is being reported. */
u32 sub_err_type;
/** ID of the sub-unit in a HW unit which encountered an error. */
u64 sub_unit_id;
/** Location of the error. */
u64 address;
/** Timestamp in nanoseconds. */
u64 timestamp_ns;
};
/**
 * Error information for host errors (member host_info of union
 * gpu_error_info). Carries only the common error header.
 */
struct gpu_host_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
};
/**
 * Error information for ECC errors (member ecc_info of union
 * gpu_error_info).
 */
struct gpu_ecc_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
/** Number of ECC errors. */
u64 err_cnt;
};
/**
 * Error information for GR errors (member gr_info of union gpu_error_info).
 */
struct gpu_gr_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
/** Context which triggered the exception. */
u32 curr_ctx;
/** Channel bound to the context. */
u32 chid;
/** TSG to which the channel is bound. */
u32 tsgid;
/** Exception status. */
u32 status;
};
/**
 * Error information for SM errors (member sm_info of union gpu_error_info).
 */
struct gpu_sm_error_info {
	/** Common GPU error header. */
	struct gpu_err_header header;
	/** PC at the time the exception was triggered. */
	u64 warp_esr_pc;
	/** SM error status. */
	u32 warp_esr_status;
	/** Context that was current when the exception was triggered. */
	u32 curr_ctx;
	/** Channel ID. */
	u32 chid;
	/** TSG ID. */
	u32 tsgid;
	/** TPC ID. */
	u32 tpc;
	/** GPC ID. */
	u32 gpc;
	/** SM ID. */
	u32 sm;
};
/**
 * This structure describes the debug information reported by the GMMU
 * during MMU page fault exceptions. It is embedded in struct
 * gpu_mmu_error_info as the member "info".
 *
 * The members are not documented individually here; the details of each
 * member can be found in the mmu_fault_info structure defined in
 *
 * + drivers/gpu/nvgpu/include/nvgpu/mmu_fault.h
 */
struct mmu_page_fault_info {
u64 inst_ptr;
u32 inst_aperture;
u64 fault_addr;
u32 fault_addr_aperture;
u32 timestamp_lo;
u32 timestamp_hi;
u32 mmu_engine_id;
u32 gpc_id;
u32 client_type;
u32 client_id;
u32 fault_type;
u32 access_type;
u32 protected_mode;
bool replayable_fault;
u32 replay_fault_en;
bool valid;
u32 faulted_pbdma;
u32 faulted_engine;
u32 faulted_subid;
u32 chid;
};
/**
 * Error information for MMU errors (member mmu_info of union
 * gpu_error_info).
 */
struct gpu_mmu_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
/** Debug information reported for an MMU page fault. */
struct mmu_page_fault_info info;
/** MMU page fault status. */
u32 status;
};
/**
 * Error information for CE errors (member ce_info of union gpu_error_info).
 * Carries only the common error header.
 */
struct gpu_ce_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
};
/**
 * Error information for PRI errors (member pri_info of union
 * gpu_error_info). Carries only the common error header.
 */
struct gpu_pri_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
};
/**
 * Error information for PMU errors (member pmu_err_info of union
 * gpu_error_info).
 */
struct gpu_pmu_error_info {
/** Common GPU error header. */
struct gpu_err_header header;
/** PMU bar0 error status value. */
u32 status;
};
/**
 * Error information for context switch (CTXSW) errors (member ctxsw_info of
 * union gpu_error_info).
 */
struct gpu_ctxsw_error_info {
	/** Common GPU error header. */
	struct gpu_err_header header;
	/** Current context. */
	u32 curr_ctx;
	/** TSG ID. */
	u32 tsgid;
	/** Channel ID. */
	u32 chid;
	/** Context switch status register 0. */
	u32 ctxsw_status0;
	/** Context switch status register 1. */
	u32 ctxsw_status1;
	/** Mailbox value. */
	u32 mailbox_value;
};
/**
 * gpu_error_info holds the error information for each hardware unit whose
 * errors will be reported. Being a union, it stores the information of one
 * unit at a time; every member starts with a struct gpu_err_header.
 */
union gpu_error_info {
/** Host error information. */
struct gpu_host_error_info host_info;
/** ECC error information. */
struct gpu_ecc_error_info ecc_info;
/** GR error information. */
struct gpu_gr_error_info gr_info;
/** SM error information. */
struct gpu_sm_error_info sm_info;
/** CE error information. */
struct gpu_ce_error_info ce_info;
/** PRI error information. */
struct gpu_pri_error_info pri_info;
/** PMU error information. */
struct gpu_pmu_error_info pmu_err_info;
/** Context switch error information. */
struct gpu_ctxsw_error_info ctxsw_info;
/** MMU error information. */
struct gpu_mmu_error_info mmu_info;
};
/**
 * nvgpu_err_msg holds the fields required to identify the source, type and
 * criticality of a reported error, together with the unit-specific error
 * information.
 */
struct nvgpu_err_msg {
/**
 * Identify the hw module which generated the error. List of supported
 * hw modules and errors can be found in
 *
 * + drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
 */
u32 hw_unit_id;
/** Flag to indicate the criticality of the error. */
bool is_critical;
/** Error ID. */
u8 err_id;
/**
 * Size of the error message.
 * NOTE(review): the unit (presumably bytes) is not visible in this
 * header - confirm against the code that fills it in.
 */
u8 err_size;
/** GPU error information. */
union gpu_error_info err_info;
/** Used to get error information from look-up table. */
struct nvgpu_err_desc *err_desc;
};
/**
 * Expands to a braced designated initializer for one struct nvgpu_err_desc.
 *
 * @param err            String representation of the error (name).
 * @param critical       Value for is_critical.
 * @param id             Value for error_id.
 * @param inject_support Value for err_inject_info.type.
 * @param hw_inject_fn   Value for inject_hw_fault.
 * @param sw_inject_fn   Value for inject_sw_fault.
 * @param addr           Value for err_inject_info.get_reg_addr.
 * @param val            Value for err_inject_info.get_reg_val.
 * @param threshold      Value for err_threshold.
 * @param ecount         Value for err_count.
 *
 * The initializers are listed in the declaration order of
 * struct nvgpu_err_desc.
 */
#define GPU_ERR(err, critical, id, inject_support, hw_inject_fn, sw_inject_fn,\
		addr, val, threshold, ecount) \
{ \
	.name = (err), \
	.is_critical = (critical), \
	.err_threshold = (threshold), \
	.err_count = (ecount), \
	.inject_hw_fault = (hw_inject_fn), \
	.inject_sw_fault = (sw_inject_fn), \
	.error_id = (id), \
	.err_inject_info = { \
		.type = (inject_support), \
		.get_reg_addr = (addr), \
		.get_reg_val = (val) \
	} \
}
/**
 * This macro is used to initialize critical errors: it expands to GPU_ERR()
 * with the "critical" argument fixed to true; all other arguments are
 * passed through unchanged.
 *
 * The expansion is preceded by an NVGPU_COV_WHITELIST() annotation naming
 * MISRA Rule 10.3 as a false positive (Bug 2623654). NVGPU_COV_WHITELIST
 * and NVGPU_MISRA are defined outside this header.
 */
#define GPU_CRITERR(err, id, inject_support, hw_inject_fn, sw_inject_fn, addr,\
val, threshold, ecount) \
NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 10_3), "Bug 2623654") \
GPU_ERR(err, true, id, inject_support, hw_inject_fn, sw_inject_fn, \
addr, val, threshold, ecount)
/**
 * This macro is used to initialize non-critical errors: it expands to
 * GPU_ERR() with the "critical" argument fixed to false; all other
 * arguments are passed through unchanged.
 *
 * The expansion is preceded by an NVGPU_COV_WHITELIST() annotation naming
 * MISRA Rule 10.3 as a false positive (Bug 2623654). NVGPU_COV_WHITELIST
 * and NVGPU_MISRA are defined outside this header.
 */
#define GPU_NONCRITERR(err, id, inject_support, hw_inject_fn, sw_inject_fn,\
addr, val, threshold, ecount) \
NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 10_3), "Bug 2623654") \
GPU_ERR(err, false, id, inject_support, hw_inject_fn, sw_inject_fn,\
addr, val, threshold, ecount)
/**
 * This defines the various types of error injection supported in SDL.
 * One of these values is stored in nvgpu_err_desc.err_inject_info.type.
 */
/** No error injection. */
#define INJECT_NONE (0U)
/** HW based error injection. */
#define INJECT_HW (1U)
/** SW based error injection. */
#define INJECT_SW (2U)
/*
 * Forward declaration (repeats the one near the top of this header); this
 * header only uses pointers to the type.
 */
struct nvgpu_hw_err_inject_info;
/**
 * nvgpu_err_desc structure holds fields which describe an error along with
 * function callbacks which can be used to inject the error. The GPU_ERR(),
 * GPU_CRITERR() and GPU_NONCRITERR() macros in this header expand to
 * initializers for this structure.
 */
struct nvgpu_err_desc {
/** String representation of error. */
const char *name;
/** Flag to classify an error as critical or non-critical. */
bool is_critical;
/**
 * Error Threshold: once this threshold value is reached, then the
 * corresponding error counter will be reset to 0 and the error will be
 * propagated to Safety_Services.
 */
int err_threshold;
/**
 * Total number of times an error has occurred (since its last reset).
 */
int err_count;
/** Function to support HW based error injection. */
void (*inject_hw_fault)(struct gk20a *g,
struct nvgpu_hw_err_inject_info *err, u32 err_info);
/** Function to support SW based error injection. */
void (*inject_sw_fault)(struct gk20a *g,
u32 hw_unit, u32 err_index, u32 inst);
/** Error ID. */
u8 error_id;
/**
 * err_inject_info structure describes the type of error injection and
 * the required register address and a write value.
 */
struct err_inject_info {
/** Type of error injection: HW / SW / None (INJECT_HW, INJECT_SW, INJECT_NONE). */
u32 type;
/** Function to get register address for error injection. */
u32 (*get_reg_addr)(void);
/** Function to get register value for error injection. */
u32 (*get_reg_val)(u32 val);
} err_inject_info;
};
/**
 * nvgpu_err_hw_module structure holds fields which describe the error
 * reporting capabilities of a h/w module.
 */
struct nvgpu_err_hw_module {
/** String representation of a given HW unit. */
const char *name;
/** HW unit ID. */
u32 hw_unit;
/** Total number of instances of a given HW unit. */
u32 num_instances;
/**
 * Total number of errors reported from a given HW unit.
 * NOTE(review): presumably also the number of entries in errs - confirm
 * against the code that builds the look-up table.
 */
u32 num_errs;
/** Used to get error description from look-up table. */
struct nvgpu_err_desc *errs;
};
#endif