gpu: nvgpu: Move SDL err info structs to common

The SDL's error reporting code will be leveraged by the central interrupt controller (CIC), or common.cic unit. This is a base patch to move the SDL error reporting code from QNX to common. Move the data structures used during error reporting to the common header - nvgpu_err_info.h

JIRA NVGPU-6522

Change-Id: Ie6b209323a14b9bb38e3402c2427fbcdaae52206
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2504726
Reviewed-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2021-03-24 13:38:48 +00:00
parent d4c33de919
commit d67cea61f1
2 changed files with 336 additions and 1 deletions
--- a/arch/nvgpu-common.yaml
+++ b/arch/nvgpu-common.yaml
@@ -175,7 +175,8 @@ nvlink:
 nvgpu_err:
  safe: yes
  owner: Unknown
-  sources: [ include/nvgpu/nvgpu_err.h ]
+  sources: [ include/nvgpu/nvgpu_err.h,
             include/nvgpu/nvgpu_err_info.h]
 pramin:
  safe: yes
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err_info.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err_info.h
@@ -0,0 +1,334 @@
 /*
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.  Any
 * use, reproduction, disclosure or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation
 * is strictly prohibited.
 */
 #ifndef NVGPU_ERR_INFO_H
 #define NVGPU_ERR_INFO_H
 /**
 * @file
 *
 * Declare the format of error message for various hw units in GPU and add
 * designated initializers for them.
 */
 #include <nvgpu/types.h>
 struct gk20a;
 struct nvgpu_hw_err_inject_info;
 /**
 * gpu_err_header is the common header placed first in every per-unit error
 * info structure declared in this file. It holds the fields required to
 * identify the version of the header, the sub-error type, the sub-unit id,
 * the error address and the time stamp.
 */
 struct gpu_err_header {
 	/** Version of GPU error header. */
 	struct {
 		/** Major version number. */
 		u16 major;
 		/** Minor version number. */
 		u16 minor;
 	} version;
 	/** Sub error type corresponding to the error that is being reported. */
 	u32 sub_err_type;
 	/** ID of the sub-unit in a HW unit which encountered an error. */
 	u64 sub_unit_id;
 	/** Location of the error. */
 	u64 address;
 	/** Timestamp in nanoseconds. */
 	u64 timestamp_ns;
 };
 /**
 * Error information for the host_info member of union gpu_error_info.
 * It carries only the common error header.
 */
 struct gpu_host_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 };
 /**
 * ECC error information: the common error header followed by an error
 * count.
 */
 struct gpu_ecc_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 	/** Number of ECC errors. */
 	u64 err_cnt;
 };
 /**
 * GR error information: the common error header followed by the context,
 * channel and TSG identifiers and the exception status.
 */
 struct gpu_gr_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 	/** Context which triggered the exception. */
 	u32 curr_ctx;
 	/** Channel bound to the context. */
 	u32 chid;
 	/** TSG to which the channel is bound. */
 	u32 tsgid;
 	/** Exception status. */
 	u32 status;
 };
 /**
 * SM error information: the common error header followed by the warp
 * exception state, the context/channel/TSG identifiers and the TPC, GPC and
 * SM IDs.
 */
 struct gpu_sm_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 	/** Program counter at the time the exception was triggered. */
 	u64 warp_esr_pc;
 	/** SM error status. */
 	u32 warp_esr_status;
 	/** Current context which triggered the exception. */
 	u32 curr_ctx;
 	/** Channel ID. */
 	u32 chid;
 	/** TSG ID. */
 	u32 tsgid;
 	/** TPC ID. */
 	u32 tpc;
 	/** GPC ID. */
 	u32 gpc;
 	/** SM ID. */
 	u32 sm;
 };
 /**
 * Debug information reported by the GMMU during MMU page fault exceptions.
 * The members are not described individually here; the details of each
 * member can be found on the mmu_fault_info structure defined in
 *
 *  + drivers/gpu/nvgpu/include/nvgpu/mmu_fault.h
 */
 struct mmu_page_fault_info {
 	u64	inst_ptr;
 	u32	inst_aperture;
 	u64	fault_addr;
 	u32	fault_addr_aperture;
 	u32	timestamp_lo;
 	u32	timestamp_hi;
 	u32	mmu_engine_id;
 	u32	gpc_id;
 	u32	client_type;
 	u32	client_id;
 	u32	fault_type;
 	u32	access_type;
 	u32	protected_mode;
 	bool	replayable_fault;
 	u32	replay_fault_en;
 	bool	valid;
 	u32	faulted_pbdma;
 	u32	faulted_engine;
 	u32	faulted_subid;
 	u32	chid;
 };
 /**
 * MMU error information: the common error header, the page fault debug
 * information and the page fault status.
 */
 struct gpu_mmu_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 	/** Debug information reported for the MMU page fault. */
 	struct mmu_page_fault_info info;
 	/** MMU page fault status. */
 	u32 status;
 };
 /**
 * Error information for the ce_info member of union gpu_error_info.
 * It carries only the common error header.
 */
 struct gpu_ce_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 };
 /**
 * Error information for the pri_info member of union gpu_error_info.
 * It carries only the common error header.
 */
 struct gpu_pri_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 };
 /**
 * PMU error information: the common error header followed by the PMU bar0
 * error status.
 */
 struct gpu_pmu_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 	/** PMU bar0 error status value. */
 	u32 status;
 };
 /**
 * CTXSW error information: the common error header followed by the
 * context/TSG/channel identifiers, the two context switch status register
 * values and the mailbox value.
 */
 struct gpu_ctxsw_error_info {
 	/** Common error header. */
 	struct gpu_err_header header;
 	/** Current context. */
 	u32 curr_ctx;
 	/** TSG ID. */
 	u32 tsgid;
 	/** Channel ID. */
 	u32 chid;
 	/** Context switch status register 0. */
 	u32 ctxsw_status0;
 	/** Context switch status register 1. */
 	u32 ctxsw_status1;
 	/** Mailbox value. */
 	u32 mailbox_value;
 };
 /**
 * gpu_error_info union holds the error info related to each hardware unit
 * whose errors will be reported. Every member structure begins with a
 * struct gpu_err_header.
 */
 union gpu_error_info {
 	struct gpu_host_error_info host_info;
 	struct gpu_ecc_error_info ecc_info;
 	struct gpu_gr_error_info gr_info;
 	struct gpu_sm_error_info sm_info;
 	struct gpu_ce_error_info ce_info;
 	struct gpu_pri_error_info pri_info;
 	struct gpu_pmu_error_info pmu_err_info;
 	struct gpu_ctxsw_error_info ctxsw_info;
 	struct gpu_mmu_error_info mmu_info;
 };
 /**
 * nvgpu_err_msg structure holds the fields required to identify the source,
 * the type and the criticality of a reported error, together with the
 * unit-specific error information.
 */
 struct nvgpu_err_msg {
 	/**
 	 * Identify the hw module which generated the error. List of supported
 	 * hw modules and errors can be found in
 	 *
 	 *  + drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
 	 */
 	u32 hw_unit_id;
 	/** Flag to indicate the criticality of the error. */
 	bool is_critical;
 	/** Error ID. */
 	u8 err_id;
 	/** Size of the error message. */
 	u8 err_size;
 	/** GPU error information; see union gpu_error_info. */
 	union gpu_error_info err_info;
 	/** Used to get error information from look-up table. */
 	struct nvgpu_err_desc *err_desc;
 };
 /**
 * This macro expands to a brace-enclosed designated initializer for the
 * members of one nvgpu_err_desc struct.
 *
 * @param err            Assigned to .name.
 * @param critical       Assigned to .is_critical.
 * @param id             Assigned to .error_id.
 * @param inject_support Assigned to .err_inject_info.type.
 * @param hw_inject_fn   Assigned to .inject_hw_fault.
 * @param sw_inject_fn   Assigned to .inject_sw_fault.
 * @param addr           Assigned to .err_inject_info.get_reg_addr.
 * @param val            Assigned to .err_inject_info.get_reg_val.
 * @param threshold      Assigned to .err_threshold.
 * @param ecount         Assigned to .err_count.
 */
 #define GPU_ERR(err, critical, id, inject_support, hw_inject_fn, sw_inject_fn,\
 		addr, val, threshold, ecount)				\
 {									\
 		.name = (err),						\
 		.is_critical = (critical),				\
 		.error_id = (id),					\
 		.err_inject_info.type = (inject_support),		\
 		.err_threshold = (threshold),				\
 		.err_count = (ecount),					\
 		.inject_hw_fault = (hw_inject_fn),			\
 		.inject_sw_fault = (sw_inject_fn),			\
 		.err_inject_info.get_reg_addr = (addr),			\
 		.err_inject_info.get_reg_val = (val)			\
 }
 /**
 * This macro is used to initialize critical errors. It forwards all of its
 * arguments to GPU_ERR with the critical argument fixed to true. The
 * expansion is preceded by an NVGPU_COV_WHITELIST annotation marking
 * MISRA Rule 10_3 as a false positive (Bug 2623654).
 */
 #define GPU_CRITERR(err, id, inject_support, hw_inject_fn, sw_inject_fn, addr,\
 		val, threshold, ecount) \
 NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 10_3), "Bug 2623654") \
 	GPU_ERR(err, true, id, inject_support, hw_inject_fn, sw_inject_fn, \
 		addr, val, threshold, ecount)
 /**
 * This macro is used to initialize non-critical errors. It forwards all of
 * its arguments to GPU_ERR with the critical argument fixed to false. The
 * expansion is preceded by an NVGPU_COV_WHITELIST annotation marking
 * MISRA Rule 10_3 as a false positive (Bug 2623654).
 */
 #define GPU_NONCRITERR(err, id, inject_support, hw_inject_fn, sw_inject_fn,\
 		addr, val, threshold, ecount) \
 NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 10_3), "Bug 2623654") \
 	GPU_ERR(err, false, id, inject_support, hw_inject_fn, sw_inject_fn,\
 		addr, val, threshold, ecount)
 /**
 * This defines the various types of error injection supported in SDL.
 * These are the values stored in the err_inject_info.type member of
 * struct nvgpu_err_desc.
 */
 /** No error injection. */
 #define INJECT_NONE	(0U)
 /** HW based error injection. */
 #define INJECT_HW	(1U)
 /** SW based error injection. */
 #define INJECT_SW	(2U)
 /**
 * nvgpu_err_desc structure holds the fields which describe an error, along
 * with the function callbacks which can be used to inject the error. The
 * GPU_ERR, GPU_CRITERR and GPU_NONCRITERR macros produce initializers for
 * this structure.
 */
 struct nvgpu_err_desc {
 	/** String representation of error. */
 	const char *name;
 	/** Flag to classify an error as critical or non-critical. */
 	bool is_critical;
 	/**
 	 * Error Threshold: once this threshold value is reached, then the
 	 * corresponding error counter will be reset to 0 and the error will be
 	 * propagated to Safety_Services.
 	 */
 	int err_threshold;
 	/**
 	 * Total number of times an error has occurred (since its last reset).
 	 */
 	int err_count;
 	/** Function to support HW based error injection. */
 	void (*inject_hw_fault)(struct gk20a *g,
 			struct nvgpu_hw_err_inject_info *err, u32 err_info);
 	/** Function to support SW based error injection. */
 	void (*inject_sw_fault)(struct gk20a *g,
 			u32 hw_unit, u32 err_index, u32 inst);
 	/** Error ID. */
 	u8 error_id;
 	/**
 	 * err_inject_info structure describes the type of error injection and
 	 * the required register address and a write value.
 	 */
 	struct err_inject_info {
 		/**
 		 * Type of error injection: HW / SW / None
 		 * (INJECT_HW, INJECT_SW, INJECT_NONE).
 		 */
 		u32 type;
 		/** Function to get register address for error injection. */
 		u32 (*get_reg_addr)(void);
 		/** Function to get register value for error injection. */
 		u32 (*get_reg_val)(u32 val);
 	} err_inject_info;
 };
 /**
 * nvgpu_err_hw_module structure holds the fields which describe the error
 * reporting capabilities of a h/w module.
 */
 struct nvgpu_err_hw_module {
 	/** String representation of a given HW unit. */
 	const char *name;
 	/** HW unit ID. */
 	u32 hw_unit;
 	/** Total number of instances of a given HW unit. */
 	u32 num_instances;
 	/** Total number of errors reported from a given HW unit. */
 	u32 num_errs;
 	/**
 	 * Used to get error description from look-up table.
 	 * NOTE(review): presumably an array of num_errs entries - confirm
 	 * against the tables that populate it.
 	 */
 	struct nvgpu_err_desc *errs;
 };
 #endif